2
0
Эх сурвалжийг харах

Merge remote-tracking branch 'origin/master'

Philipp Hagemeister 11 жил өмнө
parent
commit
35f76e0061

+ 4 - 0
youtube_dl/extractor/__init__.py

@@ -130,6 +130,7 @@ from .helsinki import HelsinkiIE
 from .hentaistigma import HentaiStigmaIE
 from .hentaistigma import HentaiStigmaIE
 from .hotnewhiphop import HotNewHipHopIE
 from .hotnewhiphop import HotNewHipHopIE
 from .howcast import HowcastIE
 from .howcast import HowcastIE
+from .howstuffworks import HowStuffWorksIE
 from .huffpost import HuffPostIE
 from .huffpost import HuffPostIE
 from .hypem import HypemIE
 from .hypem import HypemIE
 from .iconosquare import IconosquareIE
 from .iconosquare import IconosquareIE
@@ -150,6 +151,7 @@ from .ivi import (
 from .izlesene import IzleseneIE
 from .izlesene import IzleseneIE
 from .jadorecettepub import JadoreCettePubIE
 from .jadorecettepub import JadoreCettePubIE
 from .jeuxvideo import JeuxVideoIE
 from .jeuxvideo import JeuxVideoIE
+from .jove import JoveIE
 from .jukebox import JukeboxIE
 from .jukebox import JukeboxIE
 from .justintv import JustinTVIE
 from .justintv import JustinTVIE
 from .jpopsukitv import JpopsukiIE
 from .jpopsukitv import JpopsukiIE
@@ -181,6 +183,7 @@ from .mdr import MDRIE
 from .metacafe import MetacafeIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
 from .metacritic import MetacriticIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
+from .mitele import MiTeleIE
 from .mixcloud import MixcloudIE
 from .mixcloud import MixcloudIE
 from .mlb import MLBIE
 from .mlb import MLBIE
 from .mpora import MporaIE
 from .mpora import MporaIE
@@ -255,6 +258,7 @@ from .ro220 import Ro220IE
 from .rottentomatoes import RottenTomatoesIE
 from .rottentomatoes import RottenTomatoesIE
 from .roxwel import RoxwelIE
 from .roxwel import RoxwelIE
 from .rtbf import RTBFIE
 from .rtbf import RTBFIE
+from .rtlnl import RtlXlIE
 from .rtlnow import RTLnowIE
 from .rtlnow import RTLnowIE
 from .rts import RTSIE
 from .rts import RTSIE
 from .rtve import RTVEALaCartaIE
 from .rtve import RTVEALaCartaIE

+ 1 - 1
youtube_dl/extractor/dfb.py

@@ -30,7 +30,7 @@ class DFBIE(InfoExtractor):
             video_id)
             video_id)
         video_info = player_info.find('video')
         video_info = player_info.find('video')
 
 
-        f4m_info = self._download_xml(video_info.find('url').text, video_id)
+        f4m_info = self._download_xml(self._proto_relative_url(video_info.find('url').text.strip()), video_id)
         token_el = f4m_info.find('token')
         token_el = f4m_info.find('token')
         manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0'
         manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0'
 
 

+ 7 - 0
youtube_dl/extractor/generic.py

@@ -706,6 +706,13 @@ class GenericIE(InfoExtractor):
             url = unescapeHTML(mobj.group('url'))
             url = unescapeHTML(mobj.group('url'))
             return self.url_result(url, ie='MTVServicesEmbedded')
             return self.url_result(url, ie='MTVServicesEmbedded')
 
 
+        # Look for embedded yahoo player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Yahoo')
+
         # Start with something easy: JW Player in SWFObject
         # Start with something easy: JW Player in SWFObject
         found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
         found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
         if not found:
         if not found:

+ 134 - 0
youtube_dl/extractor/howstuffworks.py

@@ -0,0 +1,134 @@
+from __future__ import unicode_literals
+
+import re
+import json
+import random
+import string
+
+from .common import InfoExtractor
+from ..utils import find_xpath_attr
+
+
+class HowStuffWorksIE(InfoExtractor):
+    _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*\d+-(?P<id>.+?)-video\.htm'
+    _TESTS = [
+        {
+            'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm',
+            'info_dict': {
+                'id': '450221',
+                'display_id': 'cool-jobs-iditarod-musher',
+                'ext': 'flv',
+                'title': 'Cool Jobs - Iditarod Musher',
+                'description': 'md5:82bb58438a88027b8186a1fccb365f90',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+            'params': {
+                # md5 is not consistent
+                'skip_download': True
+            }
+        },
+        {
+            'url': 'http://adventure.howstuffworks.com/39516-deadliest-catch-jakes-farewell-pots-video.htm',
+            'info_dict': {
+                'id': '553470',
+                'display_id': 'deadliest-catch-jakes-farewell-pots',
+                'ext': 'mp4',
+                'title': 'Deadliest Catch: Jake\'s Farewell Pots',
+                'description': 'md5:9632c346d5e43ee238028c9cefd8dbbc',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+            'params': {
+                # md5 is not consistent
+                'skip_download': True
+            }
+        },
+        {
+            'url': 'http://entertainment.howstuffworks.com/arts/2706-sword-swallowing-1-by-dan-meyer-video.htm',
+            'info_dict': {
+                'id': '440011',
+                'display_id': 'sword-swallowing-1-by-dan-meyer',
+                'ext': 'flv',
+                'title': 'Sword Swallowing #1 by Dan Meyer',
+                'description': 'md5:b2409e88172913e2e7d3d1159b0ef735',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+            'params': {
+                # md5 is not consistent
+                'skip_download': True
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        webpage = self._download_webpage(url, display_id)
+
+        content_id = self._search_regex(r'var siteSectionId="(\d+)";', webpage, 'content id')
+
+        mp4 = self._search_regex(
+            r'''(?xs)var\s+clip\s*=\s*{\s*
+                .+?\s*
+                content_id\s*:\s*%s\s*,\s*
+                .+?\s*
+                mp4\s*:\s*\[(.*?),?\]\s*
+                };\s*
+                videoData\.push\(clip\);''' % content_id,
+            webpage, 'mp4', fatal=False, default=None)
+
+        smil = self._download_xml(
+            'http://services.media.howstuffworks.com/videos/%s/smil-service.smil' % content_id,
+            content_id, 'Downloading video SMIL')
+
+        http_base = find_xpath_attr(
+            smil,
+            './{0}head/{0}meta'.format('{http://www.w3.org/2001/SMIL20/Language}'),
+            'name',
+            'httpBase').get('content')
+
+        def random_string(str_len=0):
+            return ''.join([random.choice(string.ascii_uppercase) for _ in range(str_len)])
+
+        URL_SUFFIX = '?v=2.11.3&fp=LNX 11,2,202,356&r=%s&g=%s' % (random_string(5), random_string(12))
+
+        formats = []
+
+        if mp4:
+            for video in json.loads('[%s]' % mp4):
+                bitrate = video['bitrate']
+                fmt = {
+                    'url': video['src'].replace('http://pmd.video.howstuffworks.com', http_base) + URL_SUFFIX,
+                    'format_id': bitrate,
+                }
+                m = re.search(r'(?P<vbr>\d+)[Kk]', bitrate)
+                if m:
+                    fmt['vbr'] = int(m.group('vbr'))
+                formats.append(fmt)
+        else:
+            for video in smil.findall(
+                    './/{0}body/{0}switch/{0}video'.format('{http://www.w3.org/2001/SMIL20/Language}')):
+                vbr = int(video.attrib['system-bitrate']) / 1000
+                formats.append({
+                    'url': '%s/%s%s' % (http_base, video.attrib['src'], URL_SUFFIX),
+                    'format_id': '%dk' % vbr,
+                    'vbr': vbr,
+                })
+
+        self._sort_formats(formats)
+
+        title = self._og_search_title(webpage)
+        TITLE_SUFFIX = ' : HowStuffWorks'
+        if title.endswith(TITLE_SUFFIX):
+            title = title[:-len(TITLE_SUFFIX)]
+
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        return {
+            'id': content_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }

+ 80 - 0
youtube_dl/extractor/jove.py

@@ -0,0 +1,80 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    unified_strdate
+)
+
+
+class JoveIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)'
+    _CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}'
+    _TESTS = [
+        {
+            'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current',
+            'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b',
+            'info_dict': {
+                'id': '2744',
+                'ext': 'mp4',
+                'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation',
+                'description': 'md5:015dd4509649c0908bc27f049e0262c6',
+                'thumbnail': 're:^https?://.*\.png$',
+                'upload_date': '20110523',
+            }
+        },
+        {
+            'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation',
+            'md5': '914aeb356f416811d911996434811beb',
+            'info_dict': {
+                'id': '51796',
+                'ext': 'mp4',
+                'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment',
+                'description': 'md5:35ff029261900583970c4023b70f1dc9',
+                'thumbnail': 're:^https?://.*\.png$',
+                'upload_date': '20140802',
+            }
+        },
+
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        chapters_id = self._html_search_regex(
+            r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id')
+
+        chapters_xml = self._download_xml(
+            self._CHAPTERS_URL.format(video_id=chapters_id),
+            video_id, note='Downloading chapters XML',
+            errnote='Failed to download chapters XML')
+
+        video_url = chapters_xml.attrib.get('video')
+        if not video_url:
+            raise ExtractorError('Failed to get the video URL')
+
+        title = self._html_search_meta('citation_title', webpage, 'title')
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._html_search_regex(
+            r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>',
+            webpage, 'description', fatal=False)
+        publish_date = unified_strdate(self._html_search_meta(
+            'citation_publication_date', webpage, 'publish date', fatal=False))
+        comment_count = self._html_search_regex(
+            r'<meta name="num_comments" content="(\d+) Comments?"',
+            webpage, 'comment count', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'description': description,
+            'upload_date': publish_date,
+            'comment_count': comment_count,
+        }

+ 60 - 0
youtube_dl/extractor/mitele.py

@@ -0,0 +1,60 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    get_element_by_attribute,
+    parse_duration,
+    strip_jsonp,
+)
+
+
+class MiTeleIE(InfoExtractor):
+    IE_NAME = 'mitele.es'
+    _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<episode>[^/]+)/'
+
+    _TEST = {
+        'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
+        'md5': '6a75fe9d0d3275bead0cb683c616fddb',
+        'info_dict': {
+            'id': '0fce117d',
+            'ext': 'mp4',
+            'title': 'Programa 144 - Tor, la web invisible',
+            'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
+            'display_id': 'programa-144',
+            'duration': 2913,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        episode = mobj.group('episode')
+        webpage = self._download_webpage(url, episode)
+        embed_data_json = self._search_regex(
+            r'MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data',
+            flags=re.DOTALL
+        ).replace('\'', '"')
+        embed_data = json.loads(embed_data_json)
+
+        info_url = embed_data['flashvars']['host']
+        info_el = self._download_xml(info_url, episode).find('./video/info')
+
+        video_link = info_el.find('videoUrl/link').text
+        token_query = compat_urllib_parse.urlencode({'id': video_link})
+        token_info = self._download_json(
+            'http://token.mitele.es/?' + token_query, episode,
+            transform_source=strip_jsonp
+        )
+
+        return {
+            'id': embed_data['videoId'],
+            'display_id': episode,
+            'title': info_el.find('title').text,
+            'url': token_info['tokenizedUrl'],
+            'description': get_element_by_attribute('class', 'text', webpage),
+            'thumbnail': info_el.find('thumb').text,
+            'duration': parse_duration(info_el.find('duration').text),
+        }

+ 40 - 13
youtube_dl/extractor/pbs.py

@@ -20,17 +20,41 @@ class PBSIE(InfoExtractor):
         )
         )
     '''
     '''
 
 
-    _TEST = {
-        'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
-        'md5': 'ce1888486f0908d555a8093cac9a7362',
-        'info_dict': {
-            'id': '2365006249',
-            'ext': 'mp4',
-            'title': 'A More Perfect Union',
-            'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
-            'duration': 3190,
+    _TESTS = [
+        {
+            'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
+            'md5': 'ce1888486f0908d555a8093cac9a7362',
+            'info_dict': {
+                'id': '2365006249',
+                'ext': 'mp4',
+                'title': 'A More Perfect Union',
+                'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
+                'duration': 3190,
+            },
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
+            'md5': '143c98aa54a346738a3d78f54c925321',
+            'info_dict': {
+                'id': '2365297690',
+                'ext': 'mp4',
+                'title': 'Losing Iraq',
+                'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
+                'duration': 5050,
+            },
         },
         },
-    }
+        {
+            'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
+            'md5': 'b19856d7f5351b17a5ab1dc6a64be633',
+            'info_dict': {
+                'id': '2201174722',
+                'ext': 'mp4',
+                'title': 'Cyber Schools Gain Popularity, but Quality Questions Persist',
+                'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28',
+                'duration': 801,
+            },
+        },
+    ]
 
 
     def _extract_ids(self, url):
     def _extract_ids(self, url):
         mobj = re.match(self._VALID_URL, url)
         mobj = re.match(self._VALID_URL, url)
@@ -40,10 +64,13 @@ class PBSIE(InfoExtractor):
         if presumptive_id:
         if presumptive_id:
             webpage = self._download_webpage(url, display_id)
             webpage = self._download_webpage(url, display_id)
 
 
-            # frontline video embed
+            MEDIA_ID_REGEXES = [
+                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed
+                r'class="coveplayerid">([^<]+)<',                       # coveplayer
+            ]
+
             media_id = self._search_regex(
             media_id = self._search_regex(
-                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",
-                webpage, 'frontline video ID', fatal=False, default=None)
+                MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
             if media_id:
             if media_id:
                 return media_id, presumptive_id
                 return media_id, presumptive_id
 
 

+ 52 - 0
youtube_dl/extractor/rtlnl.py

@@ -0,0 +1,52 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class RtlXlIE(InfoExtractor):
+    IE_NAME = 'rtlxl.nl'
+    _VALID_URL = r'https?://www\.rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)'
+
+    _TEST = {
+        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677',
+        'info_dict': {
+            'id': '6e4203a6-0a5e-3596-8424-c599a59e0677',
+            'ext': 'flv',
+            'title': 'RTL Nieuws - Laat',
+            'description': 'Dagelijks het laatste nieuws uit binnen- en '
+                'buitenland. Voor nog meer nieuws kunt u ook gebruikmaken van '
+                'onze mobiele apps.',
+            'timestamp': 1408051800,
+            'upload_date': '20140814',
+        },
+        'params': {
+            # We download the first bytes of the first fragment, it can't be
+            # processed by the f4m downloader beacuse it isn't complete
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        uuid = mobj.group('uuid')
+
+        info = self._download_json(
+            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid,
+            uuid)
+        meta = info['meta']
+        material = info['material'][0]
+        episode_info = info['episodes'][0]
+
+        f4m_url = 'http://manifest.us.rtl.nl' + material['videopath']
+        progname = info['abstracts'][0]['name']
+        subtitle = material['title'] or info['episodes'][0]['name']
+
+        return {
+            'id': uuid,
+            'title': '%s - %s' % (progname, subtitle), 
+            'formats': self._extract_f4m_formats(f4m_url, uuid),
+            'timestamp': material['original_date'],
+            'description': episode_info['synopsis'],
+        }

+ 1 - 1
youtube_dl/extractor/teamcoco.py

@@ -37,7 +37,7 @@ class TeamcocoIE(InfoExtractor):
         video_id = mobj.group("video_id")
         video_id = mobj.group("video_id")
         if not video_id:
         if not video_id:
             video_id = self._html_search_regex(
             video_id = self._html_search_regex(
-                r'<article class="video" data-id="(\d+?)"',
+                r'data-node-id="(\d+?)"',
                 webpage, 'video id')
                 webpage, 'video id')
 
 
         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id

+ 13 - 2
youtube_dl/extractor/yahoo.py

@@ -15,7 +15,7 @@ from ..utils import (
 
 
 class YahooIE(InfoExtractor):
 class YahooIE(InfoExtractor):
     IE_DESC = 'Yahoo screen and movies'
     IE_DESC = 'Yahoo screen and movies'
-    _VALID_URL = r'https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html'
+    _VALID_URL = r'(?P<url>https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
     _TESTS = [
     _TESTS = [
         {
         {
             'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
             'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
@@ -46,12 +46,23 @@ class YahooIE(InfoExtractor):
                 'title': 'The World Loves Spider-Man',
                 'title': 'The World Loves Spider-Man',
                 'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''',
                 'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''',
             }
             }
-        }
+        },
+        {
+            'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
+            'md5': '60e8ac193d8fb71997caa8fce54c6460',
+            'info_dict': {
+                'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb',
+                'ext': 'mp4',
+                'title': "Yahoo Saves 'Community'",
+                'description': 'md5:4d4145af2fd3de00cbb6c1d664105053',
+            }
+        },
     ]
     ]
 
 
     def _real_extract(self, url):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
         video_id = mobj.group('id')
+        url = mobj.group('url')
         webpage = self._download_webpage(url, video_id)
         webpage = self._download_webpage(url, video_id)
 
 
         items_json = self._search_regex(
         items_json = self._search_regex(

+ 1 - 0
youtube_dl/utils.py

@@ -827,6 +827,7 @@ def unified_strdate(date_str):
         '%b %dnd %Y %I:%M%p',
         '%b %dnd %Y %I:%M%p',
         '%b %dth %Y %I:%M%p',
         '%b %dth %Y %I:%M%p',
         '%Y-%m-%d',
         '%Y-%m-%d',
+        '%Y/%m/%d',
         '%d.%m.%Y',
         '%d.%m.%Y',
         '%d/%m/%Y',
         '%d/%m/%Y',
         '%Y/%m/%d %H:%M:%S',
         '%Y/%m/%d %H:%M:%S',