Browse Source

[huffpost] Add support

Philipp Hagemeister 11 years ago
parent
commit
db1f388878

+ 4 - 1
youtube_dl/downloader/__init__.py

@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 from .common import FileDownloader
 from .common import FileDownloader
 from .hls import HlsFD
 from .hls import HlsFD
 from .http import HttpFD
 from .http import HttpFD
@@ -12,10 +14,11 @@ from ..utils import (
 def get_suitable_downloader(info_dict):
 def get_suitable_downloader(info_dict):
     """Get the downloader class that can handle the info dict."""
     """Get the downloader class that can handle the info dict."""
     url = info_dict['url']
     url = info_dict['url']
+    protocol = info_dict.get('protocol')
 
 
     if url.startswith('rtmp'):
     if url.startswith('rtmp'):
         return RtmpFD
         return RtmpFD
-    if determine_ext(url) == u'm3u8':
+    if (protocol == 'm3u8') or (protocol is None and determine_ext(url) == 'm3u8'):
         return HlsFD
         return HlsFD
     if url.startswith('mms') or url.startswith('rtsp'):
     if url.startswith('mms') or url.startswith('rtsp'):
         return MplayerFD
         return MplayerFD

+ 1 - 0
youtube_dl/extractor/__init__.py

@@ -83,6 +83,7 @@ from .googlesearch import GoogleSearchIE
 from .hark import HarkIE
 from .hark import HarkIE
 from .hotnewhiphop import HotNewHipHopIE
 from .hotnewhiphop import HotNewHipHopIE
 from .howcast import HowcastIE
 from .howcast import HowcastIE
+from .huffpost import HuffPostIE
 from .hypem import HypemIE
 from .hypem import HypemIE
 from .ign import IGNIE, OneUPIE
 from .ign import IGNIE, OneUPIE
 from .imdb import (
 from .imdb import (

+ 1 - 1
youtube_dl/extractor/common.py

@@ -71,7 +71,7 @@ class InfoExtractor(object):
                     * player_url SWF Player URL (used for rtmpdump).
                     * player_url SWF Player URL (used for rtmpdump).
                     * protocol   The protocol that will be used for the actual
                     * protocol   The protocol that will be used for the actual
                                  download, lower-case.
                                  download, lower-case.
-                                 "http", "https", "rtsp", "rtmp" or so.
+                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                     * preference Order number of this format. If this field is
                     * preference Order number of this format. If this field is
                                  present and not None, the formats get sorted
                                  present and not None, the formats get sorted
                                  by this field.
                                  by this field.

+ 7 - 1
youtube_dl/extractor/generic.py

@@ -332,10 +332,16 @@ class GenericIE(InfoExtractor):
 
 
         # Look for embedded Facebook player
         # Look for embedded Facebook player
         mobj = re.search(
         mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https://www.facebook.com/video/embed.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
         if mobj is not None:
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'Facebook')
             return self.url_result(mobj.group('url'), 'Facebook')
 
 
+        # Look for embedded Huffington Post player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'HuffPost')
+
         # Start with something easy: JW Player in SWFObject
         # Start with something easy: JW Player in SWFObject
         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
         if mobj is None:
         if mobj is None:

+ 70 - 0
youtube_dl/extractor/huffpost.py

@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    unified_strdate,
+)
+
+
+class HuffPostIE(InfoExtractor):
+    """Information extractor for Huffington Post Live segments.
+
+    Matches segment pages (``/r/segment/<slug>/<id>``) and embed player
+    URLs (``/HPLEmbedPlayer/?segmentId=<id>``) on live.huffingtonpost.com
+    and embed.live.huffingtonpost.com. The hexadecimal segment id taken
+    from the URL is used to query the JSON segments API.
+    """
+
+    IE_DESC = 'Huffington Post'
+    # The optional "embed." host prefix is non-capturing; only the named
+    # group "id" is read from the match.
+    _VALID_URL = r'''(?x)
+        https?://(?:embed\.)?live\.huffingtonpost\.com/
+        (?:
+            r/segment/[^/]+/|
+            HPLEmbedPlayer/\?segmentId=
+        )
+        (?P<id>[0-9a-f]+)'''
+
+    # NOTE(review): 'md5', 'title' and 'description' are still 'TODO'
+    # placeholders, and _real_extract returns no 'description' key at all.
+    # Fill in real values (or drop the fields) before relying on this test.
+    _TEST = {
+        'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
+        'file': '52dd3e4b02a7602131000677.mp4',
+        'md5': 'TODO',
+        'info_dict': {
+            'title': 'TODO',
+            'description': 'TODO',
+            'duration': 1549,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Build the info dict for the segment referenced by *url*.
+
+        Downloads ``/api/segments/<id>.json`` and reads the title, running
+        time, start date, images and live sources from its ``data`` object.
+
+        Returns a dict with the keys ``id``, ``title``, ``formats``,
+        ``duration``, ``upload_date`` and ``thumbnails``.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id
+        data = self._download_json(api_url, video_id)['data']
+
+        video_title = data['title']
+        duration = parse_duration(data['running_time'])
+        upload_date = unified_strdate(data['schedule']['started_at'])
+
+        # Only images whose URL contains a "-<width>x<height>." part are
+        # kept; that part is reported as the thumbnail resolution.
+        # Distinct loop names keep the `url` parameter intact.
+        thumbnails = []
+        for thumb_url in data['images'].values():
+            m = re.match(r'.*-([0-9]+x[0-9]+)\.', thumb_url)
+            if not m:
+                continue
+            thumbnails.append({
+                'url': thumb_url,
+                'resolution': m.group(1),
+            })
+
+        # One format per entry of data['sources']['live']; keys starting
+        # with "audio/" are marked as having no video codec.
+        formats = [{
+            'format': key,
+            'format_id': key.replace('/', '.'),
+            'ext': 'mp4',
+            'url': format_url,
+            'vcodec': 'none' if key.startswith('audio/') else None,
+        } for key, format_url in data['sources']['live'].items()]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'formats': formats,
+            'duration': duration,
+            'upload_date': upload_date,
+            'thumbnails': thumbnails,
+        }