Browse Source

[yahoo] Fix video extraction (fixes #1521)

There's no need to use two different extraction methods.
Now we can also download videos over HTTP if possible.
Also run the test for RTMP videos, but skip the download.
Jaime Marquínez Ferrándiz 12 years ago
parent
commit
9c15e9de84
1 changed file with 65 additions and 67 deletions
  1. 65 67
      youtube_dl/extractor/yahoo.py

+ 65 - 67
youtube_dl/extractor/yahoo.py

@@ -1,4 +1,3 @@
-import datetime
 import itertools
 import json
 import re
@@ -6,86 +5,85 @@ import re
 from .common import InfoExtractor, SearchInfoExtractor
 from ..utils import (
     compat_urllib_parse,
-
-    ExtractorError,
+    compat_urlparse,
+    determine_ext,
+    clean_html,
 )
 
+
 class YahooIE(InfoExtractor):
     IE_DESC = u'Yahoo screen'
     _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
-    _TEST = {
-        u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
-        u'file': u'214727115.flv',
-        u'md5': u'2e717f169c1be93d84d3794a00d4a325',
-        u'info_dict': {
-            u"title": u"Julian Smith & Travis Legg Watch Julian Smith"
+    _TESTS = [
+        {
+            u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
+            u'file': u'214727115.mp4',
+            u'info_dict': {
+                u'title': u'Julian Smith & Travis Legg Watch Julian Smith',
+                u'description': u'Julian and Travis watch Julian Smith',
+            },
         },
-        u'skip': u'Requires rtmpdump'
-    }
+        {
+            u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
+            u'file': u'103000935.flv',
+            u'info_dict': {
+                u'title': u'The Cougar Lies with Spanish Moss',
+                u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
+            },
+            u'params': {
+                # Requires rtmpdump
+                u'skip_download': True,
+            },
+        },
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
-        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
 
-        if m_id is None: 
-            # TODO: Check which url parameters are required
-            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
-            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
-            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
-                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
-                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
-                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
-                        '''
-            self.report_extraction(video_id)
-            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
-            if m_info is None:
-                raise ExtractorError(u'Unable to extract video info')
-            video_title = m_info.group('title')
-            video_description = m_info.group('description')
-            video_thumb = m_info.group('thumb')
-            video_date = m_info.group('date')
-            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
-    
-            # TODO: Find a way to get mp4 videos
-            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
-            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
-            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
-            video_url = m_rest.group('url')
-            video_path = m_rest.group('path')
-            if m_rest is None:
-                raise ExtractorError(u'Unable to extract video url')
+        items_json = self._search_regex(r'YVIDEO_INIT_ITEMS = ({.*?});$',
+            webpage, u'items', flags=re.MULTILINE)
+        items = json.loads(items_json)
+        info = items['mediaItems']['query']['results']['mediaObj'][0]
+        meta = info['meta']
+
+        formats = []
+        for s in info['streams']:
+            format_info = {
+                'width': s.get('width'),
+                'height': s.get('height'),
+                'bitrate': s.get('bitrate'),
+            }
+
+            host = s['host']
+            path = s['path']
+            if host.startswith('rtmp'):
+                format_info.update({
+                    'url': host,
+                    'play_path': path,
+                    'ext': 'flv',
+                })
+            else:
+                format_url = compat_urlparse.urljoin(host, path)
+                format_info['url'] = format_url
+                format_info['ext'] = determine_ext(format_url)
+                
+            formats.append(format_info)
+        formats = sorted(formats, key=lambda f:(f['height'], f['width']))
+
+        info = {
+            'id': video_id,
+            'title': meta['title'],
+            'formats': formats,
+            'description': clean_html(meta['description']),
+            'thumbnail': meta['thumbnail'],
+        }
+        # TODO: Remove when #980 has been merged
+        info.update(formats[-1])
 
-        else: # We have to use a different method if another id is defined
-            long_id = m_id.group('new_id')
-            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
-            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
-            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
-            info = json.loads(json_str)
-            res = info[u'query'][u'results'][u'mediaObj'][0]
-            stream = res[u'streams'][0]
-            video_path = stream[u'path']
-            video_url = stream[u'host']
-            meta = res[u'meta']
-            video_title = meta[u'title']
-            video_description = meta[u'description']
-            video_thumb = meta[u'thumbnail']
-            video_date = None # I can't find it
+        return info
 
-        info_dict = {
-                     'id': video_id,
-                     'url': video_url,
-                     'play_path': video_path,
-                     'title':video_title,
-                     'description': video_description,
-                     'thumbnail': video_thumb,
-                     'upload_date': video_date,
-                     'ext': 'flv',
-                     }
-        return info_dict
 
 class YahooSearchIE(SearchInfoExtractor):
     IE_DESC = u'Yahoo screen search'