Browse Source

[mlb] Extract more metadata and all formats, provide more tests

Sergey M․ 11 years ago
parent
commit
7bb49d1057
2 changed files with 81 additions and 53 deletions
  1. youtube_dl/extractor/__init__.py (+1 -1)
  2. youtube_dl/extractor/mlb.py (+80 -52)

+ 1 - 1
youtube_dl/extractor/__init__.py

@@ -170,7 +170,7 @@ from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
 from .metacritic import MetacriticIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mixcloud import MixcloudIE
 from .mixcloud import MixcloudIE
-from .mlb import MlbIE
+from .mlb import MLBIE
 from .mpora import MporaIE
 from .mpora import MporaIE
 from .mofosex import MofosexIE
 from .mofosex import MofosexIE
 from .mooshare import MooshareIE
 from .mooshare import MooshareIE

+ 80 - 52
youtube_dl/extractor/mlb.py

@@ -3,72 +3,100 @@ from __future__ import unicode_literals
 import re
 import re
 
 
 from .common import InfoExtractor
 from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    parse_iso8601,
+    find_xpath_attr,
+)
 
 
 
 
-class MlbIE(InfoExtractor):
-    _VALID_URL = r'http?://m\.mlb\.com/video/topic/[0-9]+/v(?P<id>n?\d+)/.*$'
-    _TEST = {
-        'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby',
-        'md5': u'd9c022c10d21f849f49c05ae12a8a7e9',
-        'info_dict': {
-            'id': '34496663',
-            'ext': 'mp4',
-            'format': 'mp4',
-            'description': "7/11/14: Giancarlo Stanton practices for the Home Run Derby prior to the game against the Mets",
-            'title': "Stanton prepares for Derby",
+class MLBIE(InfoExtractor):
+    _VALID_URL = r'http?://m\.mlb\.com/video/(?:topic/[\da-z_-]+/)?v(?P<id>n?\d+)'
+    _TESTS = [
+        {
+            'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby',
+            'md5': 'd9c022c10d21f849f49c05ae12a8a7e9',
+            'info_dict': {
+                'id': '34496663',
+                'ext': 'mp4',
+                'title': 'Stanton prepares for Derby',
+                'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57',
+                'duration': 46,
+                'timestamp': 1405105800,
+                'upload_date': '20140711',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
         },
         },
-    }
+        {
+            'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby',
+            'md5': '0e6e73d509321e142409b695eadd541f',
+            'info_dict': {
+                'id': '34578115',
+                'ext': 'mp4',
+                'title': 'Cespedes repeats as Derby champ',
+                'description': 'md5:08df253ce265d4cf6fb09f581fafad07',
+                'duration': 488,
+                'timestamp': 1405399936,
+                'upload_date': '20140715',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance',
+            'md5': 'b8fd237347b844365d74ea61d4245967',
+            'info_dict': {
+                'id': '34577915',
+                'ext': 'mp4',
+                'title': 'Bautista on Home Run Derby',
+                'description': 'md5:b80b34031143d0986dddc64a8839f0fb',
+                'duration': 52,
+                'timestamp': 1405390722,
+                'upload_date': '20140715',
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+        },
+    ]
 
 
     def _real_extract(self, url):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
         video_id = mobj.group('id')
 
 
-        webpage = self._download_webpage(url, video_id)
+        detail = self._download_xml(
+            'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml'
+            % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id)
+
+        title = detail.find('./headline').text
+        description = detail.find('./big-blurb').text
+        duration = parse_duration(detail.find('./duration').text)
+        timestamp = parse_iso8601(detail.attrib['date'][:-5])
+
+        thumbnail = find_xpath_attr(
+            detail, './thumbnailScenarios/thumbnailScenario', 'type', '45').text
 
 
-        title = self._og_search_title(webpage, default=video_id)
-        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)"/>', webpage, 'description', fatal=False)
-        thumbnail = self._html_search_regex(r'<meta itemprop="image" (?:content|value)="(.*?)" />', webpage, 'image', fatal=False)
+        formats = []
+        for media_url in detail.findall('./url'):
+            playback_scenario = media_url.attrib['playback_scenario']
+            fmt = {
+                'url': media_url.text,
+                'format_id': playback_scenario,
+            }
+            m = re.search(r'(?P<vbr>\d+)K_(?P<width>\d+)X(?P<height>\d+)', playback_scenario)
+            if m:
+                fmt.update({
+                    'vbr': int(m.group('vbr')) * 1000,
+                    'width': int(m.group('width')),
+                    'height': int(m.group('height')),
+                })
+            formats.append(fmt)
 
 
-        # use the video_id to find the Media detail XML
-        id_len = len(video_id)
-        _mediadetail_url = 'http://m.mlb.com/gen/multimedia/detail/'+video_id[id_len-3]+'/'+video_id[id_len-2]+'/'+video_id[id_len-1]+'/'+video_id+'.xml'
-        
-        mediadetails = self._download_xml(_mediadetail_url, video_id, "Downloading media detail...")
-        has1500K = 0
-        has1200K = 0
-        has600K = 0
-        # loop through the list of url's and only get the highest quality MP4 content
-        for element in mediadetails.findall('url'):
-            scenario = element.attrib['playback_scenario']
-            if scenario.startswith(u'FLASH'):
-                if scenario.startswith(u'FLASH_1800K'):
-                    video_url = element.text
-                    # 1800K is the current highest quality video on MLB.com
-                    break
-                else:
-                    if scenario.startswith(u'FLASH_1500K'):
-                        video_url = element.text
-                        has1500K = 1
-                    else:
-                        if (scenario.startswith(u'FLASH_1200K') and not has1500K):
-                            video_url = element.text
-                            has1200K = 1
-                        else:
-                            if (scenario.startswith(u'FLASH_600K') and not has1200K):
-                                video_url = element.text
-                                has600K = 1
-                            else:
-                                if (scenario.startswith(u'FLASH_300K') and not has600K):
-                                    video_url = element.text
+        self._sort_formats(formats)
 
 
         return {
         return {
             'id': video_id,
             'id': video_id,
-            'url': video_url,
-            'extractor': 'mlb',
-            'webpage_url': url,
             'title': title,
             'title': title,
-            'ext': 'mp4',
-            'format': 'mp4',
             'description': description,
             'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
             'thumbnail': thumbnail,
             'thumbnail': thumbnail,
         }
         }