فهرست منبع

[appletrailers] Rework extraction (fixes #1387)

The extraction was broken:
* The includes page contains img elements that need to be fixed.
* Use the 'itunes.inc' page, it contains a json dictionary for each trailer with information.
* Get the formats from 'includes/settings/{trailer_name}.json'
* Use urljoin to allow urls with a fragment identifier to work

Removed the thumbnail urls from the tests, they are different now.
Jaime Marquínez Ferrándiz 12 سال پیش
والد
کامیت
843530568f
1 فایلهای تغییر یافته به همراه 42 افزوده شده و 70 حذف شده
  1. 42 70
      youtube_dl/extractor/appletrailers.py

+ 42 - 70
youtube_dl/extractor/appletrailers.py

@@ -1,8 +1,10 @@
 import re
 import xml.etree.ElementTree
+import json
 
 from .common import InfoExtractor
 from ..utils import (
+    compat_urlparse,
     determine_ext,
 )
 
@@ -14,10 +16,9 @@ class AppleTrailersIE(InfoExtractor):
         u"playlist": [
             {
                 u"file": u"manofsteel-trailer4.mov",
-                u"md5": u"11874af099d480cc09e103b189805d5f",
+                u"md5": u"d97a8e575432dbcb81b7c3acb741f8a8",
                 u"info_dict": {
                     u"duration": 111,
-                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg",
                     u"title": u"Trailer 4",
                     u"upload_date": u"20130523",
                     u"uploader_id": u"wb",
@@ -25,10 +26,9 @@ class AppleTrailersIE(InfoExtractor):
             },
             {
                 u"file": u"manofsteel-trailer3.mov",
-                u"md5": u"07a0a262aae5afe68120eed61137ab34",
+                u"md5": u"b8017b7131b721fb4e8d6f49e1df908c",
                 u"info_dict": {
                     u"duration": 182,
-                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg",
                     u"title": u"Trailer 3",
                     u"upload_date": u"20130417",
                     u"uploader_id": u"wb",
@@ -36,10 +36,9 @@ class AppleTrailersIE(InfoExtractor):
             },
             {
                 u"file": u"manofsteel-trailer.mov",
-                u"md5": u"e401fde0813008e3307e54b6f384cff1",
+                u"md5": u"d0f1e1150989b9924679b441f3404d48",
                 u"info_dict": {
                     u"duration": 148,
-                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg",
                     u"title": u"Trailer",
                     u"upload_date": u"20121212",
                     u"uploader_id": u"wb",
@@ -47,10 +46,9 @@ class AppleTrailersIE(InfoExtractor):
             },
             {
                 u"file": u"manofsteel-teaser.mov",
-                u"md5": u"76b392f2ae9e7c98b22913c10a639c97",
+                u"md5": u"5fe08795b943eb2e757fa95cb6def1cb",
                 u"info_dict": {
                     u"duration": 93,
-                    u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg",
                     u"title": u"Teaser",
                     u"upload_date": u"20120721",
                     u"uploader_id": u"wb",
@@ -59,87 +57,61 @@ class AppleTrailersIE(InfoExtractor):
         ]
     }
 
+    _JSON_RE = r'iTunes.playURL\((.*?)\);'
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         movie = mobj.group('movie')
         uploader_id = mobj.group('company')
 
-        playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc'
+        playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
         playlist_snippet = self._download_webpage(playlist_url, movie)
-        playlist_cleaned = re.sub(r'(?s)<script>.*?</script>', u'', playlist_snippet)
+        playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet)
+        playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned)
+        # The ' in the onClick attributes are not escaped, it couldn't be parsed
+        # with xml.etree.ElementTree.fromstring
+        # like: http://trailers.apple.com/trailers/wb/gravity/
+        def _clean_json(m):
+            return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
+        playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned)
         playlist_html = u'<html>' + playlist_cleaned + u'</html>'
 
-        size_cache = {}
-
         doc = xml.etree.ElementTree.fromstring(playlist_html)
         playlist = []
         for li in doc.findall('./div/ul/li'):
-            title = li.find('.//h3').text
+            on_click = li.find('.//a').attrib['onClick']
+            trailer_info_json = self._search_regex(self._JSON_RE,
+                on_click, u'trailer info')
+            trailer_info = json.loads(trailer_info_json)
+            title = trailer_info['title']
             video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
             thumbnail = li.find('.//img').attrib['src']
+            upload_date = trailer_info['posted'].replace('-', '')
 
-            date_el = li.find('.//p')
-            upload_date = None
-            m = re.search(r':\s?(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<year>[0-9]{2})', date_el.text)
-            if m:
-                upload_date = u'20' + m.group('year') + m.group('month') + m.group('day')
-            runtime_el = date_el.find('./br')
-            m = re.search(r':\s?(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime_el.tail)
+            runtime = trailer_info['runtime']
+            m = re.search(r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime)
             duration = None
             if m:
                 duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))
 
-            formats = []
-            for formats_el in li.findall('.//a'):
-                if formats_el.attrib['class'] != 'OverlayPanel':
-                    continue
-                target = formats_el.attrib['target']
-
-                format_code = formats_el.text
-                if 'Automatic' in format_code:
-                    continue
+            first_url = trailer_info['url']
+            trailer_id = first_url.split('/')[-1].rpartition('_')[0]
+            settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
+            settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json')
+            settings = json.loads(settings_json)
 
-                size_q = formats_el.attrib['href']
-                size_id = size_q.rpartition('#videos-')[2]
-                if size_id not in size_cache:
-                    size_url = url + size_q
-                    sizepage_html = self._download_webpage(
-                        size_url, movie,
-                        note=u'Downloading size info %s' % size_id,
-                        errnote=u'Error while downloading size info %s' % size_id,
-                    )
-                    _doc = xml.etree.ElementTree.fromstring(sizepage_html)
-                    size_cache[size_id] = _doc
-
-                sizepage_doc = size_cache[size_id]
-                links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a')
-                for vid_a in links:
-                    href = vid_a.get('href')
-                    if not href.endswith(target):
-                        continue
-                    detail_q = href.partition('#')[0]
-                    detail_url = url + '/' + detail_q
-
-                    m = re.match(r'includes/(?P<detail_id>[^/]+)/', detail_q)
-                    detail_id = m.group('detail_id')
-
-                    detail_html = self._download_webpage(
-                        detail_url, movie,
-                        note=u'Downloading detail %s %s' % (detail_id, size_id),
-                        errnote=u'Error while downloading detail %s %s' % (detail_id, size_id)
-                    )
-                    detail_doc = xml.etree.ElementTree.fromstring(detail_html)
-                    movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a')
-                    assert movie_link_el.get('class') == 'movieLink'
-                    movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h')
-                    ext = determine_ext(movie_link)
-                    assert ext == 'mov'
-
-                    formats.append({
-                        'format': format_code,
-                        'ext': ext,
-                        'url': movie_link,
-                    })
+            formats = []
+            for format in settings['metadata']['sizes']:
+                # The src is a file pointing to the real video file
+                format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src'])
+                formats.append({
+                    'url': format_url,
+                    'ext': determine_ext(format_url),
+                    'format': format['type'],
+                    'width': format['width'],
+                    'height': int(format['height']),
+                })
+            formats = sorted(formats, key=lambda f: (f['height'], f['width']))
 
             info = {
                 '_type': 'video',