Browse Source

[ign] fix extraction(closes #24771)

Remita Amine 4 years ago
parent
commit
7f8b8bc418
2 changed files with 200 additions and 175 deletions
  1. 2 2
      youtube_dl/extractor/extractors.py
  2. 198 173
      youtube_dl/extractor/ign.py

+ 2 - 2
youtube_dl/extractor/extractors.py

@@ -470,8 +470,8 @@ from .hungama import (
 from .hypem import HypemIE
 from .hypem import HypemIE
 from .ign import (
 from .ign import (
     IGNIE,
     IGNIE,
-    OneUPIE,
-    PCMagIE,
+    IGNVideoIE,
+    IGNArticleIE,
 )
 )
 from .iheart import (
 from .iheart import (
     IHeartRadioIE,
     IHeartRadioIE,

+ 198 - 173
youtube_dl/extractor/ign.py

@@ -3,230 +3,255 @@ from __future__ import unicode_literals
 import re
 import re
 
 
 from .common import InfoExtractor
 from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+)
 from ..utils import (
 from ..utils import (
+    HEADRequest,
+    determine_ext,
     int_or_none,
     int_or_none,
     parse_iso8601,
     parse_iso8601,
+    strip_or_none,
+    try_get,
 )
 )
 
 
 
 
-class IGNIE(InfoExtractor):
+class IGNBaseIE(InfoExtractor):
+    def _call_api(self, slug):
+        return self._download_json(
+            'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug)
+
+
+class IGNIE(IGNBaseIE):
     """
     """
     Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com.
     Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com.
     Some videos of it.ign.com are also supported
     Some videos of it.ign.com are also supported
     """
     """
 
 
-    _VALID_URL = r'https?://.+?\.ign\.com/(?:[^/]+/)?(?P<type>videos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P<name_or_id>.+)'
+    _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[^/?&#]+)'
     IE_NAME = 'ign.com'
     IE_NAME = 'ign.com'
+    _PAGE_TYPE = 'video'
 
 
-    _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s'
-    _EMBED_RE = r'<iframe[^>]+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']'
-
-    _TESTS = [
-        {
-            'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
-            'md5': 'febda82c4bafecd2d44b6e1a18a595f8',
-            'info_dict': {
-                'id': '8f862beef863986b2785559b9e1aa599',
-                'ext': 'mp4',
-                'title': 'The Last of Us Review',
-                'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c',
-                'timestamp': 1370440800,
-                'upload_date': '20130605',
-                'uploader_id': 'cberidon@ign.com',
-            }
-        },
-        {
-            'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
-            'info_dict': {
-                'id': '100-little-things-in-gta-5-that-will-blow-your-mind',
-            },
-            'playlist': [
-                {
-                    'info_dict': {
-                        'id': '5ebbd138523268b93c9141af17bec937',
-                        'ext': 'mp4',
-                        'title': 'GTA 5 Video Review',
-                        'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
-                        'timestamp': 1379339880,
-                        'upload_date': '20130916',
-                        'uploader_id': 'danieljkrupa@gmail.com',
-                    },
-                },
-                {
-                    'info_dict': {
-                        'id': '638672ee848ae4ff108df2a296418ee2',
-                        'ext': 'mp4',
-                        'title': '26 Twisted Moments from GTA 5 in Slow Motion',
-                        'description': 'The twisted beauty of GTA 5 in stunning slow motion.',
-                        'timestamp': 1386878820,
-                        'upload_date': '20131212',
-                        'uploader_id': 'togilvie@ign.com',
-                    },
-                },
-            ],
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch',
-            'md5': '618fedb9c901fd086f6f093564ef8558',
-            'info_dict': {
-                'id': '078fdd005f6d3c02f63d795faa1b984f',
-                'ext': 'mp4',
-                'title': 'Rewind Theater - Wild Trailer Gamescom 2014',
-                'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.',
-                'timestamp': 1408047180,
-                'upload_date': '20140814',
-                'uploader_id': 'jamesduggan1990@gmail.com',
-            },
-        },
-        {
-            'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s',
-            'only_matching': True,
-        },
-        {
-            'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds',
-            'only_matching': True,
-        },
-        {
-            # videoId pattern
-            'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned',
-            'only_matching': True,
-        },
-    ]
-
-    def _find_video_id(self, webpage):
-        res_id = [
-            r'"video_id"\s*:\s*"(.*?)"',
-            r'class="hero-poster[^"]*?"[^>]*id="(.+?)"',
-            r'data-video-id="(.+?)"',
-            r'<object id="vid_(.+?)"',
-            r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
-            r'videoId&quot;\s*:\s*&quot;(.+?)&quot;',
-            r'videoId["\']\s*:\s*["\']([^"\']+?)["\']',
-        ]
-        return self._search_regex(res_id, webpage, 'video id', default=None)
+    _TESTS = [{
+        'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
+        'md5': 'd2e1586d9987d40fad7867bf96a018ea',
+        'info_dict': {
+            'id': '8f862beef863986b2785559b9e1aa599',
+            'ext': 'mp4',
+            'title': 'The Last of Us Review',
+            'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c',
+            'timestamp': 1370440800,
+            'upload_date': '20130605',
+            'tags': 'count:9',
+        }
+    }, {
+        'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data',
+        'md5': 'f1581a6fe8c5121be5b807684aeac3f6',
+        'info_dict': {
+            'id': 'ee10d774b508c9b8ec07e763b9125b91',
+            'ext': 'mp4',
+            'title': 'What\'s New Now: Is GoGo Snooping on Your Data?',
+            'description': 'md5:817a20299de610bd56f13175386da6fa',
+            'timestamp': 1420571160,
+            'upload_date': '20150106',
+            'tags': 'count:4',
+        }
+    }, {
+        'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix',
+        'only_matching': True,
+    }]
 
 
     def _real_extract(self, url):
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        name_or_id = mobj.group('name_or_id')
-        page_type = mobj.group('type')
-        webpage = self._download_webpage(url, name_or_id)
-        if page_type != 'video':
-            multiple_urls = re.findall(
-                r'<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]',
-                webpage)
-            if multiple_urls:
-                entries = [self.url_result(u, ie='IGN') for u in multiple_urls]
-                return {
-                    '_type': 'playlist',
-                    'id': name_or_id,
-                    'entries': entries,
-                }
-
-        video_id = self._find_video_id(webpage)
-        if not video_id:
-            return self.url_result(self._search_regex(
-                self._EMBED_RE, webpage, 'embed url'))
-        return self._get_video_info(video_id)
-
-    def _get_video_info(self, video_id):
-        api_data = self._download_json(
-            self._API_URL_TEMPLATE % video_id, video_id)
+        display_id = self._match_id(url)
+        video = self._call_api(display_id)
+        video_id = video['videoId']
+        metadata = video['metadata']
+        title = metadata.get('longTitle') or metadata.get('title') or metadata['name']
 
 
         formats = []
         formats = []
-        m3u8_url = api_data['refs'].get('m3uUrl')
+        refs = video.get('refs') or {}
+
+        m3u8_url = refs.get('m3uUrl')
         if m3u8_url:
         if m3u8_url:
             formats.extend(self._extract_m3u8_formats(
             formats.extend(self._extract_m3u8_formats(
                 m3u8_url, video_id, 'mp4', 'm3u8_native',
                 m3u8_url, video_id, 'mp4', 'm3u8_native',
                 m3u8_id='hls', fatal=False))
                 m3u8_id='hls', fatal=False))
-        f4m_url = api_data['refs'].get('f4mUrl')
+
+        f4m_url = refs.get('f4mUrl')
         if f4m_url:
         if f4m_url:
             formats.extend(self._extract_f4m_formats(
             formats.extend(self._extract_f4m_formats(
                 f4m_url, video_id, f4m_id='hds', fatal=False))
                 f4m_url, video_id, f4m_id='hds', fatal=False))
-        for asset in api_data['assets']:
+
+        for asset in (video.get('assets') or []):
+            asset_url = asset.get('url')
+            if not asset_url:
+                continue
             formats.append({
             formats.append({
-                'url': asset['url'],
-                'tbr': asset.get('actual_bitrate_kbps'),
-                'fps': asset.get('frame_rate'),
+                'url': asset_url,
+                'tbr': int_or_none(asset.get('bitrate'), 1000),
+                'fps': int_or_none(asset.get('frame_rate')),
                 'height': int_or_none(asset.get('height')),
                 'height': int_or_none(asset.get('height')),
                 'width': int_or_none(asset.get('width')),
                 'width': int_or_none(asset.get('width')),
             })
             })
+
+        mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl'])
+        if mezzanine_url:
+            formats.append({
+                'ext': determine_ext(mezzanine_url, 'mp4'),
+                'format_id': 'mezzanine',
+                'preference': 1,
+                'url': mezzanine_url,
+            })
+
         self._sort_formats(formats)
         self._sort_formats(formats)
 
 
-        thumbnails = [{
-            'url': thumbnail['url']
-        } for thumbnail in api_data.get('thumbnails', [])]
+        thumbnails = []
+        for thumbnail in (video.get('thumbnails') or []):
+            thumbnail_url = thumbnail.get('url')
+            if not thumbnail_url:
+                continue
+            thumbnails.append({
+                'url': thumbnail_url,
+            })
 
 
-        metadata = api_data['metadata']
+        tags = []
+        for tag in (video.get('tags') or []):
+            display_name = tag.get('displayName')
+            if not display_name:
+                continue
+            tags.append(display_name)
 
 
         return {
         return {
-            'id': api_data.get('videoId') or video_id,
-            'title': metadata.get('longTitle') or metadata.get('name') or metadata.get['title'],
-            'description': metadata.get('description'),
+            'id': video_id,
+            'title': title,
+            'description': strip_or_none(metadata.get('description')),
             'timestamp': parse_iso8601(metadata.get('publishDate')),
             'timestamp': parse_iso8601(metadata.get('publishDate')),
             'duration': int_or_none(metadata.get('duration')),
             'duration': int_or_none(metadata.get('duration')),
-            'display_id': metadata.get('slug') or video_id,
-            'uploader_id': metadata.get('creator'),
+            'display_id': display_id,
             'thumbnails': thumbnails,
             'thumbnails': thumbnails,
             'formats': formats,
             'formats': formats,
+            'tags': tags,
         }
         }
 
 
 
 
-class OneUPIE(IGNIE):
-    _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)\.html'
-    IE_NAME = '1up.com'
-
+class IGNVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P<id>\d+)/(?:video|trailer)/'
     _TESTS = [{
     _TESTS = [{
-        'url': 'http://gamevideos.1up.com/video/id/34976.html',
-        'md5': 'c9cc69e07acb675c31a16719f909e347',
+        'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s',
+        'md5': 'dd9aca7ed2657c4e118d8b261e5e9de1',
         'info_dict': {
         'info_dict': {
-            'id': '34976',
+            'id': 'e9be7ea899a9bbfc0674accc22a36cc8',
             'ext': 'mp4',
             'ext': 'mp4',
-            'title': 'Sniper Elite V2 - Trailer',
-            'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826',
-            'timestamp': 1313099220,
-            'upload_date': '20110811',
-            'uploader_id': 'IGN',
+            'title': 'How Hitman Aims to Be Different Than Every Other Stealth Game - NYCC 2015',
+            'description': 'Taking out assassination targets in Hitman has never been more stylish.',
+            'timestamp': 1444665600,
+            'upload_date': '20151012',
         }
         }
+    }, {
+        'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds',
+        'only_matching': True,
+    }, {
+        # Youtube embed
+        'url': 'https://me.ign.com/ar/ratchet-clank-rift-apart/144327/trailer/embed',
+        'only_matching': True,
+    }, {
+        # Twitter embed
+        'url': 'http://adria.ign.com/sherlock-season-4/9687/trailer/embed',
+        'only_matching': True,
+    }, {
+        # Vimeo embed
+        'url': 'https://kr.ign.com/bic-2018/3307/trailer/embed',
+        'only_matching': True,
     }]
     }]
 
 
     def _real_extract(self, url):
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        result = super(OneUPIE, self)._real_extract(url)
-        result['id'] = mobj.group('name_or_id')
-        return result
-
-
-class PCMagIE(IGNIE):
-    _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?P<type>videos|article2)(/.+)?/(?P<name_or_id>.+)'
-    IE_NAME = 'pcmag'
+        video_id = self._match_id(url)
+        req = HEADRequest(url.rsplit('/', 1)[0] + '/embed')
+        url = self._request_webpage(req, video_id).geturl()
+        ign_url = compat_parse_qs(
+            compat_urllib_parse_urlparse(url).query).get('url', [None])[0]
+        if ign_url:
+            return self.url_result(ign_url, IGNIE.ie_key())
+        return self.url_result(url)
 
 
-    _EMBED_RE = r'iframe\.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content\.html?[^"]*url=([^"]+)["&]'
 
 
+class IGNArticleIE(IGNBaseIE):
+    _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P<id>[^/?&#]+)'
+    _PAGE_TYPE = 'article'
     _TESTS = [{
     _TESTS = [{
-        'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data',
-        'md5': '212d6154fd0361a2781075f1febbe9ad',
+        'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
         'info_dict': {
         'info_dict': {
-            'id': 'ee10d774b508c9b8ec07e763b9125b91',
-            'ext': 'mp4',
-            'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?',
-            'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3',
-            'timestamp': 1420571160,
-            'upload_date': '20150106',
-            'uploader_id': 'cozzipix@gmail.com',
-        }
+            'id': '524497489e4e8ff5848ece34',
+            'title': '100 Little Things in GTA 5 That Will Blow Your Mind',
+        },
+        'playlist': [
+            {
+                'info_dict': {
+                    'id': '5ebbd138523268b93c9141af17bec937',
+                    'ext': 'mp4',
+                    'title': 'GTA 5 Video Review',
+                    'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
+                    'timestamp': 1379339880,
+                    'upload_date': '20130916',
+                },
+            },
+            {
+                'info_dict': {
+                    'id': '638672ee848ae4ff108df2a296418ee2',
+                    'ext': 'mp4',
+                    'title': '26 Twisted Moments from GTA 5 in Slow Motion',
+                    'description': 'The twisted beauty of GTA 5 in stunning slow motion.',
+                    'timestamp': 1386878820,
+                    'upload_date': '20131212',
+                },
+            },
+        ],
+        'params': {
+            'playlist_items': '2-3',
+            'skip_download': True,
+        },
     }, {
     }, {
-        'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp',
-        'md5': '94130c1ca07ba0adb6088350681f16c1',
+        'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch',
         'info_dict': {
         'info_dict': {
-            'id': '042e560ba94823d43afcb12ddf7142ca',
-            'ext': 'mp4',
-            'title': 'HTC\'s Weird New Re Camera - What\'s New Now',
-            'description': 'md5:53433c45df96d2ea5d0fda18be2ca908',
-            'timestamp': 1412953920,
-            'upload_date': '20141010',
-            'uploader_id': 'chris_snyder@pcmag.com',
-        }
+            'id': '53ee806780a81ec46e0790f8',
+            'title': 'Rewind Theater - Wild Trailer Gamescom 2014',
+        },
+        'playlist_count': 2,
+    }, {
+        # videoId pattern
+        'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned',
+        'only_matching': True,
+    }, {
+        # Youtube embed
+        'url': 'https://www.ign.com/articles/2021-mvp-named-in-puppy-bowl-xvii',
+        'only_matching': True,
+    }, {
+        # IMDB embed
+        'url': 'https://www.ign.com/articles/2014/08/07/sons-of-anarchy-final-season-trailer',
+        'only_matching': True,
+    }, {
+        # Facebook embed
+        'url': 'https://www.ign.com/articles/2017/09/20/marvels-the-punisher-watch-the-new-trailer-for-the-netflix-series',
+        'only_matching': True,
+    }, {
+        # Brightcove embed
+        'url': 'https://www.ign.com/articles/2016/01/16/supergirl-goes-flying-with-martian-manhunter-in-new-clip',
+        'only_matching': True,
     }]
     }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        article = self._call_api(display_id)
+
+        def entries():
+            media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url'])
+            if media_url:
+                yield self.url_result(media_url, IGNIE.ie_key())
+            for content in (article.get('content') or []):
+                for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content):
+                    yield self.url_result(video_url)
+
+        return self.playlist_result(
+            entries(), article.get('articleId'),
+            strip_or_none(try_get(article, lambda x: x['metadata']['headline'])))