浏览代码

ArteTvIE: rewrite the extraction process to support the new site (fixes #875)

The video can be downloaded over RTMP or HTTP, but the best-quality format always seems to use RTMP.
Deleted the old methods.
Jaime Marquínez Ferrándiz 12 年之前
父节点
当前提交
75c9481224
共有 1 个文件被更改,包括 36 次插入86 次删除
  1. 36 86
      youtube_dl/extractor/arte.py

+ 36 - 86
youtube_dl/extractor/arte.py

@@ -1,53 +1,21 @@
 import re
 import re
-import socket
+import json
 
 
 from .common import InfoExtractor
 from .common import InfoExtractor
 from ..utils import (
 from ..utils import (
-    compat_http_client,
-    compat_str,
-    compat_urllib_error,
+    # Kept for the not-yet-implemented extractLiveStream method (currently commented out)
     compat_urllib_parse,
     compat_urllib_parse,
-    compat_urllib_request,
 
 
     ExtractorError,
     ExtractorError,
     unified_strdate,
     unified_strdate,
 )
 )
 
 
 class ArteTvIE(InfoExtractor):
 class ArteTvIE(InfoExtractor):
-    """arte.tv information extractor."""
-
-    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
+    _VALID_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
     _LIVE_URL = r'index-[0-9]+\.html$'
     _LIVE_URL = r'index-[0-9]+\.html$'
 
 
     IE_NAME = u'arte.tv'
     IE_NAME = u'arte.tv'
 
 
-    def fetch_webpage(self, url):
-        request = compat_urllib_request.Request(url)
-        try:
-            self.report_download_webpage(url)
-            webpage = compat_urllib_request.urlopen(request).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
-        except ValueError as err:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-        return webpage
-
-    def grep_webpage(self, url, regex, regexFlags, matchTuples):
-        page = self.fetch_webpage(url)
-        mobj = re.search(regex, page, regexFlags)
-        info = {}
-
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
-        for (i, key, err) in matchTuples:
-            if mobj.group(i) is None:
-                raise ExtractorError(err)
-            else:
-                info[key] = mobj.group(i)
-
-        return info
-
     # TODO implement Live Stream
     # TODO implement Live Stream
     # def extractLiveStream(self, url):
     # def extractLiveStream(self, url):
     #     video_lang = url.split('/')[-4]
     #     video_lang = url.split('/')[-4]
@@ -75,62 +43,44 @@ class ArteTvIE(InfoExtractor):
     #     )
     #     )
     #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))
     #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))
 
 
-    def extractPlus7Stream(self, url):
-        video_lang = url.split('/')[-3]
-        info = self.grep_webpage(
-            url,
-            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
-            0,
-            [
-                (1, 'url', u'Invalid URL: %s' % url)
-            ]
-        )
-        next_url = compat_urllib_parse.unquote(info.get('url'))
-        info = self.grep_webpage(
-            next_url,
-            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
-            0,
-            [
-                (1, 'url', u'Could not find <video> tag: %s' % url)
-            ]
-        )
-        next_url = compat_urllib_parse.unquote(info.get('url'))
-
-        info = self.grep_webpage(
-            next_url,
-            r'<video id="(.*?)".*?>.*?' +
-                '<name>(.*?)</name>.*?' +
-                '<dateVideo>(.*?)</dateVideo>.*?' +
-                '<url quality="hd">(.*?)</url>',
-            re.DOTALL,
-            [
-                (1, 'id',    u'could not extract video id: %s' % url),
-                (2, 'title', u'could not extract video title: %s' % url),
-                (3, 'date',  u'could not extract video date: %s' % url),
-                (4, 'url',   u'could not extract video url: %s' % url)
-            ]
-        )
-
-        return {
-            'id':           info.get('id'),
-            'url':          compat_urllib_parse.unquote(info.get('url')),
-            'uploader':     u'arte.tv',
-            'upload_date':  unified_strdate(info.get('date')),
-            'title':        info.get('title').decode('utf-8'),
-            'ext':          u'mp4',
-            'format':       u'NA',
-            'player_url':   None,
-        }
-
     def _real_extract(self, url):
     def _real_extract(self, url):
-        video_id = url.split('/')[-1]
-        self.report_extraction(video_id)
+        mobj = re.match(self._VALID_URL, url)
+        name = mobj.group('name')
+        # This is not a real id; it can be, for example, AJT for the news:
+        # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
+        video_id = mobj.group('id')
 
 
         if re.search(self._LIVE_URL, video_id) is not None:
         if re.search(self._LIVE_URL, video_id) is not None:
             raise ExtractorError(u'Arte live streams are not yet supported, sorry')
             raise ExtractorError(u'Arte live streams are not yet supported, sorry')
             # self.extractLiveStream(url)
             # self.extractLiveStream(url)
             # return
             # return
+
+        webpage = self._download_webpage(url, video_id)
+        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
+
+        json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
+        self.report_extraction(video_id)
+        info = json.loads(json_info)
+        player_info = info['videoJsonPlayer']
+
+        info_dict = {'id': player_info['VID'],
+                     'title': player_info['VTI'],
+                     'description': player_info['VDE'],
+                     'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
+                     'thumbnail': player_info['programImage'],
+                     }
+
+        formats = player_info['VSR'].values()
+        # We order the formats by quality
+        formats = sorted(formats, key=lambda f: int(f['height']))
+        # Pick the best quality
+        format_info = formats[-1]
+        if format_info['mediaType'] == u'rtmp':
+            info_dict['url'] = format_info['streamer']
+            info_dict['play_path'] = 'mp4:' + format_info['url']
+            info_dict['ext'] = 'mp4'
         else:
         else:
-            info = self.extractPlus7Stream(url)
+            info_dict['url'] = format_info['url']
+            info_dict['ext'] = 'mp4'
 
 
-        return [info]
+        return info_dict