浏览代码

[zdf/common] Use API in ZDF extractor.

This also comes with a lot of extra format fields.
Fixes #1518
Philipp Hagemeister 11 年之前
父节点
当前提交
02dbf93f0e
共有 5 个文件被更改，包括 112 次插入和 70 次删除
  1. 3 17
      youtube_dl/FileDownloader.py
  2. 16 8
      youtube_dl/YoutubeDL.py
  3. 2 0
      youtube_dl/extractor/common.py
  4. 70 45
      youtube_dl/extractor/zdf.py
  5. 21 0
      youtube_dl/utils.py

+ 3 - 17
youtube_dl/FileDownloader.py

@@ -1,4 +1,3 @@
-import math
 import os
 import os
 import re
 import re
 import subprocess
 import subprocess
@@ -11,6 +10,7 @@ from .utils import (
     ContentTooShortError,
     ContentTooShortError,
     determine_ext,
     determine_ext,
     encodeFilename,
     encodeFilename,
+    format_bytes,
     sanitize_open,
     sanitize_open,
     timeconvert,
     timeconvert,
 )
 )
@@ -53,20 +53,6 @@ class FileDownloader(object):
         self._progress_hooks = []
         self._progress_hooks = []
         self.params = params
         self.params = params
 
 
-    @staticmethod
-    def format_bytes(bytes):
-        if bytes is None:
-            return 'N/A'
-        if type(bytes) is str:
-            bytes = float(bytes)
-        if bytes == 0.0:
-            exponent = 0
-        else:
-            exponent = int(math.log(bytes, 1024.0))
-        suffix = ['B','KiB','MiB','GiB','TiB','PiB','EiB','ZiB','YiB'][exponent]
-        converted = float(bytes) / float(1024 ** exponent)
-        return '%.2f%s' % (converted, suffix)
-
     @staticmethod
     @staticmethod
     def format_seconds(seconds):
     def format_seconds(seconds):
         (mins, secs) = divmod(seconds, 60)
         (mins, secs) = divmod(seconds, 60)
@@ -117,7 +103,7 @@ class FileDownloader(object):
     def format_speed(speed):
     def format_speed(speed):
         if speed is None:
         if speed is None:
             return '%10s' % '---b/s'
             return '%10s' % '---b/s'
-        return '%10s' % ('%s/s' % FileDownloader.format_bytes(speed))
+        return '%10s' % ('%s/s' % format_bytes(speed))
 
 
     @staticmethod
     @staticmethod
     def best_block_size(elapsed_time, bytes):
     def best_block_size(elapsed_time, bytes):
@@ -525,7 +511,7 @@ class FileDownloader(object):
                 self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
                 self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
                 return False
                 return False
 
 
-        data_len_str = self.format_bytes(data_len)
+        data_len_str = format_bytes(data_len)
         byte_counter = 0 + resume_len
         byte_counter = 0 + resume_len
         block_size = self.params.get('buffersize', 1024)
         block_size = self.params.get('buffersize', 1024)
         start = time.time()
         start = time.time()

+ 16 - 8
youtube_dl/YoutubeDL.py

@@ -30,6 +30,7 @@ from .utils import (
     DownloadError,
     DownloadError,
     encodeFilename,
     encodeFilename,
     ExtractorError,
     ExtractorError,
+    format_bytes,
     locked_file,
     locked_file,
     MaxDownloadsReached,
     MaxDownloadsReached,
     PostProcessingError,
     PostProcessingError,
@@ -867,9 +868,11 @@ class YoutubeDL(object):
 
 
     def list_formats(self, info_dict):
     def list_formats(self, info_dict):
         def format_note(fdict):
         def format_note(fdict):
-            if fdict.get('format_note') is not None:
-                return fdict['format_note']
             res = u''
             res = u''
+            if fdict.get('format_note') is not None:
+                res += fdict['format_note'] + u' '
+            if fdict.get('quality_name') is not None:
+                res += u'%s ' % fdict['quality_name']
             if fdict.get('vcodec') is not None:
             if fdict.get('vcodec') is not None:
                 res += u'%-5s' % fdict['vcodec']
                 res += u'%-5s' % fdict['vcodec']
             elif fdict.get('vbr') is not None:
             elif fdict.get('vbr') is not None:
@@ -886,25 +889,30 @@ class YoutubeDL(object):
                 res += 'audio'
                 res += 'audio'
             if fdict.get('abr') is not None:
             if fdict.get('abr') is not None:
                 res += u'@%3dk' % fdict['abr']
                 res += u'@%3dk' % fdict['abr']
+            if fdict.get('filesize') is not None:
+                if res:
+                    res += u', '
+                res += format_bytes(fdict['filesize'])
             return res
             return res
 
 
-        def line(format):
-            return (u'%-20s%-10s%-12s%s' % (
+        def line(format, idlen=20):
+            return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % (
                 format['format_id'],
                 format['format_id'],
                 format['ext'],
                 format['ext'],
                 self.format_resolution(format),
                 self.format_resolution(format),
                 format_note(format),
                 format_note(format),
-                )
-            )
+            ))
 
 
         formats = info_dict.get('formats', [info_dict])
         formats = info_dict.get('formats', [info_dict])
-        formats_s = list(map(line, formats))
+        idlen = max(len(u'format code'),
+                    max(len(f['format_id']) for f in formats))
+        formats_s = [line(f, idlen) for f in formats]
         if len(formats) > 1:
         if len(formats) > 1:
             formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
             formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
             formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
             formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
 
 
         header_line = line({
         header_line = line({
             'format_id': u'format code', 'ext': u'extension',
             'format_id': u'format code', 'ext': u'extension',
-            '_resolution': u'resolution', 'format_note': u'note'})
+            '_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen)
         self.to_screen(u'[info] Available formats for %s:\n%s\n%s' %
         self.to_screen(u'[info] Available formats for %s:\n%s\n%s' %
                        (info_dict['id'], header_line, u"\n".join(formats_s)))
                        (info_dict['id'], header_line, u"\n".join(formats_s)))

+ 2 - 0
youtube_dl/extractor/common.py

@@ -76,6 +76,8 @@ class InfoExtractor(object):
                     * acodec    Name of the audio codec in use
                     * acodec    Name of the audio codec in use
                     * vbr       Average video bitrate in KBit/s
                     * vbr       Average video bitrate in KBit/s
                     * vcodec    Name of the video codec in use
                     * vcodec    Name of the video codec in use
+                    * quality_name Human-readable name of the video quality.
+                    * filesize  The number of bytes, if known in advance
     webpage_url:    The url to the video webpage, if given to youtube-dl it
     webpage_url:    The url to the video webpage, if given to youtube-dl it
                     should allow to get the same result again. (It will be set
                     should allow to get the same result again. (It will be set
                     by YoutubeDL if it's missing)
                     by YoutubeDL if it's missing)

+ 70 - 45
youtube_dl/extractor/zdf.py

@@ -1,75 +1,100 @@
+import operator
 import re
 import re
 
 
 from .common import InfoExtractor
 from .common import InfoExtractor
 from ..utils import (
 from ..utils import (
-    determine_ext,
-    ExtractorError,
+    parse_xml_doc,
+    unified_strdate,
 )
 )
 
 
 
 
 class ZDFIE(InfoExtractor):
 class ZDFIE(InfoExtractor):
     _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
     _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
-    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
 
 
     def _real_extract(self, url):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
         video_id = mobj.group('video_id')
         video_id = mobj.group('video_id')
 
 
-        if mobj.group('hash'):
-            url = url.replace(u'#', u'', 1)
+        xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
+        info_xml = self._download_webpage(
+            xml_url, video_id, note=u'Downloading video info')
+        doc = parse_xml_doc(info_xml)
 
 
-        html = self._download_webpage(url, video_id)
-        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
-        if streams is None:
-            raise ExtractorError(u'No media url found.')
+        title = doc.find('.//information/title').text
+        description = doc.find('.//information/detail').text
+        uploader_node = doc.find('.//details/originChannelTitle')
+        uploader = None if uploader_node is None else uploader_node.text
+        duration_str = doc.find('.//details/length').text
+        duration_m = re.match(r'''(?x)^
+            (?P<hours>[0-9]{2})
+            :(?P<minutes>[0-9]{2})
+            :(?P<seconds>[0-9]{2})
+            (?:\.(?P<ms>[0-9]+)?)
+            ''', duration_str)
+        duration = (
+            (
+                (int(duration_m.group('hours')) * 60 * 60) +
+                (int(duration_m.group('minutes')) * 60) +
+                int(duration_m.group('seconds'))
+            )
+            if duration_m
+            else None
+        )
+        upload_date = unified_strdate(doc.find('.//details/airtime').text)
 
 
-        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
-        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
-        # choose first/default media type and highest quality for now
-        def stream_pref(s):
-            TYPE_ORDER = ['ostreaming', 'hstreaming', 'wstreaming']
+        def xml_to_format(fnode):
+            video_url = fnode.find('url').text
+            is_available = u'http://www.metafilegenerator' not in video_url
+
+            format_id = fnode.attrib['basetype']
+            format_m = re.match(r'''(?x)
+                (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
+                (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
+            ''', format_id)
+
+            PROTO_ORDER = ['http', 'rtmp', 'rtsp']
             try:
             try:
-                type_pref = TYPE_ORDER.index(s['media_type'])
+                proto_pref = -PROTO_ORDER.index(format_m.group('proto'))
             except ValueError:
             except ValueError:
-                type_pref = 999
+                proto_pref = 999
 
 
-            QUALITY_ORDER = ['veryhigh', '300']
+            quality = fnode.find('./quality').text
+            QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low']
             try:
             try:
-                quality_pref = QUALITY_ORDER.index(s['quality'])
+                quality_pref = -QUALITY_ORDER.index(quality)
             except ValueError:
             except ValueError:
                 quality_pref = 999
                 quality_pref = 999
 
 
-            return (type_pref, quality_pref)
-
-        sorted_streams = sorted(streams, key=stream_pref)
-        if not sorted_streams:
-            raise ExtractorError(u'No stream found.')
-        stream = sorted_streams[0]
-
-        media_link = self._download_webpage(
-            stream['video_url'],
-            video_id,
-            u'Get stream URL')
-
-        #MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
-        RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
+            abr = int(fnode.find('./audioBitrate').text) // 1000
+            vbr = int(fnode.find('./videoBitrate').text) // 1000
+            pref = (is_available, proto_pref, quality_pref, vbr, abr)
 
 
-        mobj = re.search(self._MEDIA_STREAM, media_link)
-        if mobj is None:
-            mobj = re.search(RTSP_STREAM, media_link)
-            if mobj is None:
-                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
-        video_url = mobj.group('video_url')
+            return {
+                'format_id': format_id,
+                'url': video_url,
+                'ext': format_m.group('container'),
+                'acodec': format_m.group('acodec'),
+                'vcodec': format_m.group('vcodec'),
+                'abr': abr,
+                'vbr': vbr,
+                'width': int(fnode.find('./width').text),
+                'height': int(fnode.find('./height').text),
+                'quality_name': quality,
+                'filesize': int(fnode.find('./filesize').text),
+                'format_note': None if is_available else u'(unavailable)',
+                '_pref': pref,
+            }
 
 
-        title = self._html_search_regex(
-            r'<h1(?: class="beitragHeadline")?>(.*?)</h1>',
-            html, u'title')
+        format_nodes = doc.findall('.//formitaeten/formitaet')
+        formats = sorted(map(xml_to_format, format_nodes),
+                         key=operator.itemgetter('_pref'))
 
 
         return {
         return {
             'id': video_id,
             'id': video_id,
-            'url': video_url,
             'title': title,
             'title': title,
-            'ext': determine_ext(video_url)
+            'formats': formats,
+            'description': description,
+            'uploader': uploader,
+            'duration': duration,
+            'upload_date': upload_date,
         }
         }

+ 21 - 0
youtube_dl/utils.py

@@ -8,6 +8,7 @@ import gzip
 import io
 import io
 import json
 import json
 import locale
 import locale
+import math
 import os
 import os
 import pipes
 import pipes
 import platform
 import platform
@@ -16,6 +17,7 @@ import ssl
 import socket
 import socket
 import sys
 import sys
 import traceback
 import traceback
+import xml.etree.ElementTree
 import zlib
 import zlib
 
 
 try:
 try:
@@ -1006,3 +1008,22 @@ def unsmuggle_url(smug_url):
     jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
     jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
     data = json.loads(jsond)
     data = json.loads(jsond)
     return url, data
     return url, data
+
+
+def parse_xml_doc(s):
+    assert isinstance(s, type(u''))
+    return xml.etree.ElementTree.fromstring(s.encode('utf-8'))
+
+
+def format_bytes(bytes):
+    if bytes is None:
+        return u'N/A'
+    if type(bytes) is str:
+        bytes = float(bytes)
+    if bytes == 0.0:
+        exponent = 0
+    else:
+        exponent = int(math.log(bytes, 1024.0))
+    suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
+    converted = float(bytes) / float(1024 ** exponent)
+    return u'%.2f%s' % (converted, suffix)