# zdf.py
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. int_or_none,
  7. unified_strdate,
  8. )
  9. def extract_from_xml_url(ie, video_id, xml_url):
  10. doc = ie._download_xml(
  11. xml_url, video_id,
  12. note='Downloading video info',
  13. errnote='Failed to download video info')
  14. title = doc.find('.//information/title').text
  15. description = doc.find('.//information/detail').text
  16. duration = int(doc.find('.//details/lengthSec').text)
  17. uploader_node = doc.find('.//details/originChannelTitle')
  18. uploader = None if uploader_node is None else uploader_node.text
  19. uploader_id_node = doc.find('.//details/originChannelId')
  20. uploader_id = None if uploader_id_node is None else uploader_id_node.text
  21. upload_date = unified_strdate(doc.find('.//details/airtime').text)
  22. def xml_to_format(fnode):
  23. video_url = fnode.find('url').text
  24. is_available = 'http://www.metafilegenerator' not in video_url
  25. format_id = fnode.attrib['basetype']
  26. format_m = re.match(r'''(?x)
  27. (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
  28. (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
  29. ''', format_id)
  30. ext = format_m.group('container')
  31. proto = format_m.group('proto').lower()
  32. quality = fnode.find('./quality').text
  33. abr = int(fnode.find('./audioBitrate').text) // 1000
  34. vbr_node = fnode.find('./videoBitrate')
  35. vbr = None if vbr_node is None else int(vbr_node.text) // 1000
  36. width_node = fnode.find('./width')
  37. width = None if width_node is None else int_or_none(width_node.text)
  38. height_node = fnode.find('./height')
  39. height = None if height_node is None else int_or_none(height_node.text)
  40. format_note = ''
  41. if not format_note:
  42. format_note = None
  43. return {
  44. 'format_id': format_id + '-' + quality,
  45. 'url': video_url,
  46. 'ext': ext,
  47. 'acodec': format_m.group('acodec'),
  48. 'vcodec': format_m.group('vcodec'),
  49. 'abr': abr,
  50. 'vbr': vbr,
  51. 'width': width,
  52. 'height': height,
  53. 'filesize': int_or_none(fnode.find('./filesize').text),
  54. 'format_note': format_note,
  55. 'protocol': proto,
  56. '_available': is_available,
  57. }
  58. format_nodes = doc.findall('.//formitaeten/formitaet')
  59. formats = list(filter(
  60. lambda f: f['_available'],
  61. map(xml_to_format, format_nodes)))
  62. ie._sort_formats(formats)
  63. return {
  64. 'id': video_id,
  65. 'title': title,
  66. 'description': description,
  67. 'duration': duration,
  68. 'uploader': uploader,
  69. 'uploader_id': uploader_id,
  70. 'upload_date': upload_date,
  71. 'formats': formats,
  72. }
  73. def extract_channel_from_xml_url(ie, channel_id, xml_url):
  74. doc = ie._download_xml(
  75. xml_url, channel_id,
  76. note='Downloading channel info',
  77. errnote='Failed to download channel info')
  78. title = doc.find('.//information/title').text
  79. description = doc.find('.//information/detail').text
  80. assets = [{'id': asset.find('./details/assetId').text,
  81. 'type': asset.find('./type').text,
  82. } for asset in doc.findall('.//teasers/teaser')]
  83. return {
  84. 'id': channel_id,
  85. 'title': title,
  86. 'description': description,
  87. 'assets': assets,
  88. }
  89. class ZDFIE(InfoExtractor):
  90. _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?'
  91. _TEST = {
  92. 'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt',
  93. 'info_dict': {
  94. 'id': '2037704',
  95. 'ext': 'webm',
  96. 'title': 'ZDFspezial - Ende des Machtpokers',
  97. 'description': 'Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial "Ende des Machtpokers - Große Koalition für Deutschland".',
  98. 'duration': 1022,
  99. 'uploader': 'spezial',
  100. 'uploader_id': '225948',
  101. 'upload_date': '20131127',
  102. },
  103. 'skip': 'Videos on ZDF.de are depublicised in short order',
  104. }
  105. def _extract_video(self, video_id):
  106. xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
  107. return extract_from_xml_url(self, video_id, xml_url)
  108. def _real_extract(self, url):
  109. return self._extract_video(self._match_id(url))
  110. class ZDFChannelIE(ZDFIE):
  111. _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*kanaluebersicht/)(?P<id>[0-9]+)'
  112. _TEST = {
  113. 'url': 'http://www.zdf.de/ZDFmediathek#/kanaluebersicht/1586442/sendung/Titanic',
  114. 'info_dict': {
  115. 'id': '1586442',
  116. 'title': 'Titanic',
  117. 'description': 'md5:444c048cfe3fdc2561be7de4bcbf1d04',
  118. },
  119. 'playlist_count': 3,
  120. }
  121. def _extract_channel(self, channel_id):
  122. def load_chunks(channel_id, chunk_length):
  123. offset = 0
  124. while True:
  125. url = ('http://www.zdf.de/ZDFmediathek/xmlservice/web/aktuellste?ak=web&offset=%d&maxLength=%d&id=%s'
  126. % (offset, chunk_length, channel_id))
  127. result = extract_channel_from_xml_url(self, channel_id, url)
  128. yield result
  129. if len(result['assets']) < chunk_length:
  130. return
  131. offset += chunk_length
  132. def load_channel(channel_id):
  133. chunks = list(load_chunks(channel_id, 50)) # The server rejects higher values
  134. assets = [asset for chunk in chunks for asset in chunk['assets']]
  135. video_ids = [asset['id'] for asset in
  136. filter(lambda asset: asset['type'] == 'video',
  137. assets)]
  138. topic_ids = [asset['id'] for asset in
  139. filter(lambda asset: asset['type'] == 'thema',
  140. assets)]
  141. if topic_ids:
  142. video_ids = reduce(list.__add__,
  143. [load_channel(topic_id)['video_ids']
  144. for topic_id in topic_ids],
  145. video_ids)
  146. result = chunks[0]
  147. result['video_ids'] = video_ids
  148. return result
  149. channel = load_channel(channel_id)
  150. return {
  151. '_type': 'playlist',
  152. 'id': channel['id'],
  153. 'title': channel['title'],
  154. 'description': channel['description'],
  155. 'entries': [self._extract_video(video_id)
  156. for video_id in channel['video_ids']],
  157. }
  158. def _real_extract(self, url):
  159. return self._extract_channel(self._match_id(url))