rtve.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import base64
  4. import io
  5. import re
  6. import sys
  7. from .common import InfoExtractor
  8. from ..compat import (
  9. compat_b64decode,
  10. compat_parse_qs,
  11. compat_struct_unpack,
  12. compat_urllib_parse_urlparse,
  13. )
  14. from ..utils import (
  15. determine_ext,
  16. ExtractorError,
  17. float_or_none,
  18. qualities,
  19. remove_end,
  20. remove_start,
  21. std_headers,
  22. )
  23. _bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x))
  24. class RTVEALaCartaIE(InfoExtractor):
  25. IE_NAME = 'rtve.es:play'
  26. IE_DESC = 'RTVE Play'
  27. _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?((alacarta|playz?)/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)'
  28. _TESTS = [{
  29. 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',
  30. 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43',
  31. 'info_dict': {
  32. 'id': '2491869',
  33. 'ext': 'mp4',
  34. 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia',
  35. 'duration': 5024.566,
  36. 'series': 'Balonmano',
  37. },
  38. 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
  39. }, {
  40. 'url': 'http://www.rtve.es/play/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',
  41. 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43',
  42. 'info_dict': {
  43. 'id': '2491869',
  44. 'ext': 'mp4',
  45. 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia',
  46. 'duration': 5024.566,
  47. 'series': 'Balonmano',
  48. },
  49. 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
  50. }, {
  51. 'url': 'http://www.rtve.es/playz/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',
  52. 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43',
  53. 'info_dict': {
  54. 'id': '2491869',
  55. 'ext': 'mp4',
  56. 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia',
  57. 'duration': 5024.566,
  58. 'series': 'Balonmano',
  59. },
  60. 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
  61. }, {
  62. 'note': 'Live stream',
  63. 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/',
  64. 'info_dict': {
  65. 'id': '1694255',
  66. 'ext': 'mp4',
  67. 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  68. 'is_live': True,
  69. },
  70. 'params': {
  71. 'skip_download': 'live stream',
  72. },
  73. }, {
  74. 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/',
  75. 'md5': 'd850f3c8731ea53952ebab489cf81cbf',
  76. 'info_dict': {
  77. 'id': '4236788',
  78. 'ext': 'mp4',
  79. 'title': 'Servir y proteger - Capítulo 104',
  80. 'duration': 3222.0,
  81. },
  82. 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
  83. }, {
  84. 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve',
  85. 'only_matching': True,
  86. }, {
  87. 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/',
  88. 'only_matching': True,
  89. }, {
  90. 'url': 'https://www.rtve.es/play/videos/modulos/capitulos/11332/?currentpage=pf_serie',
  91. 'info_dict': {
  92. 'id': '11332',
  93. },
  94. 'playlist_mincount': 20,
  95. }]
  96. def _real_initialize(self):
  97. user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8')
  98. self._manager = self._download_json(
  99. 'http://www.rtve.es/odin/loki/' + user_agent_b64,
  100. None, 'Fetching manager info')['manager']
  101. @staticmethod
  102. def _decrypt_url(png):
  103. encrypted_data = io.BytesIO(compat_b64decode(png)[8:])
  104. while True:
  105. length = compat_struct_unpack('!I', encrypted_data.read(4))[0]
  106. chunk_type = encrypted_data.read(4)
  107. if chunk_type == b'IEND':
  108. break
  109. data = encrypted_data.read(length)
  110. if chunk_type == b'tEXt':
  111. alphabet_data, text = data.split(b'\0')
  112. quality, url_data = text.split(b'%%')
  113. alphabet = []
  114. e = 0
  115. d = 0
  116. for l in _bytes_to_chr(alphabet_data):
  117. if d == 0:
  118. alphabet.append(l)
  119. d = e = (e + 1) % 4
  120. else:
  121. d -= 1
  122. url = ''
  123. f = 0
  124. e = 3
  125. b = 1
  126. for letter in _bytes_to_chr(url_data):
  127. if f == 0:
  128. l = int(letter) * 10
  129. f = 1
  130. else:
  131. if e == 0:
  132. l += int(letter)
  133. url += alphabet[l]
  134. e = (b + 3) % 4
  135. f = 0
  136. b += 1
  137. else:
  138. e -= 1
  139. yield quality.decode(), url
  140. encrypted_data.read(4) # CRC
  141. def _extract_png_formats(self, video_id):
  142. png = self._download_webpage(
  143. 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id),
  144. video_id, 'Downloading url information', query={'q': 'v2'})
  145. q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL'])
  146. formats = []
  147. for quality, video_url in self._decrypt_url(png):
  148. ext = determine_ext(video_url)
  149. if ext == 'm3u8':
  150. formats.extend(self._extract_m3u8_formats(
  151. video_url, video_id, 'mp4', 'm3u8_native',
  152. m3u8_id='hls', fatal=False))
  153. elif ext == 'mpd':
  154. formats.extend(self._extract_mpd_formats(
  155. video_url, video_id, 'dash', fatal=False))
  156. else:
  157. formats.append({
  158. 'format_id': quality,
  159. 'quality': q(quality),
  160. 'url': video_url,
  161. })
  162. self._sort_formats(formats)
  163. return formats
  164. def _extract_playlist(self, url, playlist_id):
  165. webpage = self._download_webpage(url, playlist_id)
  166. matches = re.findall(r'''<a\b[^>]*\bhref\s*=\s*["'](%s)''' % (self._VALID_URL, ), webpage)
  167. return self.playlist_from_matches(matches, playlist_id=playlist_id, getter=lambda x: x[0], ie=self.ie_key())
  168. def _real_extract(self, url):
  169. video_id = self._match_id(url)
  170. qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
  171. if 'pf_serie' == qs.get('currentpage', [None])[-1]:
  172. return self._extract_playlist(url, video_id)
  173. info = self._download_json(
  174. 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id,
  175. video_id)['page']['items'][0]
  176. if info['state'] == 'DESPU':
  177. raise ExtractorError('The video is no longer available', expected=True)
  178. title = info['title'].strip()
  179. formats = self._extract_png_formats(video_id)
  180. subtitles = None
  181. sbt_file = info.get('sbtFile')
  182. if sbt_file:
  183. subtitles = self.extract_subtitles(video_id, sbt_file)
  184. is_live = info.get('live') is True
  185. return {
  186. 'id': video_id,
  187. 'title': self._live_title(title) if is_live else title,
  188. 'formats': formats,
  189. 'thumbnail': info.get('image'),
  190. 'subtitles': subtitles,
  191. 'duration': float_or_none(info.get('duration'), 1000),
  192. 'is_live': is_live,
  193. 'series': info.get('programTitle'),
  194. }
  195. def _get_subtitles(self, video_id, sub_file):
  196. subs = self._download_json(
  197. sub_file + '.json', video_id,
  198. 'Downloading subtitles info')['page']['items']
  199. return dict(
  200. (s['lang'], [{'ext': 'vtt', 'url': s['src']}])
  201. for s in subs)
  202. class RTVEInfantilIE(RTVEALaCartaIE):
  203. IE_NAME = 'rtve.es:infantil'
  204. IE_DESC = 'RTVE infantil'
  205. _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P<id>[0-9]+)/'
  206. _TESTS = [{
  207. 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/',
  208. 'md5': '5747454717aedf9f9fdf212d1bcfc48d',
  209. 'info_dict': {
  210. 'id': '3040283',
  211. 'ext': 'mp4',
  212. 'title': 'Maneras de vivir',
  213. 'thumbnail': r're:https?://.+/1426182947956\.JPG',
  214. 'duration': 357.958,
  215. },
  216. 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
  217. }]
  218. class RTVELiveIE(RTVEALaCartaIE):
  219. IE_NAME = 'rtve.es:live'
  220. IE_DESC = 'RTVE.es live streams'
  221. _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)'
  222. _TESTS = [{
  223. 'url': 'http://www.rtve.es/directo/la-1/',
  224. 'info_dict': {
  225. 'id': 'la-1',
  226. 'ext': 'mp4',
  227. 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  228. },
  229. 'params': {
  230. 'skip_download': 'live stream',
  231. }
  232. }]
  233. def _real_extract(self, url):
  234. mobj = re.match(self._VALID_URL, url)
  235. video_id = mobj.group('id')
  236. webpage = self._download_webpage(url, video_id)
  237. title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es')
  238. title = remove_start(title, 'Estoy viendo ')
  239. vidplayer_id = self._search_regex(
  240. (r'playerId=player([0-9]+)',
  241. r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)',
  242. r'data-id=["\'](\d+)'),
  243. webpage, 'internal video ID')
  244. return {
  245. 'id': video_id,
  246. 'title': self._live_title(title),
  247. 'formats': self._extract_png_formats(vidplayer_id),
  248. 'is_live': True,
  249. }
  250. class RTVETelevisionIE(InfoExtractor):
  251. IE_NAME = 'rtve.es:television'
  252. _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+).shtml'
  253. _TEST = {
  254. 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml',
  255. 'info_dict': {
  256. 'id': '3069778',
  257. 'ext': 'mp4',
  258. 'title': 'Documentos TV - La revolución del móvil',
  259. 'duration': 3496.948,
  260. },
  261. 'params': {
  262. 'skip_download': True,
  263. },
  264. }
  265. def _real_extract(self, url):
  266. page_id = self._match_id(url)
  267. webpage = self._download_webpage(url, page_id)
  268. alacarta_url = self._search_regex(
  269. r'data-location="alacarta_videos"[^<]+url&quot;:&quot;(http://www\.rtve\.es/alacarta.+?)&',
  270. webpage, 'alacarta url', default=None)
  271. if alacarta_url is None:
  272. raise ExtractorError(
  273. 'The webpage doesn\'t contain any video', expected=True)
  274. return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key())