canvas.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. from __future__ import unicode_literals
  2. import re
  3. import json
  4. from .common import InfoExtractor
  5. from .gigya import GigyaBaseIE
  6. from ..compat import compat_HTTPError
  7. from ..utils import (
  8. extract_attributes,
  9. ExtractorError,
  10. strip_or_none,
  11. float_or_none,
  12. int_or_none,
  13. merge_dicts,
  14. str_or_none,
  15. url_or_none,
  16. )
  17. class CanvasIE(InfoExtractor):
  18. _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)'
  19. _TESTS = [{
  20. 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  21. 'md5': '68993eda72ef62386a15ea2cf3c93107',
  22. 'info_dict': {
  23. 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  24. 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  25. 'ext': 'mp4',
  26. 'title': 'Nachtwacht: De Greystook',
  27. 'description': 'Nachtwacht: De Greystook',
  28. 'thumbnail': r're:^https?://.*\.jpg$',
  29. 'duration': 1468.04,
  30. },
  31. 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
  32. }, {
  33. 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
  34. 'only_matching': True,
  35. }]
  36. _GEO_BYPASS = False
  37. _HLS_ENTRY_PROTOCOLS_MAP = {
  38. 'HLS': 'm3u8_native',
  39. 'HLS_AES': 'm3u8',
  40. }
  41. _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1'
  42. def _real_extract(self, url):
  43. mobj = re.match(self._VALID_URL, url)
  44. site_id, video_id = mobj.group('site_id'), mobj.group('id')
  45. data = None
  46. if site_id != 'vrtvideo':
  47. # Old API endpoint, serves more formats but may fail for some videos
  48. data = self._download_json(
  49. 'https://mediazone.vrt.be/api/v1/%s/assets/%s'
  50. % (site_id, video_id), video_id, 'Downloading asset JSON',
  51. 'Unable to download asset JSON', fatal=False)
  52. # New API endpoint
  53. if not data:
  54. headers = self.geo_verification_headers()
  55. headers.update({'Content-Type': 'application/json'})
  56. token = self._download_json(
  57. '%s/tokens' % self._REST_API_BASE, video_id,
  58. 'Downloading token', data=b'', headers=headers)['vrtPlayerToken']
  59. data = self._download_json(
  60. '%s/videos/%s' % (self._REST_API_BASE, video_id),
  61. video_id, 'Downloading video JSON', query={
  62. 'vrtPlayerToken': token,
  63. 'client': '%s@PROD' % site_id,
  64. }, expected_status=400)
  65. if not data.get('title'):
  66. code = data.get('code')
  67. if code == 'AUTHENTICATION_REQUIRED':
  68. self.raise_login_required()
  69. elif code == 'INVALID_LOCATION':
  70. self.raise_geo_restricted(countries=['BE'])
  71. raise ExtractorError(data.get('message') or code, expected=True)
  72. title = data['title']
  73. description = data.get('description')
  74. formats = []
  75. for target in data['targetUrls']:
  76. format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type'))
  77. if not format_url or not format_type:
  78. continue
  79. format_type = format_type.upper()
  80. if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
  81. formats.extend(self._extract_m3u8_formats(
  82. format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
  83. m3u8_id=format_type, fatal=False))
  84. elif format_type == 'HDS':
  85. formats.extend(self._extract_f4m_formats(
  86. format_url, video_id, f4m_id=format_type, fatal=False))
  87. elif format_type == 'MPEG_DASH':
  88. formats.extend(self._extract_mpd_formats(
  89. format_url, video_id, mpd_id=format_type, fatal=False))
  90. elif format_type == 'HSS':
  91. formats.extend(self._extract_ism_formats(
  92. format_url, video_id, ism_id='mss', fatal=False))
  93. else:
  94. formats.append({
  95. 'format_id': format_type,
  96. 'url': format_url,
  97. })
  98. self._sort_formats(formats)
  99. subtitles = {}
  100. subtitle_urls = data.get('subtitleUrls')
  101. if isinstance(subtitle_urls, list):
  102. for subtitle in subtitle_urls:
  103. subtitle_url = subtitle.get('url')
  104. if subtitle_url and subtitle.get('type') == 'CLOSED':
  105. subtitles.setdefault('nl', []).append({'url': subtitle_url})
  106. return {
  107. 'id': video_id,
  108. 'display_id': video_id,
  109. 'title': title,
  110. 'description': description,
  111. 'formats': formats,
  112. 'duration': float_or_none(data.get('duration'), 1000),
  113. 'thumbnail': data.get('posterImageUrl'),
  114. 'subtitles': subtitles,
  115. }
  116. class CanvasEenIE(InfoExtractor):
  117. IE_DESC = 'canvas.be and een.be'
  118. _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  119. _TESTS = [{
  120. 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
  121. 'md5': 'ed66976748d12350b118455979cca293',
  122. 'info_dict': {
  123. 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
  124. 'display_id': 'de-afspraak-veilt-voor-de-warmste-week',
  125. 'ext': 'flv',
  126. 'title': 'De afspraak veilt voor de Warmste Week',
  127. 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6',
  128. 'thumbnail': r're:^https?://.*\.jpg$',
  129. 'duration': 49.02,
  130. },
  131. 'expected_warnings': ['is not a supported codec'],
  132. }, {
  133. # with subtitles
  134. 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167',
  135. 'info_dict': {
  136. 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625',
  137. 'display_id': 'pieter-0167',
  138. 'ext': 'mp4',
  139. 'title': 'Pieter 0167',
  140. 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e',
  141. 'thumbnail': r're:^https?://.*\.jpg$',
  142. 'duration': 2553.08,
  143. 'subtitles': {
  144. 'nl': [{
  145. 'ext': 'vtt',
  146. }],
  147. },
  148. },
  149. 'params': {
  150. 'skip_download': True,
  151. },
  152. 'skip': 'Pagina niet gevonden',
  153. }, {
  154. 'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan',
  155. 'info_dict': {
  156. 'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8',
  157. 'display_id': 'emma-pakt-thilly-aan',
  158. 'ext': 'mp4',
  159. 'title': 'Emma pakt Thilly aan',
  160. 'description': 'md5:c5c9b572388a99b2690030afa3f3bad7',
  161. 'thumbnail': r're:^https?://.*\.jpg$',
  162. 'duration': 118.24,
  163. },
  164. 'params': {
  165. 'skip_download': True,
  166. },
  167. 'expected_warnings': ['is not a supported codec'],
  168. }, {
  169. 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend',
  170. 'only_matching': True,
  171. }]
  172. def _real_extract(self, url):
  173. mobj = re.match(self._VALID_URL, url)
  174. site_id, display_id = mobj.group('site_id'), mobj.group('id')
  175. webpage = self._download_webpage(url, display_id)
  176. title = strip_or_none(self._search_regex(
  177. r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>',
  178. webpage, 'title', default=None) or self._og_search_title(
  179. webpage, default=None))
  180. video_id = self._html_search_regex(
  181. r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
  182. group='id')
  183. return {
  184. '_type': 'url_transparent',
  185. 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id),
  186. 'ie_key': CanvasIE.ie_key(),
  187. 'id': video_id,
  188. 'display_id': display_id,
  189. 'title': title,
  190. 'description': self._og_search_description(webpage),
  191. }
  192. class VrtNUIE(GigyaBaseIE):
  193. IE_DESC = 'VrtNU.be'
  194. _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  195. _TESTS = [{
  196. # Available via old API endpoint
  197. 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/',
  198. 'info_dict': {
  199. 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
  200. 'ext': 'mp4',
  201. 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)',
  202. 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7',
  203. 'duration': 1457.04,
  204. 'thumbnail': r're:^https?://.*\.jpg$',
  205. 'series': 'Postbus X',
  206. 'season': 'Seizoen 1989',
  207. 'season_number': 1989,
  208. 'episode': 'De zwarte weduwe',
  209. 'episode_number': 1,
  210. 'timestamp': 1595822400,
  211. 'upload_date': '20200727',
  212. },
  213. 'skip': 'This video is only available for registered users',
  214. 'params': {
  215. 'username': '<snip>',
  216. 'password': '<snip>',
  217. },
  218. 'expected_warnings': ['is not a supported codec'],
  219. }, {
  220. # Only available via new API endpoint
  221. 'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/',
  222. 'info_dict': {
  223. 'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1',
  224. 'ext': 'mp4',
  225. 'title': 'Aflevering 5',
  226. 'description': 'Wie valt door de mand tijdens een missie?',
  227. 'duration': 2967.06,
  228. 'season': 'Season 1',
  229. 'season_number': 1,
  230. 'episode_number': 5,
  231. },
  232. 'skip': 'This video is only available for registered users',
  233. 'params': {
  234. 'username': '<snip>',
  235. 'password': '<snip>',
  236. },
  237. 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'],
  238. }]
  239. _NETRC_MACHINE = 'vrtnu'
  240. _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy'
  241. _CONTEXT_ID = 'R3595707040'
  242. def _real_initialize(self):
  243. self._login()
  244. def _login(self):
  245. username, password = self._get_login_info()
  246. if username is None:
  247. return
  248. auth_data = {
  249. 'APIKey': self._APIKEY,
  250. 'targetEnv': 'jssdk',
  251. 'loginID': username,
  252. 'password': password,
  253. 'authMode': 'cookie',
  254. }
  255. auth_info = self._gigya_login(auth_data)
  256. # Sometimes authentication fails for no good reason, retry
  257. login_attempt = 1
  258. while login_attempt <= 3:
  259. try:
  260. # When requesting a token, no actual token is returned, but the
  261. # necessary cookies are set.
  262. self._request_webpage(
  263. 'https://token.vrt.be',
  264. None, note='Requesting a token', errnote='Could not get a token',
  265. headers={
  266. 'Content-Type': 'application/json',
  267. 'Referer': 'https://www.vrt.be/vrtnu/',
  268. },
  269. data=json.dumps({
  270. 'uid': auth_info['UID'],
  271. 'uidsig': auth_info['UIDSignature'],
  272. 'ts': auth_info['signatureTimestamp'],
  273. 'email': auth_info['profile']['email'],
  274. }).encode('utf-8'))
  275. except ExtractorError as e:
  276. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
  277. login_attempt += 1
  278. self.report_warning('Authentication failed')
  279. self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again')
  280. else:
  281. raise e
  282. else:
  283. break
  284. def _real_extract(self, url):
  285. display_id = self._match_id(url)
  286. webpage = self._download_webpage(url, display_id)
  287. attrs = extract_attributes(self._search_regex(
  288. r'(<nui-media[^>]+>)', webpage, 'media element'))
  289. video_id = attrs['videoid']
  290. publication_id = attrs.get('publicationid')
  291. if publication_id:
  292. video_id = publication_id + '$' + video_id
  293. page = (self._parse_json(self._search_regex(
  294. r'digitalData\s*=\s*({.+?});', webpage, 'digial data',
  295. default='{}'), video_id, fatal=False) or {}).get('page') or {}
  296. info = self._search_json_ld(webpage, display_id, default={})
  297. return merge_dicts(info, {
  298. '_type': 'url_transparent',
  299. 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
  300. 'ie_key': CanvasIE.ie_key(),
  301. 'id': video_id,
  302. 'display_id': display_id,
  303. 'season_number': int_or_none(page.get('episode_season')),
  304. })