bbc.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556
  1. from __future__ import unicode_literals
  2. import xml.etree.ElementTree
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. ExtractorError,
  6. parse_duration,
  7. int_or_none,
  8. )
  9. from ..compat import compat_HTTPError
  10. import re
  11. class BBCCoUkIE(InfoExtractor):
  12. IE_NAME = 'bbc.co.uk'
  13. IE_DESC = 'BBC iPlayer'
  14. _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
  15. mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s'
  16. _TESTS = [
  17. {
  18. 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
  19. 'info_dict': {
  20. 'id': 'b039d07m',
  21. 'ext': 'flv',
  22. 'title': 'Kaleidoscope, Leonard Cohen',
  23. 'description': 'The Canadian poet and songwriter reflects on his musical career.',
  24. 'duration': 1740,
  25. },
  26. 'params': {
  27. # rtmp download
  28. 'skip_download': True,
  29. }
  30. },
  31. {
  32. 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
  33. 'info_dict': {
  34. 'id': 'b00yng1d',
  35. 'ext': 'flv',
  36. 'title': 'The Man in Black: Series 3: The Printed Name',
  37. 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
  38. 'duration': 1800,
  39. },
  40. 'params': {
  41. # rtmp download
  42. 'skip_download': True,
  43. },
  44. 'skip': 'Episode is no longer available on BBC iPlayer Radio',
  45. },
  46. {
  47. 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
  48. 'info_dict': {
  49. 'id': 'b00yng1d',
  50. 'ext': 'flv',
  51. 'title': 'The Voice UK: Series 3: Blind Auditions 5',
  52. 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
  53. 'duration': 5100,
  54. },
  55. 'params': {
  56. # rtmp download
  57. 'skip_download': True,
  58. },
  59. 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
  60. },
  61. {
  62. 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
  63. 'info_dict': {
  64. 'id': 'b03k3pb7',
  65. 'ext': 'flv',
  66. 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
  67. 'description': '2. Invasion',
  68. 'duration': 3600,
  69. },
  70. 'params': {
  71. # rtmp download
  72. 'skip_download': True,
  73. },
  74. 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
  75. }, {
  76. 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
  77. 'info_dict': {
  78. 'id': 'b04v209v',
  79. 'ext': 'flv',
  80. 'title': 'Pete Tong, The Essential New Tune Special',
  81. 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
  82. 'duration': 10800,
  83. },
  84. 'params': {
  85. # rtmp download
  86. 'skip_download': True,
  87. }
  88. }, {
  89. 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3',
  90. 'note': 'Audio',
  91. 'info_dict': {
  92. 'id': 'p02frcch',
  93. 'ext': 'flv',
  94. 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix',
  95. 'description': 'French house superstar Madeon takes us out of the club and onto the after party.',
  96. 'duration': 3507,
  97. },
  98. 'params': {
  99. # rtmp download
  100. 'skip_download': True,
  101. }
  102. }, {
  103. 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
  104. 'note': 'Video',
  105. 'info_dict': {
  106. 'id': 'p025c103',
  107. 'ext': 'flv',
  108. 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
  109. 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
  110. 'duration': 226,
  111. },
  112. 'params': {
  113. # rtmp download
  114. 'skip_download': True,
  115. }
  116. }, {
  117. 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
  118. 'info_dict': {
  119. 'id': 'p02n76xf',
  120. 'ext': 'flv',
  121. 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
  122. 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
  123. 'duration': 3540,
  124. },
  125. 'params': {
  126. # rtmp download
  127. 'skip_download': True,
  128. },
  129. 'skip': 'geolocation',
  130. }, {
  131. 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
  132. 'info_dict': {
  133. 'id': 'b05zmgw1',
  134. 'ext': 'flv',
  135. 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
  136. 'title': 'Royal Academy Summer Exhibition',
  137. 'duration': 3540,
  138. },
  139. 'params': {
  140. # rtmp download
  141. 'skip_download': True,
  142. },
  143. 'skip': 'geolocation',
  144. }, {
  145. 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
  146. 'only_matching': True,
  147. }, {
  148. 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
  149. 'only_matching': True,
  150. }, {
  151. 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
  152. 'only_matching': True,
  153. }
  154. ]
  155. def _extract_asx_playlist(self, connection, programme_id):
  156. asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
  157. return [ref.get('href') for ref in asx.findall('./Entry/ref')]
  158. def _extract_connection(self, connection, programme_id):
  159. formats = []
  160. protocol = connection.get('protocol')
  161. supplier = connection.get('supplier')
  162. if protocol == 'http':
  163. href = connection.get('href')
  164. # ASX playlist
  165. if supplier == 'asx':
  166. for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
  167. formats.append({
  168. 'url': ref,
  169. 'format_id': 'ref%s_%s' % (i, supplier),
  170. })
  171. # Direct link
  172. else:
  173. formats.append({
  174. 'url': href,
  175. 'format_id': supplier,
  176. })
  177. elif protocol == 'rtmp':
  178. application = connection.get('application', 'ondemand')
  179. auth_string = connection.get('authString')
  180. identifier = connection.get('identifier')
  181. server = connection.get('server')
  182. formats.append({
  183. 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
  184. 'play_path': identifier,
  185. 'app': '%s?%s' % (application, auth_string),
  186. 'page_url': 'http://www.bbc.co.uk',
  187. 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
  188. 'rtmp_live': False,
  189. 'ext': 'flv',
  190. 'format_id': supplier,
  191. })
  192. return formats
  193. def _extract_items(self, playlist):
  194. return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item')
  195. def _extract_medias(self, media_selection):
  196. error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')
  197. if error is not None:
  198. raise ExtractorError(
  199. '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True)
  200. return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
  201. def _extract_connections(self, media):
  202. return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection')
  203. def _extract_video(self, media, programme_id):
  204. formats = []
  205. vbr = int(media.get('bitrate'))
  206. vcodec = media.get('encoding')
  207. service = media.get('service')
  208. width = int(media.get('width'))
  209. height = int(media.get('height'))
  210. file_size = int(media.get('media_file_size'))
  211. for connection in self._extract_connections(media):
  212. conn_formats = self._extract_connection(connection, programme_id)
  213. for format in conn_formats:
  214. format.update({
  215. 'format_id': '%s_%s' % (service, format['format_id']),
  216. 'width': width,
  217. 'height': height,
  218. 'vbr': vbr,
  219. 'vcodec': vcodec,
  220. 'filesize': file_size,
  221. })
  222. formats.extend(conn_formats)
  223. return formats
  224. def _extract_audio(self, media, programme_id):
  225. formats = []
  226. abr = int(media.get('bitrate'))
  227. acodec = media.get('encoding')
  228. service = media.get('service')
  229. for connection in self._extract_connections(media):
  230. conn_formats = self._extract_connection(connection, programme_id)
  231. for format in conn_formats:
  232. format.update({
  233. 'format_id': '%s_%s' % (service, format['format_id']),
  234. 'abr': abr,
  235. 'acodec': acodec,
  236. })
  237. formats.extend(conn_formats)
  238. return formats
  239. def _get_subtitles(self, media, programme_id):
  240. subtitles = {}
  241. for connection in self._extract_connections(media):
  242. captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
  243. lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
  244. ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
  245. srt = ''
  246. def _extract_text(p):
  247. if p.text is not None:
  248. stripped_text = p.text.strip()
  249. if stripped_text:
  250. return stripped_text
  251. return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
  252. for pos, p in enumerate(ps):
  253. srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
  254. subtitles[lang] = [
  255. {
  256. 'url': connection.get('href'),
  257. 'ext': 'ttml',
  258. },
  259. {
  260. 'data': srt,
  261. 'ext': 'srt',
  262. },
  263. ]
  264. return subtitles
  265. def _download_media_selector(self, programme_id):
  266. try:
  267. media_selection = self._download_xml(
  268. self.mediaselector_url % programme_id,
  269. programme_id, 'Downloading media selection XML')
  270. except ExtractorError as ee:
  271. if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
  272. media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8'))
  273. else:
  274. raise
  275. formats = []
  276. subtitles = None
  277. for media in self._extract_medias(media_selection):
  278. kind = media.get('kind')
  279. if kind == 'audio':
  280. formats.extend(self._extract_audio(media, programme_id))
  281. elif kind == 'video':
  282. formats.extend(self._extract_video(media, programme_id))
  283. elif kind == 'captions':
  284. subtitles = self.extract_subtitles(media, programme_id)
  285. return formats, subtitles
  286. def _download_playlist(self, playlist_id):
  287. try:
  288. playlist = self._download_json(
  289. 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
  290. playlist_id, 'Downloading playlist JSON')
  291. version = playlist.get('defaultAvailableVersion')
  292. if version:
  293. smp_config = version['smpConfig']
  294. title = smp_config['title']
  295. description = smp_config['summary']
  296. for item in smp_config['items']:
  297. kind = item['kind']
  298. if kind != 'programme' and kind != 'radioProgramme':
  299. continue
  300. programme_id = item.get('vpid')
  301. duration = int(item.get('duration'))
  302. formats, subtitles = self._download_media_selector(programme_id)
  303. return programme_id, title, description, duration, formats, subtitles
  304. except ExtractorError as ee:
  305. if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
  306. raise
  307. # fallback to legacy playlist
  308. playlist = self._download_xml(
  309. 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id,
  310. playlist_id, 'Downloading legacy playlist XML')
  311. no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
  312. if no_items is not None:
  313. reason = no_items.get('reason')
  314. if reason == 'preAvailability':
  315. msg = 'Episode %s is not yet available' % playlist_id
  316. elif reason == 'postAvailability':
  317. msg = 'Episode %s is no longer available' % playlist_id
  318. elif reason == 'noMedia':
  319. msg = 'Episode %s is not currently available' % playlist_id
  320. else:
  321. msg = 'Episode %s is not available: %s' % (playlist_id, reason)
  322. raise ExtractorError(msg, expected=True)
  323. for item in self._extract_items(playlist):
  324. kind = item.get('kind')
  325. if kind != 'programme' and kind != 'radioProgramme':
  326. continue
  327. title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
  328. description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
  329. programme_id = item.get('identifier')
  330. duration = int(item.get('duration'))
  331. formats, subtitles = self._download_media_selector(programme_id)
  332. return programme_id, title, description, duration, formats, subtitles
  333. def _real_extract(self, url):
  334. group_id = self._match_id(url)
  335. webpage = self._download_webpage(url, group_id, 'Downloading video page')
  336. programme_id = None
  337. tviplayer = self._search_regex(
  338. r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
  339. webpage, 'player', default=None)
  340. if tviplayer:
  341. player = self._parse_json(tviplayer, group_id).get('player', {})
  342. duration = int_or_none(player.get('duration'))
  343. programme_id = player.get('vpid')
  344. if not programme_id:
  345. programme_id = self._search_regex(
  346. r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None)
  347. if programme_id:
  348. formats, subtitles = self._download_media_selector(programme_id)
  349. title = self._og_search_title(webpage)
  350. description = self._search_regex(
  351. r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
  352. webpage, 'description', fatal=False)
  353. else:
  354. programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
  355. self._sort_formats(formats)
  356. return {
  357. 'id': programme_id,
  358. 'title': title,
  359. 'description': description,
  360. 'thumbnail': self._og_search_thumbnail(webpage, default=None),
  361. 'duration': duration,
  362. 'formats': formats,
  363. 'subtitles': subtitles,
  364. }
  365. class BBCNewsIE(BBCCoUkIE):
  366. IE_NAME = 'bbc.com'
  367. IE_DESC = 'BBC news'
  368. _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P<id>[^/]+)$'
  369. mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'
  370. _TESTS = [{
  371. 'url': 'http://www.bbc.com/news/world-europe-32668511',
  372. 'info_dict': {
  373. 'id': 'world-europe-32668511',
  374. 'title': 'Russia stages massive WW2 parade despite Western boycott',
  375. },
  376. 'playlist_count': 2,
  377. },{
  378. 'url': 'http://www.bbc.com/news/business-28299555',
  379. 'info_dict': {
  380. 'id': 'business-28299555',
  381. 'title': 'Farnborough Airshow: Video highlights',
  382. },
  383. 'playlist_count': 9,
  384. },{
  385. 'url': 'http://www.bbc.com/news/world-europe-32041533',
  386. 'note': 'Video',
  387. 'info_dict': {
  388. 'id': 'p02mprgb',
  389. 'ext': 'mp4',
  390. 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
  391. 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
  392. 'duration': 47,
  393. },
  394. 'params': {
  395. 'skip_download': True,
  396. }
  397. },{
  398. 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
  399. 'note': 'Video',
  400. 'info_dict': {
  401. 'id': 'NA',
  402. 'ext': 'mp4',
  403. 'title': 'YPG - Tel Abyad..n tamam. kontrol.m.zde',
  404. 'duration': 47,
  405. },
  406. 'params': {
  407. 'skip_download': True,
  408. }
  409. },{
  410. 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
  411. 'note': 'Video',
  412. 'info_dict': {
  413. 'id': '39275083',
  414. 'ext': 'mp4',
  415. 'title': 'Honduras militariza sus hospitales por nuevo esc.ndalo de corrupci.n',
  416. 'duration': 87,
  417. },
  418. 'params': {
  419. 'skip_download': True,
  420. }
  421. }]
  422. def _real_extract(self, url):
  423. list_id = self._match_id(url)
  424. webpage = self._download_webpage(url, list_id)
  425. list_title = self._html_search_regex(r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'list title')
  426. pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None)
  427. if pubdate:
  428. pubdate = pubdate.replace('-','')
  429. ret = []
  430. jsent = []
  431. # works with bbc.com/news/something-something-123456 articles
  432. jsent = map(
  433. lambda m: self._parse_json(m,list_id),
  434. re.findall(r"data-media-meta='({[^']+})'", webpage)
  435. )
  436. if len(jsent) == 0:
  437. # http://www.bbc.com/news/video_and_audio/international
  438. # and single-video articles
  439. masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None)
  440. if masset:
  441. jmasset = self._parse_json(masset,list_id)
  442. for key, val in jmasset.get('videos',{}).items():
  443. for skey, sval in val.items():
  444. sval['id'] = skey
  445. jsent.append(sval)
  446. if len(jsent) == 0:
  447. # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc}
  448. # in http://www.bbc.com/news/video_and_audio/international
  449. # prone to breaking if entries have sourceFiles list
  450. jsent = map(
  451. lambda m: self._parse_json(m,list_id),
  452. re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage)
  453. )
  454. if len(jsent) == 0:
  455. raise ExtractorError('No video found', expected=True)
  456. for jent in jsent:
  457. programme_id = jent.get('externalId')
  458. xml_url = jent.get('hxref')
  459. title = jent.get('caption',list_title)
  460. duration = parse_duration(jent.get('duration'))
  461. description = list_title + ' - ' + jent.get('caption','')
  462. thumbnail = None
  463. if jent.has_key('image'):
  464. thumbnail=jent['image'].get('href')
  465. formats = []
  466. subtitles = []
  467. if programme_id:
  468. formats, subtitles = self._download_media_selector(programme_id)
  469. elif jent.has_key('sourceFiles'):
  470. # mediaselector not used at
  471. # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu
  472. for key, val in jent['sourceFiles'].items():
  473. formats.append( {
  474. 'ext': val.get('encoding'),
  475. 'url': val.get('url'),
  476. 'filesize': int(val.get('filesize')),
  477. 'format_id': key
  478. } )
  479. elif xml_url:
  480. # Cheap fallback
  481. # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml
  482. xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)')
  483. programme_id = self._search_regex(r'<mediator [^>]*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)')
  484. formats, subtitles = self._download_media_selector(programme_id)
  485. if len(formats) == 0:
  486. raise ExtractorError('unsupported json media entry.\n '+str(jent)+'\n')
  487. self._sort_formats(formats)
  488. ret.append( {
  489. 'id': jent.get('programme_id',jent.get('id')),
  490. 'uploader': 'BBC News',
  491. 'upload_date': pubdate,
  492. 'title': title,
  493. 'description': description,
  494. 'thumbnail': thumbnail,
  495. 'duration': duration,
  496. 'formats': formats,
  497. 'subtitles': subtitles,
  498. } )
  499. if len(ret) > 0:
  500. return self.playlist_result(ret, list_id, list_title)
  501. raise ExtractorError('No video found', expected=True)