ted.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. from __future__ import unicode_literals
  2. import json
  3. import re
  4. from .common import InfoExtractor
  5. from ..compat import compat_str
  6. from ..utils import (
  7. int_or_none,
  8. try_get,
  9. )
  10. class TEDIE(InfoExtractor):
  11. IE_NAME = 'ted'
  12. _VALID_URL = r'''(?x)
  13. (?P<proto>https?://)
  14. (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
  15. (
  16. (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
  17. |
  18. ((?P<type_talk>talks)) # We have a simple talk
  19. |
  20. (?P<type_watch>watch)/[^/]+/[^/]+
  21. )
  22. (/lang/(.*?))? # The url may contain the language
  23. /(?P<name>[\w-]+) # Here goes the name and then ".html"
  24. .*)$
  25. '''
  26. _TESTS = [{
  27. 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
  28. 'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
  29. 'info_dict': {
  30. 'id': '102',
  31. 'ext': 'mp4',
  32. 'title': 'The illusion of consciousness',
  33. 'description': ('Philosopher Dan Dennett makes a compelling '
  34. 'argument that not only don\'t we understand our own '
  35. 'consciousness, but that half the time our brains are '
  36. 'actively fooling us.'),
  37. 'uploader': 'Dan Dennett',
  38. 'width': 853,
  39. 'duration': 1308,
  40. }
  41. }, {
  42. 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
  43. 'md5': 'b899ac15e345fb39534d913f7606082b',
  44. 'info_dict': {
  45. 'id': 'tSVI8ta_P4w',
  46. 'ext': 'mp4',
  47. 'title': 'Vishal Sikka: The beauty and power of algorithms',
  48. 'thumbnail': r're:^https?://.+\.jpg',
  49. 'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
  50. 'upload_date': '20140122',
  51. 'uploader_id': 'TEDInstitute',
  52. 'uploader': 'TED Institute',
  53. },
  54. 'add_ie': ['Youtube'],
  55. }, {
  56. 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
  57. 'md5': '71b3ab2f4233012dce09d515c9c39ce2',
  58. 'info_dict': {
  59. 'id': '1972',
  60. 'ext': 'mp4',
  61. 'title': 'Be passionate. Be courageous. Be your best.',
  62. 'uploader': 'Gabby Giffords and Mark Kelly',
  63. 'description': 'md5:5174aed4d0f16021b704120360f72b92',
  64. 'duration': 1128,
  65. },
  66. }, {
  67. 'url': 'http://www.ted.com/playlists/who_are_the_hackers',
  68. 'info_dict': {
  69. 'id': '10',
  70. 'title': 'Who are the hackers?',
  71. },
  72. 'playlist_mincount': 6,
  73. }, {
  74. # contains a youtube video
  75. 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
  76. 'add_ie': ['Youtube'],
  77. 'info_dict': {
  78. 'id': '_ZG8HBuDjgc',
  79. 'ext': 'webm',
  80. 'title': 'Douglas Adams: Parrots the Universe and Everything',
  81. 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
  82. 'uploader': 'University of California Television (UCTV)',
  83. 'uploader_id': 'UCtelevision',
  84. 'upload_date': '20080522',
  85. },
  86. 'params': {
  87. 'skip_download': True,
  88. },
  89. }, {
  90. # YouTube video
  91. 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
  92. 'add_ie': ['Youtube'],
  93. 'info_dict': {
  94. 'id': 'aFBIPO-P7LM',
  95. 'ext': 'mp4',
  96. 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
  97. 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
  98. 'uploader': 'TEDx Talks',
  99. 'uploader_id': 'TEDxTalks',
  100. 'upload_date': '20111216',
  101. },
  102. 'params': {
  103. 'skip_download': True,
  104. },
  105. }]
  106. _NATIVE_FORMATS = {
  107. 'low': {'width': 320, 'height': 180},
  108. 'medium': {'width': 512, 'height': 288},
  109. 'high': {'width': 854, 'height': 480},
  110. }
  111. def _extract_info(self, webpage):
  112. info_json = self._search_regex(
  113. r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>',
  114. webpage, 'info json')
  115. return json.loads(info_json)
  116. def _real_extract(self, url):
  117. m = re.match(self._VALID_URL, url, re.VERBOSE)
  118. if m.group('type').startswith('embed'):
  119. desktop_url = m.group('proto') + 'www' + m.group('urlmain')
  120. return self.url_result(desktop_url, 'TED')
  121. name = m.group('name')
  122. if m.group('type_talk'):
  123. return self._talk_info(url, name)
  124. elif m.group('type_watch'):
  125. return self._watch_info(url, name)
  126. else:
  127. return self._playlist_videos_info(url, name)
  128. def _playlist_videos_info(self, url, name):
  129. '''Returns the videos of the playlist'''
  130. webpage = self._download_webpage(url, name,
  131. 'Downloading playlist webpage')
  132. info = self._extract_info(webpage)
  133. playlist_info = try_get(
  134. info, lambda x: x['__INITIAL_DATA__']['playlist'],
  135. dict) or info['playlist']
  136. playlist_entries = [
  137. self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
  138. for talk in try_get(
  139. info, lambda x: x['__INITIAL_DATA__']['talks'],
  140. dict) or info['talks']
  141. ]
  142. return self.playlist_result(
  143. playlist_entries,
  144. playlist_id=compat_str(playlist_info['id']),
  145. playlist_title=playlist_info['title'])
  146. def _talk_info(self, url, video_name):
  147. webpage = self._download_webpage(url, video_name)
  148. info = self._extract_info(webpage)
  149. talk_info = try_get(
  150. info, lambda x: x['__INITIAL_DATA__']['talks'][0],
  151. dict) or info['talks'][0]
  152. title = talk_info['title'].strip()
  153. external = talk_info.get('external')
  154. if external:
  155. service = external['service']
  156. self.to_screen('Found video from %s' % service)
  157. ext_url = None
  158. if service.lower() == 'youtube':
  159. ext_url = external.get('code')
  160. return {
  161. '_type': 'url',
  162. 'url': ext_url or external['uri'],
  163. }
  164. native_downloads = try_get(
  165. talk_info, lambda x: x['downloads']['nativeDownloads'],
  166. dict) or talk_info['nativeDownloads']
  167. formats = [{
  168. 'url': format_url,
  169. 'format_id': format_id,
  170. 'format': format_id,
  171. } for (format_id, format_url) in native_downloads.items() if format_url is not None]
  172. if formats:
  173. for f in formats:
  174. finfo = self._NATIVE_FORMATS.get(f['format_id'])
  175. if finfo:
  176. f.update(finfo)
  177. player_talk = talk_info['player_talks'][0]
  178. resources_ = player_talk.get('resources') or talk_info.get('resources')
  179. http_url = None
  180. for format_id, resources in resources_.items():
  181. if format_id == 'h264':
  182. for resource in resources:
  183. h264_url = resource.get('file')
  184. if not h264_url:
  185. continue
  186. bitrate = int_or_none(resource.get('bitrate'))
  187. formats.append({
  188. 'url': h264_url,
  189. 'format_id': '%s-%sk' % (format_id, bitrate),
  190. 'tbr': bitrate,
  191. })
  192. if re.search(r'\d+k', h264_url):
  193. http_url = h264_url
  194. elif format_id == 'rtmp':
  195. streamer = talk_info.get('streamer')
  196. if not streamer:
  197. continue
  198. for resource in resources:
  199. formats.append({
  200. 'format_id': '%s-%s' % (format_id, resource.get('name')),
  201. 'url': streamer,
  202. 'play_path': resource['file'],
  203. 'ext': 'flv',
  204. 'width': int_or_none(resource.get('width')),
  205. 'height': int_or_none(resource.get('height')),
  206. 'tbr': int_or_none(resource.get('bitrate')),
  207. })
  208. elif format_id == 'hls':
  209. formats.extend(self._extract_m3u8_formats(
  210. resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
  211. m3u8_formats = list(filter(
  212. lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
  213. formats))
  214. if http_url:
  215. for m3u8_format in m3u8_formats:
  216. bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
  217. if not bitrate:
  218. continue
  219. f = m3u8_format.copy()
  220. f.update({
  221. 'url': re.sub(r'\d+k', bitrate, http_url),
  222. 'format_id': m3u8_format['format_id'].replace('hls', 'http'),
  223. 'protocol': 'http',
  224. })
  225. formats.append(f)
  226. audio_download = talk_info.get('audioDownload')
  227. if audio_download:
  228. formats.append({
  229. 'url': audio_download,
  230. 'format_id': 'audio',
  231. 'vcodec': 'none',
  232. })
  233. self._sort_formats(formats)
  234. video_id = compat_str(talk_info['id'])
  235. return {
  236. 'id': video_id,
  237. 'title': title,
  238. 'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
  239. 'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
  240. 'description': self._og_search_description(webpage),
  241. 'subtitles': self._get_subtitles(video_id, talk_info),
  242. 'formats': formats,
  243. 'duration': talk_info.get('duration'),
  244. }
  245. def _get_subtitles(self, video_id, talk_info):
  246. sub_lang_list = {}
  247. for language in try_get(
  248. talk_info,
  249. (lambda x: x['downloads']['languages'],
  250. lambda x: x['languages']), list):
  251. lang_code = language.get('languageCode') or language.get('ianaCode')
  252. if not lang_code:
  253. continue
  254. sub_lang_list[lang_code] = [
  255. {
  256. 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
  257. 'ext': ext,
  258. }
  259. for ext in ['ted', 'srt']
  260. ]
  261. return sub_lang_list
  262. def _watch_info(self, url, name):
  263. webpage = self._download_webpage(url, name)
  264. config_json = self._html_search_regex(
  265. r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
  266. webpage, 'config', default=None)
  267. if not config_json:
  268. embed_url = self._search_regex(
  269. r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
  270. return self.url_result(self._proto_relative_url(embed_url))
  271. config = json.loads(config_json)['config']
  272. video_url = config['video']['url']
  273. thumbnail = config.get('image', {}).get('url')
  274. title = self._html_search_regex(
  275. r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
  276. description = self._html_search_regex(
  277. [
  278. r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
  279. r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
  280. ],
  281. webpage, 'description', fatal=False)
  282. return {
  283. 'id': name,
  284. 'url': video_url,
  285. 'title': title,
  286. 'thumbnail': thumbnail,
  287. 'description': description,
  288. }