# ted.py (12 KB)
  1. from __future__ import unicode_literals
  2. import json
  3. import re
  4. from .common import InfoExtractor
  5. from ..compat import compat_str
  6. from ..utils import (
  7. int_or_none,
  8. try_get,
  9. )
  class TEDIE(InfoExtractor):
      """Information extractor for ted.com talk, playlist, watch and embed URLs.

      ``_real_extract`` matches the URL against ``_VALID_URL`` and hands it to
      ``_talk_info``, ``_watch_info`` or ``_playlist_videos_info``.  URLs on an
      ``embed``/``embed-ssl`` host are returned as a URL result that points at
      the same path on the ``www`` host.
      """

      IE_NAME = 'ted'
      _VALID_URL = r'''(?x)
          (?P<proto>https?://)
          (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
          (
              (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
              |
              ((?P<type_talk>talks)) # We have a simple talk
              |
              (?P<type_watch>watch)/[^/]+/[^/]+
          )
          (/lang/(.*?))? # The url may contain the language
          /(?P<name>[\w-]+) # Here goes the name and then ".html"
          .*)$
          '''
      _TESTS = [{
          'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
          'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
          'info_dict': {
              'id': '102',
              'ext': 'mp4',
              'title': 'The illusion of consciousness',
              'description': ('Philosopher Dan Dennett makes a compelling '
                              'argument that not only don\'t we understand our own '
                              'consciousness, but that half the time our brains are '
                              'actively fooling us.'),
              'uploader': 'Dan Dennett',
              'width': 853,
              'duration': 1308,
          }
      }, {
          'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
          'md5': 'b899ac15e345fb39534d913f7606082b',
          'info_dict': {
              'id': 'tSVI8ta_P4w',
              'ext': 'mp4',
              'title': 'Vishal Sikka: The beauty and power of algorithms',
              'thumbnail': r're:^https?://.+\.jpg',
              'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
              'upload_date': '20140122',
              'uploader_id': 'TEDInstitute',
              'uploader': 'TED Institute',
          },
          'add_ie': ['Youtube'],
      }, {
          'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
          'md5': '71b3ab2f4233012dce09d515c9c39ce2',
          'info_dict': {
              'id': '1972',
              'ext': 'mp4',
              'title': 'Be passionate. Be courageous. Be your best.',
              'uploader': 'Gabby Giffords and Mark Kelly',
              'description': 'md5:5174aed4d0f16021b704120360f72b92',
              'duration': 1128,
          },
      }, {
          'url': 'http://www.ted.com/playlists/who_are_the_hackers',
          'info_dict': {
              'id': '10',
              'title': 'Who are the hackers?',
          },
          'playlist_mincount': 6,
      }, {
          # contains a youtube video
          'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
          'add_ie': ['Youtube'],
          'info_dict': {
              'id': '_ZG8HBuDjgc',
              'ext': 'webm',
              'title': 'Douglas Adams: Parrots the Universe and Everything',
              'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
              'uploader': 'University of California Television (UCTV)',
              'uploader_id': 'UCtelevision',
              'upload_date': '20080522',
          },
          'params': {
              'skip_download': True,
          },
      }, {
          # YouTube video
          'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
          'add_ie': ['Youtube'],
          'info_dict': {
              'id': 'aFBIPO-P7LM',
              'ext': 'mp4',
              'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
              'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
              'uploader': 'TEDx Talks',
              'uploader_id': 'TEDxTalks',
              'upload_date': '20111216',
          },
          'params': {
              'skip_download': True,
          },
      }, {
          # no nativeDownloads
          'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
          'info_dict': {
              'id': '1792',
              'ext': 'mp4',
              'title': 'The orchestra in my mouth',
              'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
              'uploader': 'Tom Thum',
          },
          'params': {
              'skip_download': True,
          },
      }]

      # Frame sizes merged into a native download format whose format id
      # matches one of these keys (see _talk_info).
      _NATIVE_FORMATS = {
          'low': {'width': 320, 'height': 180},
          'medium': {'width': 512, 'height': 288},
          'high': {'width': 854, 'height': 480},
      }

      def _extract_info(self, webpage):
          """Return the init data embedded in ``webpage`` as parsed JSON.

          The page is searched for a script call shaped like
          ``q("<name>.init", {...})`` directly followed by ``</script>``; the
          object literal passed as second argument is decoded with
          ``json.loads``.
          """
          info_json = self._search_regex(
              r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>',
              webpage, 'info json')
          return json.loads(info_json)

      def _real_extract(self, url):
          """Route ``url`` to the handler for its URL type.

          Embed hosts are answered with a URL result for the ``www`` host;
          ``talks`` URLs go to ``_talk_info``, ``watch`` URLs to
          ``_watch_info`` and everything else (playlists) to
          ``_playlist_videos_info``.
          """
          m = re.match(self._VALID_URL, url, re.VERBOSE)
          if m.group('type').startswith('embed'):
              # Same path, but on the www host instead of the embed host
              desktop_url = m.group('proto') + 'www' + m.group('urlmain')
              return self.url_result(desktop_url, 'TED')

          name = m.group('name')
          if m.group('type_talk'):
              return self._talk_info(url, name)
          if m.group('type_watch'):
              return self._watch_info(url, name)
          return self._playlist_videos_info(url, name)

      def _playlist_videos_info(self, url, name):
          """Return a playlist result with one URL entry per talk.

          Playlist metadata and the talk list are read from the page's init
          data, preferring the ``__INITIAL_DATA__`` sub-object and falling
          back to the top-level ``playlist`` / ``talks`` keys (a missing
          fallback key raises ``KeyError``).
          """
          webpage = self._download_webpage(
              url, name, 'Downloading playlist webpage')
          info = self._extract_info(webpage)

          playlist_info = try_get(
              info, lambda x: x['__INITIAL_DATA__']['playlist'],
              dict) or info['playlist']

          # Each talk is indexed by 'slug' below and _talk_info takes element
          # [0] of the same key, so the value is a list of talk dicts.  The
          # expected type therefore has to be list; with dict the nested
          # lookup could never yield a usable value.
          talks = try_get(
              info, lambda x: x['__INITIAL_DATA__']['talks'],
              list) or info['talks']

          playlist_entries = [
              self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
              for talk in talks
          ]

          return self.playlist_result(
              playlist_entries,
              playlist_id=compat_str(playlist_info['id']),
              playlist_title=playlist_info['title'])

      def _talk_info(self, url, video_name):
          """Extract a single talk page.

          Returns a ``{'_type': 'url', ...}`` dict when the talk data has an
          ``external`` entry (the video is hosted elsewhere).  Otherwise it
          returns an info dict with ``id``, ``title``, ``uploader``,
          ``thumbnail``, ``description``, ``subtitles``, ``formats`` and
          ``duration``.

          ``title``, ``id`` and ``player_talks`` are accessed by subscript, so
          a talk without them raises ``KeyError``/``IndexError``.
          """
          webpage = self._download_webpage(url, video_name)
          info = self._extract_info(webpage)

          talk_info = try_get(
              info, lambda x: x['__INITIAL_DATA__']['talks'][0],
              dict) or info['talks'][0]

          title = talk_info['title'].strip()

          external = talk_info.get('external')
          if external:
              service = external['service']
              self.to_screen('Found video from %s' % service)
              ext_url = None
              if service.lower() == 'youtube':
                  ext_url = external.get('code')
              return {
                  '_type': 'url',
                  'url': ext_url or external['uri'],
              }

          native_downloads = try_get(
              talk_info,
              (lambda x: x['downloads']['nativeDownloads'],
               lambda x: x['nativeDownloads']),
              dict) or {}

          formats = []
          for format_id, format_url in native_downloads.items():
              if format_url is None:
                  continue
              native_format = {
                  'url': format_url,
                  'format_id': format_id,
                  'format': format_id,
              }
              # Add the known frame size for this quality label, if any
              native_format.update(self._NATIVE_FORMATS.get(format_id) or {})
              formats.append(native_format)

          player_talk = talk_info['player_talks'][0]

          # Neither source is guaranteed to carry resources; treat a missing
          # value as "no extra formats" instead of failing on None.items().
          resources_ = (
              player_talk.get('resources')
              or talk_info.get('resources')
              or {})

          http_url = None
          for format_id, resources in resources_.items():
              if format_id == 'h264':
                  if not isinstance(resources, list):
                      continue
                  for resource in resources:
                      h264_url = resource.get('file')
                      if not h264_url:
                          continue
                      bitrate = int_or_none(resource.get('bitrate'))
                      formats.append({
                          'url': h264_url,
                          'format_id': '%s-%sk' % (format_id, bitrate),
                          'tbr': bitrate,
                      })
                      # Remember a URL carrying a "<digits>k" token; it is the
                      # template for the HTTP variants derived from HLS below.
                      if re.search(r'\d+k', h264_url):
                          http_url = h264_url
              elif format_id == 'rtmp':
                  streamer = talk_info.get('streamer')
                  if not streamer or not isinstance(resources, list):
                      continue
                  for resource in resources:
                      formats.append({
                          'format_id': '%s-%s' % (format_id, resource.get('name')),
                          'url': streamer,
                          'play_path': resource['file'],
                          'ext': 'flv',
                          'width': int_or_none(resource.get('width')),
                          'height': int_or_none(resource.get('height')),
                          'tbr': int_or_none(resource.get('bitrate')),
                      })
              elif format_id == 'hls':
                  if not isinstance(resources, dict):
                      continue
                  stream_url = resources.get('stream')
                  if not stream_url:
                      # Nothing to fetch; do not hand None to the downloader
                      continue
                  formats.extend(self._extract_m3u8_formats(
                      stream_url, video_name, 'mp4', m3u8_id=format_id, fatal=False))

          if http_url:
              # Snapshot the HLS video formats first: the loop appends to
              # `formats`, which must not be the sequence being iterated.
              m3u8_formats = [
                  f for f in formats
                  if f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none']
              for m3u8_format in m3u8_formats:
                  bitrate = self._search_regex(
                      r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
                  if not bitrate:
                      continue
                  # Clone the HLS format and swap in the direct URL with the
                  # bitrate token taken from the HLS variant URL.
                  http_format = m3u8_format.copy()
                  http_format.update({
                      'url': re.sub(r'\d+k', bitrate, http_url),
                      'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                      'protocol': 'http',
                  })
                  formats.append(http_format)

          audio_download = talk_info.get('audioDownload')
          if audio_download:
              formats.append({
                  'url': audio_download,
                  'format_id': 'audio',
                  'vcodec': 'none',
              })

          self._sort_formats(formats)

          video_id = compat_str(talk_info['id'])

          return {
              'id': video_id,
              'title': title,
              'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
              'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
              'description': self._og_search_description(webpage),
              'subtitles': self._get_subtitles(video_id, talk_info),
              'formats': formats,
              'duration': talk_info.get('duration'),
          }

      def _get_subtitles(self, video_id, talk_info):
          """Map each language code of the talk to its subtitle URLs.

          Languages are read from ``talk_info['downloads']['languages']`` or
          ``talk_info['languages']``.  Every language that has a
          ``languageCode`` or ``ianaCode`` gets two entries, one with the
          ``ted`` extension and one with ``srt``.  Returns an empty dict when
          the talk lists no languages.
          """
          # try_get can come back empty (the other call sites guard with
          # `or ...` as well); iterate over nothing in that case.
          languages = try_get(
              talk_info,
              (lambda x: x['downloads']['languages'],
               lambda x: x['languages']), list) or []

          sub_lang_list = {}
          for language in languages:
              lang_code = language.get('languageCode') or language.get('ianaCode')
              if not lang_code:
                  continue
              sub_lang_list[lang_code] = [
                  {
                      'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                      'ext': ext,
                  }
                  for ext in ['ted', 'srt']
              ]
          return sub_lang_list

      def _watch_info(self, url, name):
          """Extract a video from a ``/watch/`` page.

          If the page has no ``pages.jwplayer`` config, the ``src`` of the
          ``pages-video-embed__video__object`` iframe is returned as a URL
          result instead.  Otherwise the result carries the config's video
          URL, the optional image URL as thumbnail, and title/description
          scraped from the page; ``name`` is used as the id.
          """
          webpage = self._download_webpage(url, name)

          config_json = self._html_search_regex(
              r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
              webpage, 'config', default=None)
          if not config_json:
              embed_url = self._search_regex(
                  r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
              return self.url_result(self._proto_relative_url(embed_url))

          config = json.loads(config_json)['config']
          video_url = config['video']['url']
          thumbnail = config.get('image', {}).get('url')

          title = self._html_search_regex(
              r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
          description = self._html_search_regex(
              [
                  r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                  r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
              ],
              webpage, 'description', fatal=False)

          return {
              'id': name,
              'url': video_url,
              'title': title,
              'thumbnail': thumbnail,
              'description': description,
          }