2
0

ted.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. from __future__ import unicode_literals
  2. import json
  3. import re
  4. from .common import InfoExtractor
  5. from ..compat import compat_str
  6. from ..utils import (
  7. float_or_none,
  8. int_or_none,
  9. try_get,
  10. url_or_none,
  11. )
  12. class TEDIE(InfoExtractor):
  13. IE_NAME = 'ted'
  14. _VALID_URL = r'''(?x)
  15. (?P<proto>https?://)
  16. (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
  17. (
  18. (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
  19. |
  20. ((?P<type_talk>talks)) # We have a simple talk
  21. |
  22. (?P<type_watch>watch)/[^/]+/[^/]+
  23. )
  24. (/lang/(.*?))? # The url may contain the language
  25. /(?P<name>[\w-]+) # Here goes the name and then ".html"
  26. .*)$
  27. '''
  28. _TESTS = [{
  29. 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
  30. 'md5': 'b0ce2b05ca215042124fbc9e3886493a',
  31. 'info_dict': {
  32. 'id': '102',
  33. 'ext': 'mp4',
  34. 'title': 'The illusion of consciousness',
  35. 'description': ('Philosopher Dan Dennett makes a compelling '
  36. 'argument that not only don\'t we understand our own '
  37. 'consciousness, but that half the time our brains are '
  38. 'actively fooling us.'),
  39. 'uploader': 'Dan Dennett',
  40. 'width': 853,
  41. 'duration': 1308,
  42. 'view_count': int,
  43. 'comment_count': int,
  44. 'tags': list,
  45. },
  46. 'params': {
  47. 'skip_download': True,
  48. },
  49. }, {
  50. # missing HTTP bitrates
  51. 'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
  52. 'info_dict': {
  53. 'id': '6069',
  54. 'ext': 'mp4',
  55. 'title': 'The beauty and power of algorithms',
  56. 'thumbnail': r're:^https?://.+\.jpg',
  57. 'description': 'md5:734e352710fb00d840ab87ae31aaf688',
  58. 'uploader': 'Vishal Sikka',
  59. },
  60. 'params': {
  61. 'skip_download': True,
  62. },
  63. }, {
  64. 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
  65. 'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
  66. 'info_dict': {
  67. 'id': '1972',
  68. 'ext': 'mp4',
  69. 'title': 'Be passionate. Be courageous. Be your best.',
  70. 'uploader': 'Gabby Giffords and Mark Kelly',
  71. 'description': 'md5:5174aed4d0f16021b704120360f72b92',
  72. 'duration': 1128,
  73. },
  74. 'params': {
  75. 'skip_download': True,
  76. },
  77. }, {
  78. 'url': 'http://www.ted.com/playlists/who_are_the_hackers',
  79. 'info_dict': {
  80. 'id': '10',
  81. 'title': 'Who are the hackers?',
  82. },
  83. 'playlist_mincount': 6,
  84. }, {
  85. # contains a youtube video
  86. 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
  87. 'add_ie': ['Youtube'],
  88. 'info_dict': {
  89. 'id': '_ZG8HBuDjgc',
  90. 'ext': 'webm',
  91. 'title': 'Douglas Adams: Parrots the Universe and Everything',
  92. 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
  93. 'uploader': 'University of California Television (UCTV)',
  94. 'uploader_id': 'UCtelevision',
  95. 'upload_date': '20080522',
  96. },
  97. 'params': {
  98. 'skip_download': True,
  99. },
  100. }, {
  101. # no nativeDownloads
  102. 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
  103. 'info_dict': {
  104. 'id': '1792',
  105. 'ext': 'mp4',
  106. 'title': 'The orchestra in my mouth',
  107. 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
  108. 'uploader': 'Tom Thum',
  109. 'view_count': int,
  110. 'comment_count': int,
  111. 'tags': list,
  112. },
  113. 'params': {
  114. 'skip_download': True,
  115. },
  116. }]
  117. _NATIVE_FORMATS = {
  118. 'low': {'width': 320, 'height': 180},
  119. 'medium': {'width': 512, 'height': 288},
  120. 'high': {'width': 854, 'height': 480},
  121. }
  122. def _extract_info(self, webpage):
  123. info_json = self._search_regex(
  124. r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>',
  125. webpage, 'info json')
  126. return json.loads(info_json)
  127. def _real_extract(self, url):
  128. m = re.match(self._VALID_URL, url, re.VERBOSE)
  129. if m.group('type').startswith('embed'):
  130. desktop_url = m.group('proto') + 'www' + m.group('urlmain')
  131. return self.url_result(desktop_url, 'TED')
  132. name = m.group('name')
  133. if m.group('type_talk'):
  134. return self._talk_info(url, name)
  135. elif m.group('type_watch'):
  136. return self._watch_info(url, name)
  137. else:
  138. return self._playlist_videos_info(url, name)
  139. def _playlist_videos_info(self, url, name):
  140. '''Returns the videos of the playlist'''
  141. webpage = self._download_webpage(url, name,
  142. 'Downloading playlist webpage')
  143. info = self._extract_info(webpage)
  144. playlist_info = try_get(
  145. info, lambda x: x['__INITIAL_DATA__']['playlist'],
  146. dict) or info['playlist']
  147. playlist_entries = [
  148. self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
  149. for talk in try_get(
  150. info, lambda x: x['__INITIAL_DATA__']['talks'],
  151. dict) or info['talks']
  152. ]
  153. return self.playlist_result(
  154. playlist_entries,
  155. playlist_id=compat_str(playlist_info['id']),
  156. playlist_title=playlist_info['title'])
  157. def _talk_info(self, url, video_name):
  158. webpage = self._download_webpage(url, video_name)
  159. info = self._extract_info(webpage)
  160. data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
  161. talk_info = data['talks'][0]
  162. title = talk_info['title'].strip()
  163. native_downloads = try_get(
  164. talk_info,
  165. (lambda x: x['downloads']['nativeDownloads'],
  166. lambda x: x['nativeDownloads']),
  167. dict) or {}
  168. formats = [{
  169. 'url': format_url,
  170. 'format_id': format_id,
  171. 'format': format_id,
  172. } for (format_id, format_url) in native_downloads.items() if format_url is not None]
  173. if formats:
  174. for f in formats:
  175. finfo = self._NATIVE_FORMATS.get(f['format_id'])
  176. if finfo:
  177. f.update(finfo)
  178. player_talk = talk_info['player_talks'][0]
  179. external = player_talk.get('external')
  180. if isinstance(external, dict):
  181. service = external.get('service')
  182. if isinstance(service, compat_str):
  183. ext_url = None
  184. if service.lower() == 'youtube':
  185. ext_url = external.get('code')
  186. return self.url_result(ext_url or external['uri'])
  187. resources_ = player_talk.get('resources') or talk_info.get('resources')
  188. http_url = None
  189. for format_id, resources in resources_.items():
  190. if format_id == 'h264':
  191. for resource in resources:
  192. h264_url = resource.get('file')
  193. if not h264_url:
  194. continue
  195. bitrate = int_or_none(resource.get('bitrate'))
  196. formats.append({
  197. 'url': h264_url,
  198. 'format_id': '%s-%sk' % (format_id, bitrate),
  199. 'tbr': bitrate,
  200. })
  201. if re.search(r'\d+k', h264_url):
  202. http_url = h264_url
  203. elif format_id == 'rtmp':
  204. streamer = talk_info.get('streamer')
  205. if not streamer:
  206. continue
  207. for resource in resources:
  208. formats.append({
  209. 'format_id': '%s-%s' % (format_id, resource.get('name')),
  210. 'url': streamer,
  211. 'play_path': resource['file'],
  212. 'ext': 'flv',
  213. 'width': int_or_none(resource.get('width')),
  214. 'height': int_or_none(resource.get('height')),
  215. 'tbr': int_or_none(resource.get('bitrate')),
  216. })
  217. elif format_id == 'hls':
  218. if not isinstance(resources, dict):
  219. continue
  220. stream_url = url_or_none(resources.get('stream'))
  221. if not stream_url:
  222. continue
  223. formats.extend(self._extract_m3u8_formats(
  224. stream_url, video_name, 'mp4', m3u8_id=format_id,
  225. fatal=False))
  226. m3u8_formats = list(filter(
  227. lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
  228. formats))
  229. if http_url:
  230. for m3u8_format in m3u8_formats:
  231. bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
  232. if not bitrate:
  233. continue
  234. bitrate_url = re.sub(r'\d+k', bitrate, http_url)
  235. if not self._is_valid_url(
  236. bitrate_url, video_name, '%s bitrate' % bitrate):
  237. continue
  238. f = m3u8_format.copy()
  239. f.update({
  240. 'url': bitrate_url,
  241. 'format_id': m3u8_format['format_id'].replace('hls', 'http'),
  242. 'protocol': 'http',
  243. })
  244. if f.get('acodec') == 'none':
  245. del f['acodec']
  246. formats.append(f)
  247. audio_download = talk_info.get('audioDownload')
  248. if audio_download:
  249. formats.append({
  250. 'url': audio_download,
  251. 'format_id': 'audio',
  252. 'vcodec': 'none',
  253. })
  254. self._sort_formats(formats)
  255. video_id = compat_str(talk_info['id'])
  256. return {
  257. 'id': video_id,
  258. 'title': title,
  259. 'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
  260. 'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
  261. 'description': self._og_search_description(webpage),
  262. 'subtitles': self._get_subtitles(video_id, talk_info),
  263. 'formats': formats,
  264. 'duration': float_or_none(talk_info.get('duration')),
  265. 'view_count': int_or_none(data.get('viewed_count')),
  266. 'comment_count': int_or_none(
  267. try_get(data, lambda x: x['comments']['count'])),
  268. 'tags': try_get(talk_info, lambda x: x['tags'], list),
  269. }
  270. def _get_subtitles(self, video_id, talk_info):
  271. sub_lang_list = {}
  272. for language in try_get(
  273. talk_info,
  274. (lambda x: x['downloads']['languages'],
  275. lambda x: x['languages']), list):
  276. lang_code = language.get('languageCode') or language.get('ianaCode')
  277. if not lang_code:
  278. continue
  279. sub_lang_list[lang_code] = [
  280. {
  281. 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
  282. 'ext': ext,
  283. }
  284. for ext in ['ted', 'srt']
  285. ]
  286. return sub_lang_list
  287. def _watch_info(self, url, name):
  288. webpage = self._download_webpage(url, name)
  289. config_json = self._html_search_regex(
  290. r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
  291. webpage, 'config', default=None)
  292. if not config_json:
  293. embed_url = self._search_regex(
  294. r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
  295. return self.url_result(self._proto_relative_url(embed_url))
  296. config = json.loads(config_json)['config']
  297. video_url = config['video']['url']
  298. thumbnail = config.get('image', {}).get('url')
  299. title = self._html_search_regex(
  300. r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
  301. description = self._html_search_regex(
  302. [
  303. r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
  304. r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
  305. ],
  306. webpage, 'description', fatal=False)
  307. return {
  308. 'id': name,
  309. 'url': video_url,
  310. 'title': title,
  311. 'thumbnail': thumbnail,
  312. 'description': description,
  313. }