ted.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351
  1. from __future__ import unicode_literals
  2. import json
  3. import re
  4. from .common import InfoExtractor
  5. from ..compat import (
  6. compat_str,
  7. compat_urlparse
  8. )
  9. from ..utils import (
  10. extract_attributes,
  11. float_or_none,
  12. int_or_none,
  13. try_get,
  14. url_or_none,
  15. )
  16. class TEDIE(InfoExtractor):
  17. IE_NAME = 'ted'
  18. _VALID_URL = r'''(?x)
  19. (?P<proto>https?://)
  20. (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
  21. (
  22. (?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist
  23. |
  24. ((?P<type_talk>talks)) # We have a simple talk
  25. |
  26. (?P<type_watch>watch)/[^/]+/[^/]+
  27. )
  28. (/lang/(.*?))? # The url may contain the language
  29. /(?P<name>[\w-]+) # Here goes the name and then ".html"
  30. .*)$
  31. '''
  32. _TESTS = [{
  33. 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
  34. 'md5': 'b0ce2b05ca215042124fbc9e3886493a',
  35. 'info_dict': {
  36. 'id': '102',
  37. 'ext': 'mp4',
  38. 'title': 'The illusion of consciousness',
  39. 'description': ('Philosopher Dan Dennett makes a compelling '
  40. 'argument that not only don\'t we understand our own '
  41. 'consciousness, but that half the time our brains are '
  42. 'actively fooling us.'),
  43. 'uploader': 'Dan Dennett',
  44. 'width': 853,
  45. 'duration': 1308,
  46. 'view_count': int,
  47. 'comment_count': int,
  48. 'tags': list,
  49. },
  50. 'params': {
  51. 'skip_download': True,
  52. },
  53. }, {
  54. # missing HTTP bitrates
  55. 'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
  56. 'info_dict': {
  57. 'id': '6069',
  58. 'ext': 'mp4',
  59. 'title': 'The beauty and power of algorithms',
  60. 'thumbnail': r're:^https?://.+\.jpg',
  61. 'description': 'md5:734e352710fb00d840ab87ae31aaf688',
  62. 'uploader': 'Vishal Sikka',
  63. },
  64. 'params': {
  65. 'skip_download': True,
  66. },
  67. }, {
  68. 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
  69. 'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
  70. 'info_dict': {
  71. 'id': '1972',
  72. 'ext': 'mp4',
  73. 'title': 'Be passionate. Be courageous. Be your best.',
  74. 'uploader': 'Gabby Giffords and Mark Kelly',
  75. 'description': 'md5:5174aed4d0f16021b704120360f72b92',
  76. 'duration': 1128,
  77. },
  78. 'params': {
  79. 'skip_download': True,
  80. },
  81. }, {
  82. 'url': 'http://www.ted.com/playlists/who_are_the_hackers',
  83. 'info_dict': {
  84. 'id': '10',
  85. 'title': 'Who are the hackers?',
  86. 'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a'
  87. },
  88. 'playlist_mincount': 6,
  89. }, {
  90. # contains a youtube video
  91. 'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
  92. 'add_ie': ['Youtube'],
  93. 'info_dict': {
  94. 'id': '_ZG8HBuDjgc',
  95. 'ext': 'webm',
  96. 'title': 'Douglas Adams: Parrots the Universe and Everything',
  97. 'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
  98. 'uploader': 'University of California Television (UCTV)',
  99. 'uploader_id': 'UCtelevision',
  100. 'upload_date': '20080522',
  101. },
  102. 'params': {
  103. 'skip_download': True,
  104. },
  105. }, {
  106. # no nativeDownloads
  107. 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
  108. 'info_dict': {
  109. 'id': '1792',
  110. 'ext': 'mp4',
  111. 'title': 'The orchestra in my mouth',
  112. 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
  113. 'uploader': 'Tom Thum',
  114. 'view_count': int,
  115. 'comment_count': int,
  116. 'tags': list,
  117. },
  118. 'params': {
  119. 'skip_download': True,
  120. },
  121. }]
  122. _NATIVE_FORMATS = {
  123. 'low': {'width': 320, 'height': 180},
  124. 'medium': {'width': 512, 'height': 288},
  125. 'high': {'width': 854, 'height': 480},
  126. }
  127. def _extract_info(self, webpage):
  128. info_json = self._search_regex(
  129. r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>',
  130. webpage, 'info json')
  131. return json.loads(info_json)
  132. def _real_extract(self, url):
  133. m = re.match(self._VALID_URL, url, re.VERBOSE)
  134. if m.group('type').startswith('embed'):
  135. desktop_url = m.group('proto') + 'www' + m.group('urlmain')
  136. return self.url_result(desktop_url, 'TED')
  137. name = m.group('name')
  138. if m.group('type_talk'):
  139. return self._talk_info(url, name)
  140. elif m.group('type_watch'):
  141. return self._watch_info(url, name)
  142. else:
  143. return self._playlist_videos_info(url, name)
  144. def _playlist_videos_info(self, url, name):
  145. '''Returns the videos of the playlist'''
  146. webpage = self._download_webpage(url, name,
  147. 'Downloading playlist webpage')
  148. playlist_entries = []
  149. for entry in re.findall(r'(?s)<[^>]+data-ga-context=["\']playlist["\'][^>]*>', webpage):
  150. attrs = extract_attributes(entry)
  151. entry_url = compat_urlparse.urljoin(url, attrs['href'])
  152. playlist_entries.append(self.url_result(entry_url, self.ie_key()))
  153. final_url = self._og_search_url(webpage, fatal=False)
  154. playlist_id = (
  155. re.match(self._VALID_URL, final_url).group('playlist_id')
  156. if final_url else None)
  157. return self.playlist_result(
  158. playlist_entries, playlist_id=playlist_id,
  159. playlist_title=self._og_search_title(webpage, fatal=False),
  160. playlist_description=self._og_search_description(webpage))
  161. def _talk_info(self, url, video_name):
  162. webpage = self._download_webpage(url, video_name)
  163. info = self._extract_info(webpage)
  164. data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
  165. talk_info = data['talks'][0]
  166. title = talk_info['title'].strip()
  167. native_downloads = try_get(
  168. talk_info,
  169. (lambda x: x['downloads']['nativeDownloads'],
  170. lambda x: x['nativeDownloads']),
  171. dict) or {}
  172. formats = [{
  173. 'url': format_url,
  174. 'format_id': format_id,
  175. 'format': format_id,
  176. } for (format_id, format_url) in native_downloads.items() if format_url is not None]
  177. if formats:
  178. for f in formats:
  179. finfo = self._NATIVE_FORMATS.get(f['format_id'])
  180. if finfo:
  181. f.update(finfo)
  182. player_talk = talk_info['player_talks'][0]
  183. external = player_talk.get('external')
  184. if isinstance(external, dict):
  185. service = external.get('service')
  186. if isinstance(service, compat_str):
  187. ext_url = None
  188. if service.lower() == 'youtube':
  189. ext_url = external.get('code')
  190. return self.url_result(ext_url or external['uri'])
  191. resources_ = player_talk.get('resources') or talk_info.get('resources')
  192. http_url = None
  193. for format_id, resources in resources_.items():
  194. if format_id == 'h264':
  195. for resource in resources:
  196. h264_url = resource.get('file')
  197. if not h264_url:
  198. continue
  199. bitrate = int_or_none(resource.get('bitrate'))
  200. formats.append({
  201. 'url': h264_url,
  202. 'format_id': '%s-%sk' % (format_id, bitrate),
  203. 'tbr': bitrate,
  204. })
  205. if re.search(r'\d+k', h264_url):
  206. http_url = h264_url
  207. elif format_id == 'rtmp':
  208. streamer = talk_info.get('streamer')
  209. if not streamer:
  210. continue
  211. for resource in resources:
  212. formats.append({
  213. 'format_id': '%s-%s' % (format_id, resource.get('name')),
  214. 'url': streamer,
  215. 'play_path': resource['file'],
  216. 'ext': 'flv',
  217. 'width': int_or_none(resource.get('width')),
  218. 'height': int_or_none(resource.get('height')),
  219. 'tbr': int_or_none(resource.get('bitrate')),
  220. })
  221. elif format_id == 'hls':
  222. if not isinstance(resources, dict):
  223. continue
  224. stream_url = url_or_none(resources.get('stream'))
  225. if not stream_url:
  226. continue
  227. formats.extend(self._extract_m3u8_formats(
  228. stream_url, video_name, 'mp4', m3u8_id=format_id,
  229. fatal=False))
  230. m3u8_formats = list(filter(
  231. lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
  232. formats))
  233. if http_url:
  234. for m3u8_format in m3u8_formats:
  235. bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
  236. if not bitrate:
  237. continue
  238. bitrate_url = re.sub(r'\d+k', bitrate, http_url)
  239. if not self._is_valid_url(
  240. bitrate_url, video_name, '%s bitrate' % bitrate):
  241. continue
  242. f = m3u8_format.copy()
  243. f.update({
  244. 'url': bitrate_url,
  245. 'format_id': m3u8_format['format_id'].replace('hls', 'http'),
  246. 'protocol': 'http',
  247. })
  248. if f.get('acodec') == 'none':
  249. del f['acodec']
  250. formats.append(f)
  251. audio_download = talk_info.get('audioDownload')
  252. if audio_download:
  253. formats.append({
  254. 'url': audio_download,
  255. 'format_id': 'audio',
  256. 'vcodec': 'none',
  257. })
  258. self._sort_formats(formats)
  259. video_id = compat_str(talk_info['id'])
  260. return {
  261. 'id': video_id,
  262. 'title': title,
  263. 'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
  264. 'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
  265. 'description': self._og_search_description(webpage),
  266. 'subtitles': self._get_subtitles(video_id, talk_info),
  267. 'formats': formats,
  268. 'duration': float_or_none(talk_info.get('duration')),
  269. 'view_count': int_or_none(data.get('viewed_count')),
  270. 'comment_count': int_or_none(
  271. try_get(data, lambda x: x['comments']['count'])),
  272. 'tags': try_get(talk_info, lambda x: x['tags'], list),
  273. }
  274. def _get_subtitles(self, video_id, talk_info):
  275. sub_lang_list = {}
  276. for language in try_get(
  277. talk_info,
  278. (lambda x: x['downloads']['languages'],
  279. lambda x: x['languages']), list):
  280. lang_code = language.get('languageCode') or language.get('ianaCode')
  281. if not lang_code:
  282. continue
  283. sub_lang_list[lang_code] = [
  284. {
  285. 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
  286. 'ext': ext,
  287. }
  288. for ext in ['ted', 'srt']
  289. ]
  290. return sub_lang_list
  291. def _watch_info(self, url, name):
  292. webpage = self._download_webpage(url, name)
  293. config_json = self._html_search_regex(
  294. r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
  295. webpage, 'config', default=None)
  296. if not config_json:
  297. embed_url = self._search_regex(
  298. r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
  299. return self.url_result(self._proto_relative_url(embed_url))
  300. config = json.loads(config_json)['config']
  301. video_url = config['video']['url']
  302. thumbnail = config.get('image', {}).get('url')
  303. title = self._html_search_regex(
  304. r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
  305. description = self._html_search_regex(
  306. [
  307. r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
  308. r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
  309. ],
  310. webpage, 'description', fatal=False)
  311. return {
  312. 'id': name,
  313. 'url': video_url,
  314. 'title': title,
  315. 'thumbnail': thumbnail,
  316. 'description': description,
  317. }