ted.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. from __future__ import unicode_literals
  2. import json
  3. import re
  4. from .common import InfoExtractor
  5. from ..compat import compat_str
  6. from ..utils import int_or_none
  class TEDIE(InfoExtractor):
      """Extractor for ted.com talk, playlist and "watch" pages.

      ``_real_extract`` dispatches on the groups captured by ``_VALID_URL``:
      embed hosts are redirected to the equivalent ``www`` URL, ``talks`` URLs
      go to ``_talk_info``, ``watch`` URLs go to ``_watch_info`` and
      everything else (playlists) goes to ``_playlist_videos_info``.
      """

      IE_NAME = 'ted'
      _VALID_URL = r'''(?x)
          (?P<proto>https?://)
          (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
          (
              (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
              |
              ((?P<type_talk>talks)) # We have a simple talk
              |
              (?P<type_watch>watch)/[^/]+/[^/]+
          )
          (/lang/(.*?))? # The url may contain the language
          /(?P<name>[\w-]+) # Here goes the name and then ".html"
          .*)$
          '''
      _TESTS = [{
          'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
          'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
          'info_dict': {
              'id': '102',
              'ext': 'mp4',
              'title': 'The illusion of consciousness',
              'description': ('Philosopher Dan Dennett makes a compelling '
                              'argument that not only don\'t we understand our own '
                              'consciousness, but that half the time our brains are '
                              'actively fooling us.'),
              'uploader': 'Dan Dennett',
              'width': 853,
              'duration': 1308,
          }
      }, {
          'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
          'md5': 'b899ac15e345fb39534d913f7606082b',
          'info_dict': {
              'id': 'tSVI8ta_P4w',
              'ext': 'mp4',
              'title': 'Vishal Sikka: The beauty and power of algorithms',
              'thumbnail': r're:^https?://.+\.jpg',
              'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
              'upload_date': '20140122',
              'uploader_id': 'TEDInstitute',
              'uploader': 'TED Institute',
          },
          'add_ie': ['Youtube'],
      }, {
          'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
          'md5': '71b3ab2f4233012dce09d515c9c39ce2',
          'info_dict': {
              'id': '1972',
              'ext': 'mp4',
              'title': 'Be passionate. Be courageous. Be your best.',
              'uploader': 'Gabby Giffords and Mark Kelly',
              'description': 'md5:5174aed4d0f16021b704120360f72b92',
              'duration': 1128,
          },
      }, {
          'url': 'http://www.ted.com/playlists/who_are_the_hackers',
          'info_dict': {
              'id': '10',
              'title': 'Who are the hackers?',
          },
          'playlist_mincount': 6,
      }, {
          # contains a youtube video
          'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
          'add_ie': ['Youtube'],
          'info_dict': {
              'id': '_ZG8HBuDjgc',
              'ext': 'webm',
              'title': 'Douglas Adams: Parrots the Universe and Everything',
              'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
              'uploader': 'University of California Television (UCTV)',
              'uploader_id': 'UCtelevision',
              'upload_date': '20080522',
          },
          'params': {
              'skip_download': True,
          },
      }, {
          # YouTube video
          'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
          'add_ie': ['Youtube'],
          'info_dict': {
              'id': 'aFBIPO-P7LM',
              'ext': 'mp4',
              'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
              'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
              'uploader': 'TEDx Talks',
              'uploader_id': 'TEDxTalks',
              'upload_date': '20111216',
          },
          'params': {
              'skip_download': True,
          },
      }]

      # Frame sizes merged into the formats built from
      # talk_info['nativeDownloads'], keyed by that mapping's format id.
      _NATIVE_FORMATS = {
          'low': {'width': 320, 'height': 180},
          'medium': {'width': 512, 'height': 288},
          'high': {'width': 854, 'height': 480},
      }

      def _extract_info(self, webpage):
          """Return the decoded JSON argument of the page's ``q("<x>.init", {...})`` call.

          The object literal is captured from *webpage* with a regex and
          decoded with ``json.loads``.
          """
          info_json = self._search_regex(
              r'q\("\w+.init",({.+})\)</script>',
              webpage, 'info json')
          return json.loads(info_json)

      def _real_extract(self, url):
          """Dispatch *url* to the talk, watch or playlist handler."""
          mobj = re.match(self._VALID_URL, url, re.VERBOSE)

          if mobj.group('type').startswith('embed'):
              # Embed hosts are not scraped directly: rebuild the URL on the
              # www host and hand it back as a 'TED' url result.
              desktop_url = mobj.group('proto') + 'www' + mobj.group('urlmain')
              return self.url_result(desktop_url, 'TED')

          name = mobj.group('name')
          if mobj.group('type_talk'):
              return self._talk_info(url, name)
          if mobj.group('type_watch'):
              return self._watch_info(url, name)
          return self._playlist_videos_info(url, name)

      def _playlist_videos_info(self, url, name):
          '''Returns the videos of the playlist'''
          webpage = self._download_webpage(
              url, name, 'Downloading playlist webpage')
          info = self._extract_info(webpage)
          playlist_info = info['playlist']

          # Every talk is returned as a URL result for this same extractor.
          playlist_entries = [
              self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
              for talk in info['talks']
          ]
          return self.playlist_result(
              playlist_entries,
              playlist_id=compat_str(playlist_info['id']),
              playlist_title=playlist_info['title'])

      def _extract_talk_formats(self, talk_info, video_name):
          """Build the (unsorted) format list for a natively hosted talk.

          Formats come from four places, in this order: the
          ``nativeDownloads`` mapping, the ``resources`` mapping (``h264``,
          ``rtmp`` and ``hls`` entries), HTTP variants derived from the HLS
          formats, and the optional ``audioDownload`` URL.  Missing or null
          ``nativeDownloads``/``resources`` are treated as empty instead of
          raising.
          """
          formats = []

          for format_id, format_url in (talk_info.get('nativeDownloads') or {}).items():
              if format_url is None:
                  continue
              native_format = {
                  'url': format_url,
                  'format_id': format_id,
                  'format': format_id,
              }
              # Attach the known frame size when the id is one we recognise.
              native_format.update(self._NATIVE_FORMATS.get(format_id) or {})
              formats.append(native_format)

          # Last h264 URL that contains a "<digits>k" token; used below as the
          # template for deriving HTTP URLs from the HLS variants.
          http_url = None
          for format_id, resources in (talk_info.get('resources') or {}).items():
              if not resources:
                  continue
              if format_id == 'h264':
                  for resource in resources:
                      h264_url = resource.get('file')
                      if not h264_url:
                          continue
                      bitrate = int_or_none(resource.get('bitrate'))
                      formats.append({
                          'url': h264_url,
                          # Avoid ids like "h264-Nonek" when no bitrate is known.
                          'format_id': (
                              format_id if bitrate is None
                              else '%s-%sk' % (format_id, bitrate)),
                          'tbr': bitrate,
                      })
                      if re.search(r'\d+k', h264_url):
                          http_url = h264_url
              elif format_id == 'rtmp':
                  streamer = talk_info.get('streamer')
                  if not streamer:
                      continue
                  for resource in resources:
                      formats.append({
                          'format_id': '%s-%s' % (format_id, resource.get('name')),
                          'url': streamer,
                          'play_path': resource['file'],
                          'ext': 'flv',
                          'width': int_or_none(resource.get('width')),
                          'height': int_or_none(resource.get('height')),
                          'tbr': int_or_none(resource.get('bitrate')),
                      })
              elif format_id == 'hls':
                  stream_url = resources.get('stream')
                  if not stream_url:
                      # Nothing to download a manifest from.
                      continue
                  formats.extend(self._extract_m3u8_formats(
                      stream_url, video_name, 'mp4', m3u8_id=format_id, fatal=False))

          if http_url:
              # Snapshot the HLS video formats first: the loop appends to
              # ``formats`` while iterating over this list.
              m3u8_formats = [
                  fmt for fmt in formats
                  if fmt.get('protocol') == 'm3u8' and fmt.get('vcodec') != 'none']
              for m3u8_format in m3u8_formats:
                  bitrate = self._search_regex(
                      r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
                  if not bitrate:
                      continue
                  # Same metadata as the HLS variant, but fetched over plain
                  # HTTP by swapping the bitrate token into the h264 URL.
                  http_format = m3u8_format.copy()
                  http_format.update({
                      'url': re.sub(r'\d+k', bitrate, http_url),
                      'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                      'protocol': 'http',
                  })
                  formats.append(http_format)

          audio_download = talk_info.get('audioDownload')
          if audio_download:
              formats.append({
                  'url': audio_download,
                  'format_id': 'audio',
                  'vcodec': 'none',
              })

          return formats

      def _talk_info(self, url, video_name):
          """Extract a single talk page.

          Returns a ``url``-type result when the talk is hosted externally
          (the ``code`` field is preferred for YouTube, the ``uri`` field
          otherwise); for natively hosted talks returns the info dict with
          formats, subtitles and metadata.
          """
          webpage = self._download_webpage(url, video_name)
          self.report_extraction(video_name)

          talk_info = self._extract_info(webpage)['talks'][0]

          external = talk_info.get('external')
          if external:
              service = external['service']
              self.to_screen('Found video from %s' % service)
              ext_url = None
              if service.lower() == 'youtube':
                  ext_url = external.get('code')
              return {
                  '_type': 'url',
                  'url': ext_url or external['uri'],
              }

          formats = self._extract_talk_formats(talk_info, video_name)
          self._sort_formats(formats)

          video_id = compat_str(talk_info['id'])

          # The thumbnail is optional metadata; normalise it to an absolute
          # URL when present.
          thumbnail = talk_info.get('thumb') or None
          if thumbnail:
              if thumbnail.startswith('//'):
                  # Protocol-relative URL: only the scheme is missing.
                  thumbnail = 'http:' + thumbnail
              elif not thumbnail.startswith('http'):
                  thumbnail = 'http://' + thumbnail

          return {
              'id': video_id,
              'title': talk_info['title'].strip(),
              'uploader': talk_info.get('speaker'),
              'thumbnail': thumbnail,
              'description': self._og_search_description(webpage),
              'subtitles': self._get_subtitles(video_id, talk_info),
              'formats': formats,
              'duration': talk_info.get('duration'),
          }

      def _get_subtitles(self, video_id, talk_info):
          """Return ``{language code: [subtitle dicts]}`` for a talk.

          Each language listed in ``talk_info['languages']`` gets one entry
          per subtitle format (``ted`` and ``srt``).  A missing or null
          language list yields an empty dict; entries without a
          ``languageCode`` are skipped.
          """
          sub_lang_list = {}
          for language in talk_info.get('languages') or []:
              lang_code = language.get('languageCode')
              if not lang_code:
                  continue
              sub_lang_list[lang_code] = [
                  {
                      'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
                      'ext': ext,
                  }
                  for ext in ['ted', 'srt']
              ]
          return sub_lang_list

      def _watch_info(self, url, name):
          """Extract a ``/watch/`` page.

          When the page carries a ``pages.jwplayer`` config, the direct video
          URL, thumbnail, title and description are returned.  Otherwise the
          embedded iframe's ``src`` is returned as a URL result.
          """
          webpage = self._download_webpage(url, name)

          config_json = self._html_search_regex(
              r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
              webpage, 'config', default=None)
          if not config_json:
              embed_url = self._search_regex(
                  r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
              return self.url_result(self._proto_relative_url(embed_url))

          config = json.loads(config_json)['config']
          video_url = config['video']['url']
          # 'image' may be absent or null; either way there is no thumbnail.
          thumbnail = (config.get('image') or {}).get('url')

          title = self._html_search_regex(
              r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
          description = self._html_search_regex(
              [
                  r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
                  r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
              ],
              webpage, 'description', fatal=False)

          return {
              'id': name,
              'url': video_url,
              'title': title,
              'thumbnail': thumbnail,
              'description': description,
          }