# channel9.py -- youtube-dl extractor for Channel 9 (channel9.msdn.com)
  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. ExtractorError,
  6. unescapeHTML,
  7. int_or_none,
  8. parse_iso8601,
  9. clean_html,
  10. qualities,
  11. )
  12. class Channel9IE(InfoExtractor):
  13. '''
  14. Common extractor for channel9.msdn.com.
  15. The type of provided URL (video or playlist) is determined according to
  16. meta Search.PageType from web page HTML rather than URL itself, as it is
  17. not always possible to do.
  18. '''
  19. IE_DESC = 'Channel 9'
  20. IE_NAME = 'channel9'
  21. _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
  22. _TESTS = [{
  23. 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
  24. 'md5': '32083d4eaf1946db6d454313f44510ca',
  25. 'info_dict': {
  26. 'id': '6c413323-383a-49dc-88f9-a22800cab024',
  27. 'ext': 'wmv',
  28. 'title': 'Developer Kick-Off Session: Stuff We Love',
  29. 'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
  30. 'duration': 4576,
  31. 'thumbnail': r're:https?://.*\.jpg',
  32. 'timestamp': 1377717420,
  33. 'upload_date': '20130828',
  34. 'session_code': 'KOS002',
  35. 'session_room': 'Arena 1A',
  36. 'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'],
  37. },
  38. }, {
  39. 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
  40. 'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
  41. 'info_dict': {
  42. 'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
  43. 'ext': 'wmv',
  44. 'title': 'Self-service BI with Power BI - nuclear testing',
  45. 'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
  46. 'duration': 1540,
  47. 'thumbnail': r're:https?://.*\.jpg',
  48. 'timestamp': 1386381991,
  49. 'upload_date': '20131207',
  50. 'authors': ['Mike Wilmot'],
  51. },
  52. }, {
  53. # low quality mp4 is best
  54. 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
  55. 'info_dict': {
  56. 'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
  57. 'ext': 'mp4',
  58. 'title': 'Ranges for the Standard Library',
  59. 'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
  60. 'duration': 5646,
  61. 'thumbnail': r're:https?://.*\.jpg',
  62. 'upload_date': '20150930',
  63. 'timestamp': 1443640735,
  64. },
  65. 'params': {
  66. 'skip_download': True,
  67. },
  68. }, {
  69. 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
  70. 'info_dict': {
  71. 'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
  72. 'title': 'Channel 9',
  73. },
  74. 'playlist_mincount': 100,
  75. }, {
  76. 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
  77. 'only_matching': True,
  78. }, {
  79. 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
  80. 'only_matching': True,
  81. }]
  82. _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
  83. def _extract_list(self, video_id, rss_url=None):
  84. if not rss_url:
  85. rss_url = self._RSS_URL % video_id
  86. rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
  87. entries = [self.url_result(session_url.text, 'Channel9')
  88. for session_url in rss.findall('./channel/item/link')]
  89. title_text = rss.find('./channel/title').text
  90. return self.playlist_result(entries, video_id, title_text)
  91. def _real_extract(self, url):
  92. content_path, rss = re.match(self._VALID_URL, url).groups()
  93. if rss:
  94. return self._extract_list(content_path, url)
  95. webpage = self._download_webpage(
  96. url, content_path, 'Downloading web page')
  97. episode_data = self._search_regex(
  98. r"data-episode='([^']+)'", webpage, 'episode data', default=None)
  99. if episode_data:
  100. episode_data = self._parse_json(unescapeHTML(
  101. episode_data), content_path)
  102. content_id = episode_data['contentId']
  103. is_session = '/Sessions(' in episode_data['api']
  104. content_url = 'https://channel9.msdn.com/odata' + episode_data['api']
  105. if is_session:
  106. content_url += '?$expand=Speakers'
  107. else:
  108. content_url += '?$expand=Authors'
  109. content_data = self._download_json(content_url, content_id)
  110. title = content_data['Title']
  111. QUALITIES = (
  112. 'mp3',
  113. 'wmv', 'mp4',
  114. 'wmv-low', 'mp4-low',
  115. 'wmv-mid', 'mp4-mid',
  116. 'wmv-high', 'mp4-high',
  117. )
  118. quality_key = qualities(QUALITIES)
  119. def quality(quality_id, format_url):
  120. return (len(QUALITIES) if '_Source.' in format_url
  121. else quality_key(quality_id))
  122. formats = []
  123. urls = set()
  124. SITE_QUALITIES = {
  125. 'MP3': 'mp3',
  126. 'MP4': 'mp4',
  127. 'Low Quality WMV': 'wmv-low',
  128. 'Low Quality MP4': 'mp4-low',
  129. 'Mid Quality WMV': 'wmv-mid',
  130. 'Mid Quality MP4': 'mp4-mid',
  131. 'High Quality WMV': 'wmv-high',
  132. 'High Quality MP4': 'mp4-high',
  133. }
  134. formats_select = self._search_regex(
  135. r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage,
  136. 'formats select', default=None)
  137. if formats_select:
  138. for mobj in re.finditer(
  139. r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<',
  140. formats_select):
  141. format_url = mobj.group('url')
  142. if format_url in urls:
  143. continue
  144. urls.add(format_url)
  145. format_id = mobj.group('format')
  146. quality_id = SITE_QUALITIES.get(format_id, format_id)
  147. formats.append({
  148. 'url': format_url,
  149. 'format_id': quality_id,
  150. 'quality': quality(quality_id, format_url),
  151. 'vcodec': 'none' if quality_id == 'mp3' else None,
  152. })
  153. API_QUALITIES = {
  154. 'VideoMP4Low': 'mp4-low',
  155. 'VideoWMV': 'wmv-mid',
  156. 'VideoMP4Medium': 'mp4-mid',
  157. 'VideoMP4High': 'mp4-high',
  158. 'VideoWMVHQ': 'wmv-hq',
  159. }
  160. for format_id, q in API_QUALITIES.items():
  161. q_url = content_data.get(format_id)
  162. if not q_url or q_url in urls:
  163. continue
  164. urls.add(q_url)
  165. formats.append({
  166. 'url': q_url,
  167. 'format_id': q,
  168. 'quality': quality(q, q_url),
  169. })
  170. self._sort_formats(formats)
  171. slides = content_data.get('Slides')
  172. zip_file = content_data.get('ZipFile')
  173. if not formats and not slides and not zip_file:
  174. raise ExtractorError(
  175. 'None of recording, slides or zip are available for %s' % content_path)
  176. subtitles = {}
  177. for caption in content_data.get('Captions', []):
  178. caption_url = caption.get('Url')
  179. if not caption_url:
  180. continue
  181. subtitles.setdefault(caption.get('Language', 'en'), []).append({
  182. 'url': caption_url,
  183. 'ext': 'vtt',
  184. })
  185. common = {
  186. 'id': content_id,
  187. 'title': title,
  188. 'description': clean_html(content_data.get('Description') or content_data.get('Body')),
  189. 'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'),
  190. 'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
  191. 'timestamp': parse_iso8601(content_data.get('PublishedDate')),
  192. 'avg_rating': int_or_none(content_data.get('Rating')),
  193. 'rating_count': int_or_none(content_data.get('RatingCount')),
  194. 'view_count': int_or_none(content_data.get('Views')),
  195. 'comment_count': int_or_none(content_data.get('CommentCount')),
  196. 'subtitles': subtitles,
  197. }
  198. if is_session:
  199. speakers = []
  200. for s in content_data.get('Speakers', []):
  201. speaker_name = s.get('FullName')
  202. if not speaker_name:
  203. continue
  204. speakers.append(speaker_name)
  205. common.update({
  206. 'session_code': content_data.get('Code'),
  207. 'session_room': content_data.get('Room'),
  208. 'session_speakers': speakers,
  209. })
  210. else:
  211. authors = []
  212. for a in content_data.get('Authors', []):
  213. author_name = a.get('DisplayName')
  214. if not author_name:
  215. continue
  216. authors.append(author_name)
  217. common['authors'] = authors
  218. contents = []
  219. if slides:
  220. d = common.copy()
  221. d.update({'title': title + '-Slides', 'url': slides})
  222. contents.append(d)
  223. if zip_file:
  224. d = common.copy()
  225. d.update({'title': title + '-Zip', 'url': zip_file})
  226. contents.append(d)
  227. if formats:
  228. d = common.copy()
  229. d.update({'title': title, 'formats': formats})
  230. contents.append(d)
  231. return self.playlist_result(contents)
  232. else:
  233. return self._extract_list(content_path)