videolecturesnet.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..compat import (
  5. compat_HTTPError,
  6. compat_urlparse,
  7. )
  8. from ..utils import (
  9. ExtractorError,
  10. parse_duration,
  11. js_to_json,
  12. parse_iso8601,
  13. )
  14. class VideoLecturesNetIE(InfoExtractor):
  15. _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?'
  16. IE_NAME = 'videolectures.net'
  17. _TESTS = [{
  18. 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
  19. 'info_dict': {
  20. 'id': '20171_part1',
  21. 'ext': 'mp4',
  22. 'title': 'Automatics, robotics and biocybernetics',
  23. 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
  24. 'upload_date': '20130627',
  25. 'duration': 565,
  26. 'thumbnail': 're:http://.*\.jpg',
  27. },
  28. }, {
  29. # video with invalid direct format links (HTTP 403)
  30. 'url': 'http://videolectures.net/russir2010_filippova_nlp/',
  31. 'info_dict': {
  32. 'id': '14891_part1',
  33. 'ext': 'flv',
  34. 'title': 'NLP at Google',
  35. 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3',
  36. 'duration': 5352,
  37. 'thumbnail': 're:http://.*\.jpg',
  38. },
  39. 'params': {
  40. # rtmp download
  41. 'skip_download': True,
  42. },
  43. }, {
  44. 'url': 'http://videolectures.net/deeplearning2015_montreal/',
  45. 'info_dict': {
  46. 'id': '23181',
  47. 'title': 'Deep Learning Summer School, Montreal 2015',
  48. 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7',
  49. 'timestamp': 1438560000,
  50. },
  51. 'playlist_count': 30,
  52. }, {
  53. # multi part lecture
  54. 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/',
  55. 'info_dict': {
  56. 'id': '9737',
  57. 'title': 'Introduction To Bayesian Inference',
  58. 'timestamp': 1251622800,
  59. },
  60. 'playlist': [{
  61. 'info_dict': {
  62. 'id': '9737_part1',
  63. 'ext': 'wmv',
  64. 'title': 'Introduction To Bayesian Inference',
  65. },
  66. }, {
  67. 'info_dict': {
  68. 'id': '9737_part2',
  69. 'ext': 'wmv',
  70. 'title': 'Introduction To Bayesian Inference',
  71. },
  72. }],
  73. 'playlist_count': 2,
  74. }]
  75. def _real_extract(self, url):
  76. lecture_slug, part = re.match(self._VALID_URL, url).groups()
  77. webpage = self._download_webpage(url, lecture_slug)
  78. cfg = self._parse_json(self._search_regex(r'cfg\s*:\s*({[^}]+})', webpage, 'cfg'), lecture_slug, js_to_json)
  79. lecture_id = str(cfg['obj_id'])
  80. lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (self._proto_relative_url(cfg['livepipe'], 'http:'), lecture_id), lecture_id)['lecture'][0]
  81. lecture_info = {
  82. 'id': lecture_id,
  83. 'display_id': lecture_slug,
  84. 'title': lecture_data['title'],
  85. 'timestamp': parse_iso8601(lecture_data.get('time')),
  86. 'description': lecture_data.get('description_wiki'),
  87. 'thumbnail': lecture_data.get('thumb'),
  88. }
  89. entries = []
  90. parts = cfg.get('videos')
  91. if parts:
  92. if len(parts) == 1:
  93. part = str(parts[0])
  94. if part:
  95. smil_url = 'http://videolectures.net/%s/video/%s/smil.xml' % (lecture_slug, part)
  96. smil = self._download_smil(smil_url, lecture_id)
  97. info = self._parse_smil(smil, smil_url, lecture_id)
  98. info['id'] = '%s_part%s' % (lecture_id, part)
  99. switch = smil.find('.//switch')
  100. if switch is not None:
  101. info['duration'] = parse_duration(switch.attrib.get('dur'))
  102. return info
  103. else:
  104. for part in parts:
  105. entries.append(self.url_result('http://videolectures.net/%s/video/%s' % (lecture_slug, part), 'VideoLecturesNet'))
  106. lecture_info['_type'] = 'multi_video'
  107. else:
  108. # Probably a playlist
  109. entries = [
  110. self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet')
  111. for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)]
  112. lecture_info['_type'] = 'playlist'
  113. lecture_info['entries'] = entries
  114. return lecture_info