stanfordoc.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. import re
  2. from .common import InfoExtractor
  3. from ..utils import (
  4. compat_str,
  5. ExtractorError,
  6. orderedSet,
  7. unescapeHTML,
  8. )
  9. class StanfordOpenClassroomIE(InfoExtractor):
  10. IE_NAME = u'stanfordoc'
  11. IE_DESC = u'Stanford Open ClassRoom'
  12. _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
  13. _TEST = {
  14. u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
  15. u'file': u'PracticalUnix_intro-environment.mp4',
  16. u'md5': u'544a9468546059d4e80d76265b0443b8',
  17. u'info_dict': {
  18. u"title": u"Intro Environment"
  19. }
  20. }
  21. def _real_extract(self, url):
  22. mobj = re.match(self._VALID_URL, url)
  23. if mobj is None:
  24. raise ExtractorError(u'Invalid URL: %s' % url)
  25. if mobj.group('course') and mobj.group('video'): # A specific video
  26. course = mobj.group('course')
  27. video = mobj.group('video')
  28. info = {
  29. 'id': course + '_' + video,
  30. 'uploader': None,
  31. 'upload_date': None,
  32. }
  33. self.report_extraction(info['id'])
  34. baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
  35. xmlUrl = baseUrl + video + '.xml'
  36. mdoc = self._download_xml(xmlUrl, info['id'])
  37. try:
  38. info['title'] = mdoc.findall('./title')[0].text
  39. info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
  40. except IndexError:
  41. raise ExtractorError(u'Invalid metadata XML file')
  42. info['ext'] = info['url'].rpartition('.')[2]
  43. return [info]
  44. elif mobj.group('course'): # A course page
  45. course = mobj.group('course')
  46. info = {
  47. 'id': course,
  48. 'type': 'playlist',
  49. 'uploader': None,
  50. 'upload_date': None,
  51. }
  52. coursepage = self._download_webpage(url, info['id'],
  53. note='Downloading course info page',
  54. errnote='Unable to download course info page')
  55. info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
  56. info['description'] = self._html_search_regex('<description>([^<]+)</description>',
  57. coursepage, u'description', fatal=False)
  58. links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
  59. info['list'] = [
  60. {
  61. 'type': 'reference',
  62. 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
  63. }
  64. for vpage in links]
  65. results = []
  66. for entry in info['list']:
  67. assert entry['type'] == 'reference'
  68. results += self.extract(entry['url'])
  69. return results
  70. else: # Root page
  71. info = {
  72. 'id': 'Stanford OpenClassroom',
  73. 'type': 'playlist',
  74. 'uploader': None,
  75. 'upload_date': None,
  76. }
  77. rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
  78. rootpage = self._download_webpage(rootURL, info['id'],
  79. errnote=u'Unable to download course info page')
  80. info['title'] = info['id']
  81. links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
  82. info['list'] = [
  83. {
  84. 'type': 'reference',
  85. 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
  86. }
  87. for cpage in links]
  88. results = []
  89. for entry in info['list']:
  90. assert entry['type'] == 'reference'
  91. results += self.extract(entry['url'])
  92. return results