il y a 14 ans · 0b14e0b367
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -282,6 +282,14 @@ def _simplify_title(title):
 
				 	expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
			
 
				 	return expr.sub(u'_', title).strip(u'_')
			
 
				 
			
 
				+def _orderedSet(iterable):
			
 
				+	""" Return the elements of the input iterable as a list with duplicates removed, keeping first-occurrence order """
			
 
				+	res = []
			
 
				+	for el in iterable:
			
 
				+		if el not in res:
			
 
				+			res.append(el)
			
 
				+	return res
			
 
				+
			
 
				 class DownloadError(Exception):
			
 
				 	"""Download Error exception.
			
 
				 
			
@@ -711,25 +719,6 @@ class FileDownloader(object):
 
				 			return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
			
 
				 		return None
			
 
				 
			
 
				-	def process_dict(self, info_dict):
			
 
				-		""" Download and handle the extracted information.
			
 
				-		For details on the specification of the various types of content, refer to the _process_* functions. """
			
 
				-		if info_dict['type'] == 'playlist':
			
 
				-			self._process_playlist(info_dict)
			
 
				-		elif info_dict['type'] == 'legacy-video':
			
 
				-			self.process_info(info_dict)
			
 
				-		else:
			
 
				-			raise ValueError('Invalid item type')
			
 
				-
			
 
				-	def _process_playlist(self, info_dict):
			
 
				-		assert info_dict['type'] == 'playlist'
			
 
				-		assert 'title' in info_dict
			
 
				-		assert 'stitle' in info_dict
			
 
				-		entries = info_dict['list']
			
 
				-
			
 
				-		for e in entries:
			
 
				-			self.process_dict(e)
			
 
				-
			
 
				 	def process_info(self, info_dict):
			
 
				 		"""Process a single dictionary returned by an InfoExtractor."""
			
 
				 
			
@@ -3766,9 +3755,13 @@ class MixcloudIE(InfoExtractor):
 
				 class StanfordOpenClassroomIE(InfoExtractor):
			
 
				 	"""Information extractor for Stanford's Open ClassRoom"""
			
 
				 
			
 
				-	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
			
 
				+	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
			
 
				 	IE_NAME = u'stanfordoc'
			
 
				 
			
 
				+	def report_download_webpage(self, objid):
			
 
				+		"""Report webpage download."""
			
 
				+		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
			
 
				+
			
 
				 	def report_extraction(self, video_id):
			
 
				 		"""Report information extraction."""
			
 
				 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
			
@@ -3792,7 +3785,7 @@ class StanfordOpenClassroomIE(InfoExtractor):
 
				 			try:
			
 
				 				metaXml = urllib2.urlopen(xmlUrl).read()
			
 
				 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			
 
				-				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
			
 
				+				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
			
 
				 				return
			
 
				 			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			
 
				 			try:
			
@@ -3809,13 +3802,74 @@ class StanfordOpenClassroomIE(InfoExtractor):
 
				 				self._downloader.process_info(info)
			
 
				 			except UnavailableVideoError, err:
			
 
				 				self._downloader.trouble(u'\nERROR: unable to download video')
			
 
				-		else:
			
 
				-			print('TODO: Not yet implemented')
			
 
				-			1/0
			
 
				+		elif mobj.group('course'): # A course page
			
 
				+			unescapeHTML = HTMLParser.HTMLParser().unescape
			
 
				 
			
 
				+			course = mobj.group('course')
			
 
				+			info = {
			
 
				+				'id': _simplify_title(course),
			
 
				+				'type': 'playlist',
			
 
				+			}
			
 
				 
			
 
				+			self.report_download_webpage(info['id'])
			
 
				+			try:
			
 
				+				coursepage = urllib2.urlopen(url).read()
			
 
				+			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			
 
				+				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
			
 
				+				return
			
 
				 
			
 
				+			m = re.search('<h1>([^<]+)</h1>', coursepage)
			
 
				+			if m:
			
 
				+				info['title'] = unescapeHTML(m.group(1))
			
 
				+			else:
			
 
				+				info['title'] = info['id']
			
 
				+			info['stitle'] = _simplify_title(info['title'])
			
 
				+
			
 
				+			m = re.search('<description>([^<]+)</description>', coursepage)
			
 
				+			if m:
			
 
				+				info['description'] = unescapeHTML(m.group(1))
			
 
				+
			
 
				+			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			
 
				+			info['list'] = [
			
 
				+				{
			
 
				+					'type': 'reference',
			
 
				+					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
			
 
				+				}
			
 
				+					for vpage in links]
			
 
				+
			
 
				+			for entry in info['list']:
			
 
				+				assert entry['type'] == 'reference'
			
 
				+				self.extract(entry['url'])
			
 
				+		else: # Root page
			
 
				+			unescapeHTML = HTMLParser.HTMLParser().unescape
			
 
				+
			
 
				+			info = {
			
 
				+				'id': 'Stanford OpenClassroom',
			
 
				+				'type': 'playlist',
			
 
				+			}
			
 
				+
			
 
				+			self.report_download_webpage(info['id'])
			
 
				+			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			
 
				+			try:
			
 
				+				rootpage = urllib2.urlopen(rootURL).read()
			
 
				+			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			
 
				+				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
			
 
				+				return
			
 
				+
			
 
				+			info['title'] = info['id']
			
 
				+			info['stitle'] = _simplify_title(info['title'])
			
 
				+
			
 
				+			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			
 
				+			info['list'] = [
			
 
				+				{
			
 
				+					'type': 'reference',
			
 
				+					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
			
 
				+				}
			
 
				+					for cpage in links]
			
 
				 
			
 
				+			for entry in info['list']:
			
 
				+				assert entry['type'] == 'reference'
			
 
				+				self.extract(entry['url'])
			
 
				 
			
 
				 
			
 
				 class PostProcessor(object):