瀏覽代碼

Fix MIT extractor for Python 2.6

The HTML for the MIT page does not parse cleanly for Python 2.6 due
to script tags within an actual script element.  The offending piece
is inside a comment block, so removing all such comment blocks
fixes the parsing.
Jeff Smith 12 年之前
父節點
當前提交
b5ba7b9dcf
共有 1 個文件被更改,包括 7 次插入9 次删除
  1. 7 9
      youtube_dl/extractor/mit.py

+ 7 - 9
youtube_dl/extractor/mit.py

@@ -25,23 +25,21 @@ class TechTVMITIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
-        webpage = self._download_webpage(
+        raw_page = self._download_webpage(
             'http://techtv.mit.edu/videos/%s' % video_id, video_id)
-        embed_page = self._download_webpage(
-            'http://techtv.mit.edu/embeds/%s/' % video_id, video_id,
-            note=u'Downloading embed page')
+        clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page)
 
         base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)',
-            embed_page, u'base url')
-        formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page,
+            raw_page, u'base url')
+        formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,
             u'video formats')
         formats = json.loads(formats_json)
         formats = sorted(formats, key=lambda f: f['bitrate'])
 
-        title = get_element_by_id('edit-title', webpage)
-        description = clean_html(get_element_by_id('edit-description', webpage))
+        title = get_element_by_id('edit-title', clean_page)
+        description = clean_html(get_element_by_id('edit-description', clean_page))
         thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
-            embed_page, u'thumbnail', flags=re.DOTALL)
+            raw_page, u'thumbnail', flags=re.DOTALL)
 
         return {'id': video_id,
                 'title': title,