12 years ago · f8b362739e
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -70,6 +70,7 @@ from .roxwel import RoxwelIE
 
				 from .rtlnow import RTLnowIE
			
 
				 from .sina import SinaIE
			
 
				 from .slashdot import SlashdotIE
			
 
				+from .sohu import SohuIE
			
 
				 from .soundcloud import SoundcloudIE, SoundcloudSetIE
			
 
				 from .spiegel import SpiegelIE
			
 
				 from .stanfordoc import StanfordOpenClassroomIE
			
--- a/youtube_dl/extractor/kankan.py
+++ b/youtube_dl/extractor/kankan.py
@@ -21,8 +21,10 @@ class KankanIE(InfoExtractor):
 
				         video_id = mobj.group('id')
			
 
				         webpage = self._download_webpage(url, video_id)
			
 
				 
			
 
				-        title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title')
			
 
				-        gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid')
			
 
				+        title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, u'video title')
			
 
				+        surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0)
			
 
				+        gcids = re.findall(r"http://.+?/.+?/(.+?)/", surls)
			
 
				+        gcid = gcids[-1]
			
 
				 
			
 
				         video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid,
			
 
				                                                  video_id, u'Downloading video url info')
			
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -0,0 +1,93 @@
 
				+# encoding: utf-8
			
 
				+
			
 
				+import re
			
 
				+import json
			
 
				+import time
			
 
				+import logging
			
 
				+import urllib2
			
 
				+
			
 
				+from .common import InfoExtractor
			
 
				+from ..utils import compat_urllib_request, clean_html
			
 
				+
			
 
				+
			
 
				+class SohuIE(InfoExtractor):
			
 
				+    _VALID_URL = r'https?://tv\.sohu\.com/\d+?/n(?P<id>\d+)\.shtml.*?'
			
 
				+
			
 
				+    _TEST = {
			
 
				+        u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super',
			
 
				+        u'file': u'382479172.flv',
			
 
				+        u'md5': u'cc84eed6b6fbf0f2f9a8d3cb9da1939b',
			
 
				+        u'info_dict': {
			
 
				+            u'title': u'The Illest - Far East Movement Riff Raff',
			
 
				+        },
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+    def _real_extract(self, url):
			
 
				+        mobj = re.match(self._VALID_URL, url)
			
 
				+        video_id = mobj.group('id')
			
 
				+        webpage = self._download_webpage(url, video_id)
			
 
				+        pattern = r'<title>(.+?)</title>'
			
 
				+        compiled = re.compile(pattern, re.DOTALL)
			
 
				+        title = self._search_regex(compiled, webpage, u'video title')
			
 
				+        title = clean_html(title).split('-')[0].strip()
			
 
				+        self.to_screen('Title: %s' % title)
			
 
				+        pattern = re.compile(r'var vid="(\d+)"')
			
 
				+        result = re.search(pattern, webpage)
			
 
				+        if not result:
			
 
				+            logging.info('[Sohu] could not get vid')
			
 
				+            return None
			
 
				+        vid = result.group(1)
			
 
				+        logging.info('vid: %s' % vid)
			
 
				+        base_url_1 = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
			
 
				+        url_1 = base_url_1 + vid
			
 
				+        logging.info('json url: %s' % url_1)
			
 
				+        webpage = self._download_webpage(url_1, vid)
			
 
				+        json_1 = json.loads(webpage)
			
 
				+        # get the highest definition video vid and json infomation.
			
 
				+        vids = []
			
 
				+        qualities = ('oriVid', 'superVid', 'highVid', 'norVid')
			
 
				+        for vid_name in qualities:
			
 
				+            vids.append(json_1['data'][vid_name])
			
 
				+        clearest_vid = 0
			
 
				+        for i, v in enumerate(vids):
			
 
				+            if v != 0:
			
 
				+                clearest_vid = v
			
 
				+                logging.info('quality definition: %s' % qualities[i][:-3])
			
 
				+                break
			
 
				+        if not clearest_vid:
			
 
				+            logging.warning('could not find valid clearest_vid')
			
 
				+            return None
			
 
				+        if vid != clearest_vid:
			
 
				+            url_1 = '%s%d' % (base_url_1, clearest_vid)
			
 
				+            logging.info('highest definition json url: %s' % url_1)
			
 
				+            json_1 = json.loads(urllib2.urlopen(url_1).read())
			
 
				+        allot = json_1['allot']
			
 
				+        prot = json_1['prot']
			
 
				+        clipsURL = json_1['data']['clipsURL']
			
 
				+        su = json_1['data']['su']
			
 
				+        num_of_parts = json_1['data']['totalBlocks']
			
 
				+        logging.info('Total parts: %d' % num_of_parts)
			
 
				+        base_url_3 = 'http://allot/?prot=prot&file=clipsURL[i]&new=su[i]'
			
 
				+        files_info = []
			
 
				+        for i in range(num_of_parts):
			
 
				+            self.to_screen('Geting json infomation of part %s/%s' % (i + 1, num_of_parts))
			
 
				+            middle_url = 'http://%s/?prot=%s&file=%s&new=%s' % (allot, prot, clipsURL[i], su[i])
			
 
				+            logging.info('middle url part %d: %s' % (i, middle_url))
			
 
				+            middle_info = urllib2.urlopen(middle_url).read().split('|')
			
 
				+            middle_part_1 = middle_info[0]
			
 
				+            download_url = '%s%s?key=%s' % (middle_info[0], su[i], middle_info[3])
			
 
				+
			
 
				+            info = {
			
 
				+                'id': '%s_part%02d' % (video_id, i + 1),
			
 
				+                'title': title,
			
 
				+                'url': download_url,
			
 
				+                'ext': 'mp4',
			
 
				+            }
			
 
				+            files_info.append(info)
			
 
				+            time.sleep(1)
			
 
				+        if num_of_parts == 1:
			
 
				+            info =  files_info[0]
			
 
				+            info['id'] = video_id
			
 
				+            return info
			
 
				+        return files_info