# iqiyi.py (21 KB)
# coding: utf-8
from __future__ import unicode_literals

import hashlib
import itertools
import math
import os
import random
import re
import time
import uuid

from .common import InfoExtractor
from ..compat import (
    compat_parse_qs,
    compat_str,
    compat_urllib_parse,
    compat_urllib_parse_urlparse,
)
from ..utils import (
    base62,
    ExtractorError,
    ohdave_rsa_encrypt,
    remove_start,
    sanitized_Request,
    urlencode_postdata,
    url_basename,
)
  27. def md5_text(text):
  28. return hashlib.md5(text.encode('utf-8')).hexdigest()
  29. class IqiyiSDK(object):
  30. def __init__(self, target, ip, timestamp):
  31. self.target = target
  32. self.ip = ip
  33. self.timestamp = timestamp
  34. @staticmethod
  35. def split_sum(data):
  36. return compat_str(sum(map(lambda p: int(p, 16), list(data))))
  37. @staticmethod
  38. def digit_sum(num):
  39. if isinstance(num, int):
  40. num = compat_str(num)
  41. return compat_str(sum(map(int, num)))
  42. def even_odd(self):
  43. even = self.digit_sum(compat_str(self.timestamp)[::2])
  44. odd = self.digit_sum(compat_str(self.timestamp)[1::2])
  45. return even, odd
  46. def preprocess(self, chunksize):
  47. self.target = md5_text(self.target)
  48. chunks = []
  49. for i in range(32 // chunksize):
  50. chunks.append(self.target[chunksize * i:chunksize * (i + 1)])
  51. if 32 % chunksize:
  52. chunks.append(self.target[32 - 32 % chunksize:])
  53. return chunks, list(map(int, self.ip.split('.')))
  54. def mod(self, modulus):
  55. chunks, ip = self.preprocess(32)
  56. self.target = chunks[0] + ''.join(map(lambda p: compat_str(p % modulus), ip))
  57. def split(self, chunksize):
  58. modulus_map = {
  59. 4: 256,
  60. 5: 10,
  61. 8: 100,
  62. }
  63. chunks, ip = self.preprocess(chunksize)
  64. ret = ''
  65. for i in range(len(chunks)):
  66. ip_part = compat_str(ip[i] % modulus_map[chunksize]) if i < 4 else ''
  67. if chunksize == 8:
  68. ret += ip_part + chunks[i]
  69. else:
  70. ret += chunks[i] + ip_part
  71. self.target = ret
  72. def handle_input16(self):
  73. self.target = md5_text(self.target)
  74. self.target = self.split_sum(self.target[:16]) + self.target + self.split_sum(self.target[16:])
  75. def handle_input8(self):
  76. self.target = md5_text(self.target)
  77. ret = ''
  78. for i in range(4):
  79. part = self.target[8 * i:8 * (i + 1)]
  80. ret += self.split_sum(part) + part
  81. self.target = ret
  82. def handleSum(self):
  83. self.target = md5_text(self.target)
  84. self.target = self.split_sum(self.target) + self.target
  85. def date(self, scheme):
  86. self.target = md5_text(self.target)
  87. d = time.localtime(self.timestamp)
  88. strings = {
  89. 'y': compat_str(d.tm_year),
  90. 'm': '%02d' % d.tm_mon,
  91. 'd': '%02d' % d.tm_mday,
  92. }
  93. self.target += ''.join(map(lambda c: strings[c], list(scheme)))
  94. def split_time_even_odd(self):
  95. even, odd = self.even_odd()
  96. self.target = odd + md5_text(self.target) + even
  97. def split_time_odd_even(self):
  98. even, odd = self.even_odd()
  99. self.target = even + md5_text(self.target) + odd
  100. def split_ip_time_sum(self):
  101. chunks, ip = self.preprocess(32)
  102. self.target = compat_str(sum(ip)) + chunks[0] + self.digit_sum(self.timestamp)
  103. def split_time_ip_sum(self):
  104. chunks, ip = self.preprocess(32)
  105. self.target = self.digit_sum(self.timestamp) + chunks[0] + compat_str(sum(ip))
  106. class IqiyiSDKInterpreter(object):
  107. def __init__(self, sdk_code):
  108. self.sdk_code = sdk_code
  109. def decode_eval_codes(self):
  110. self.sdk_code = self.sdk_code[5:-3]
  111. mobj = re.search(
  112. r"'([^']+)',62,(\d+),'([^']+)'\.split\('\|'\),[^,]+,{}",
  113. self.sdk_code)
  114. obfucasted_code, count, symbols = mobj.groups()
  115. count = int(count)
  116. symbols = symbols.split('|')
  117. symbol_table = {}
  118. while count:
  119. count -= 1
  120. b62count = base62(count)
  121. symbol_table[b62count] = symbols[count] or b62count
  122. self.sdk_code = re.sub(
  123. r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
  124. obfucasted_code)
  125. def run(self, target, ip, timestamp):
  126. self.decode_eval_codes()
  127. functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', self.sdk_code)
  128. sdk = IqiyiSDK(target, ip, timestamp)
  129. other_functions = {
  130. 'handleSum': sdk.handleSum,
  131. 'handleInput8': sdk.handle_input8,
  132. 'handleInput16': sdk.handle_input16,
  133. 'splitTimeEvenOdd': sdk.split_time_even_odd,
  134. 'splitTimeOddEven': sdk.split_time_odd_even,
  135. 'splitIpTimeSum': sdk.split_ip_time_sum,
  136. 'splitTimeIpSum': sdk.split_time_ip_sum,
  137. }
  138. for function in functions:
  139. if re.match(r'mod\d+', function):
  140. sdk.mod(int(function[3:]))
  141. elif re.match(r'date[ymd]{3}', function):
  142. sdk.date(function[4:])
  143. elif re.match(r'split\d+', function):
  144. sdk.split(int(function[5:]))
  145. elif function in other_functions:
  146. other_functions[function]()
  147. else:
  148. raise ExtractorError('Unknown funcion %s' % function)
  149. return sdk.target
  150. class IqiyiIE(InfoExtractor):
  151. IE_NAME = 'iqiyi'
  152. IE_DESC = '爱奇艺'
  153. _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html'
  154. _NETRC_MACHINE = 'iqiyi'
  155. _TESTS = [{
  156. 'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
  157. 'md5': '2cb594dc2781e6c941a110d8f358118b',
  158. 'info_dict': {
  159. 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
  160. 'title': '美国德州空中惊现奇异云团 酷似UFO',
  161. 'ext': 'f4v',
  162. }
  163. }, {
  164. 'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
  165. 'info_dict': {
  166. 'id': 'e3f585b550a280af23c98b6cb2be19fb',
  167. 'title': '名侦探柯南第752集',
  168. },
  169. 'playlist': [{
  170. 'info_dict': {
  171. 'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
  172. 'ext': 'f4v',
  173. 'title': '名侦探柯南第752集',
  174. },
  175. }, {
  176. 'info_dict': {
  177. 'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
  178. 'ext': 'f4v',
  179. 'title': '名侦探柯南第752集',
  180. },
  181. }, {
  182. 'info_dict': {
  183. 'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
  184. 'ext': 'f4v',
  185. 'title': '名侦探柯南第752集',
  186. },
  187. }, {
  188. 'info_dict': {
  189. 'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
  190. 'ext': 'f4v',
  191. 'title': '名侦探柯南第752集',
  192. },
  193. }, {
  194. 'info_dict': {
  195. 'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
  196. 'ext': 'f4v',
  197. 'title': '名侦探柯南第752集',
  198. },
  199. }, {
  200. 'info_dict': {
  201. 'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
  202. 'ext': 'f4v',
  203. 'title': '名侦探柯南第752集',
  204. },
  205. }, {
  206. 'info_dict': {
  207. 'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
  208. 'ext': 'f4v',
  209. 'title': '名侦探柯南第752集',
  210. },
  211. }, {
  212. 'info_dict': {
  213. 'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
  214. 'ext': 'f4v',
  215. 'title': '名侦探柯南第752集',
  216. },
  217. }],
  218. 'params': {
  219. 'skip_download': True,
  220. },
  221. }, {
  222. 'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html',
  223. 'only_matching': True,
  224. }, {
  225. 'url': 'http://www.iqiyi.com/a_19rrhbc6kt.html',
  226. 'only_matching': True,
  227. }, {
  228. 'url': 'http://yule.iqiyi.com/pcb.html',
  229. 'only_matching': True,
  230. }, {
  231. # VIP-only video. The first 2 parts (6 minutes) are available without login
  232. # MD5 sums omitted as values are different on Travis CI and my machine
  233. 'url': 'http://www.iqiyi.com/v_19rrny4w8w.html',
  234. 'info_dict': {
  235. 'id': 'f3cf468b39dddb30d676f89a91200dc1',
  236. 'title': '泰坦尼克号',
  237. },
  238. 'playlist': [{
  239. 'info_dict': {
  240. 'id': 'f3cf468b39dddb30d676f89a91200dc1_part1',
  241. 'ext': 'f4v',
  242. 'title': '泰坦尼克号',
  243. },
  244. }, {
  245. 'info_dict': {
  246. 'id': 'f3cf468b39dddb30d676f89a91200dc1_part2',
  247. 'ext': 'f4v',
  248. 'title': '泰坦尼克号',
  249. },
  250. }],
  251. 'expected_warnings': ['Needs a VIP account for full video'],
  252. }, {
  253. 'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',
  254. 'info_dict': {
  255. 'id': '202918101',
  256. 'title': '灌篮高手 国语版',
  257. },
  258. 'playlist_count': 101,
  259. }]
  260. _FORMATS_MAP = [
  261. ('1', 'h6'),
  262. ('2', 'h5'),
  263. ('3', 'h4'),
  264. ('4', 'h3'),
  265. ('5', 'h2'),
  266. ('10', 'h1'),
  267. ]
  268. def _real_initialize(self):
  269. self._login()
  270. @staticmethod
  271. def _rsa_fun(data):
  272. # public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js
  273. N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd
  274. e = 65537
  275. return ohdave_rsa_encrypt(data, e, N)
  276. def _login(self):
  277. (username, password) = self._get_login_info()
  278. # No authentication to be performed
  279. if not username:
  280. return True
  281. data = self._download_json(
  282. 'http://kylin.iqiyi.com/get_token', None,
  283. note='Get token for logging', errnote='Unable to get token for logging')
  284. sdk = data['sdk']
  285. timestamp = int(time.time())
  286. target = '/apis/reglogin/login.action?lang=zh_TW&area_code=null&email=%s&passwd=%s&agenttype=1&from=undefined&keeplogin=0&piccode=&fromurl=&_pos=1' % (
  287. username, self._rsa_fun(password.encode('utf-8')))
  288. interp = IqiyiSDKInterpreter(sdk)
  289. sign = interp.run(target, data['ip'], timestamp)
  290. validation_params = {
  291. 'target': target,
  292. 'server': 'BEA3AA1908656AABCCFF76582C4C6660',
  293. 'token': data['token'],
  294. 'bird_src': 'f8d91d57af224da7893dd397d52d811a',
  295. 'sign': sign,
  296. 'bird_t': timestamp,
  297. }
  298. validation_result = self._download_json(
  299. 'http://kylin.iqiyi.com/validate?' + compat_urllib_parse.urlencode(validation_params), None,
  300. note='Validate credentials', errnote='Unable to validate credentials')
  301. MSG_MAP = {
  302. 'P00107': 'please login via the web interface and enter the CAPTCHA code',
  303. 'P00117': 'bad username or password',
  304. }
  305. code = validation_result['code']
  306. if code != 'A00000':
  307. msg = MSG_MAP.get(code)
  308. if not msg:
  309. msg = 'error %s' % code
  310. if validation_result.get('msg'):
  311. msg += ': ' + validation_result['msg']
  312. self._downloader.report_warning('unable to log in: ' + msg)
  313. return False
  314. return True
  315. def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning):
  316. auth_params = {
  317. # version and platform hard-coded in com/qiyi/player/core/model/remote/AuthenticationRemote.as
  318. 'version': '2.0',
  319. 'platform': 'b6c13e26323c537d',
  320. 'aid': tvid,
  321. 'tvid': tvid,
  322. 'uid': '',
  323. 'deviceId': _uuid,
  324. 'playType': 'main', # XXX: always main?
  325. 'filename': os.path.splitext(url_basename(api_video_url))[0],
  326. }
  327. qd_items = compat_parse_qs(compat_urllib_parse_urlparse(api_video_url).query)
  328. for key, val in qd_items.items():
  329. auth_params[key] = val[0]
  330. auth_req = sanitized_Request(
  331. 'http://api.vip.iqiyi.com/services/ckn.action',
  332. urlencode_postdata(auth_params))
  333. # iQiyi server throws HTTP 405 error without the following header
  334. auth_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
  335. auth_result = self._download_json(
  336. auth_req, video_id,
  337. note='Downloading video authentication JSON',
  338. errnote='Unable to download video authentication JSON')
  339. if auth_result['code'] == 'Q00506': # requires a VIP account
  340. if do_report_warning:
  341. self.report_warning('Needs a VIP account for full video')
  342. return False
  343. return auth_result
  344. def construct_video_urls(self, data, video_id, _uuid, tvid):
  345. def do_xor(x, y):
  346. a = y % 3
  347. if a == 1:
  348. return x ^ 121
  349. if a == 2:
  350. return x ^ 72
  351. return x ^ 103
  352. def get_encode_code(l):
  353. a = 0
  354. b = l.split('-')
  355. c = len(b)
  356. s = ''
  357. for i in range(c - 1, -1, -1):
  358. a = do_xor(int(b[c - i - 1], 16), i)
  359. s += chr(a)
  360. return s[::-1]
  361. def get_path_key(x, format_id, segment_index):
  362. mg = ')(*&^flash@#$%a'
  363. tm = self._download_json(
  364. 'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
  365. note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
  366. )['t']
  367. t = str(int(math.floor(int(tm) / (600.0))))
  368. return md5_text(t + mg + x)
  369. video_urls_dict = {}
  370. need_vip_warning_report = True
  371. for format_item in data['vp']['tkl'][0]['vs']:
  372. if 0 < int(format_item['bid']) <= 10:
  373. format_id = self.get_format(format_item['bid'])
  374. else:
  375. continue
  376. video_urls = []
  377. video_urls_info = format_item['fs']
  378. if not format_item['fs'][0]['l'].startswith('/'):
  379. t = get_encode_code(format_item['fs'][0]['l'])
  380. if t.endswith('mp4'):
  381. video_urls_info = format_item['flvs']
  382. for segment_index, segment in enumerate(video_urls_info):
  383. vl = segment['l']
  384. if not vl.startswith('/'):
  385. vl = get_encode_code(vl)
  386. is_vip_video = '/vip/' in vl
  387. filesize = segment['b']
  388. base_url = data['vp']['du'].split('/')
  389. if not is_vip_video:
  390. key = get_path_key(
  391. vl.split('/')[-1].split('.')[0], format_id, segment_index)
  392. base_url.insert(-1, key)
  393. base_url = '/'.join(base_url)
  394. param = {
  395. 'su': _uuid,
  396. 'qyid': uuid.uuid4().hex,
  397. 'client': '',
  398. 'z': '',
  399. 'bt': '',
  400. 'ct': '',
  401. 'tn': str(int(time.time()))
  402. }
  403. api_video_url = base_url + vl
  404. if is_vip_video:
  405. api_video_url = api_video_url.replace('.f4v', '.hml')
  406. auth_result = self._authenticate_vip_video(
  407. api_video_url, video_id, tvid, _uuid, need_vip_warning_report)
  408. if auth_result is False:
  409. need_vip_warning_report = False
  410. break
  411. param.update({
  412. 't': auth_result['data']['t'],
  413. # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as
  414. 'cid': 'afbe8fd3d73448c9',
  415. 'vid': video_id,
  416. 'QY00001': auth_result['data']['u'],
  417. })
  418. api_video_url += '?' if '?' not in api_video_url else '&'
  419. api_video_url += compat_urllib_parse.urlencode(param)
  420. js = self._download_json(
  421. api_video_url, video_id,
  422. note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
  423. video_url = js['l']
  424. video_urls.append(
  425. (video_url, filesize))
  426. video_urls_dict[format_id] = video_urls
  427. return video_urls_dict
  428. def get_format(self, bid):
  429. matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)]
  430. return matched_format_ids[0] if len(matched_format_ids) else None
  431. def get_bid(self, format_id):
  432. matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id]
  433. return matched_bids[0] if len(matched_bids) else None
  434. def get_raw_data(self, tvid, video_id, enc_key, _uuid):
  435. tm = str(int(time.time()))
  436. tail = tm + tvid
  437. param = {
  438. 'key': 'fvip',
  439. 'src': md5_text('youtube-dl'),
  440. 'tvId': tvid,
  441. 'vid': video_id,
  442. 'vinfo': 1,
  443. 'tm': tm,
  444. 'enc': md5_text(enc_key + tail),
  445. 'qyid': _uuid,
  446. 'tn': random.random(),
  447. 'um': 0,
  448. 'authkey': md5_text(md5_text('') + tail),
  449. 'k_tag': 1,
  450. }
  451. api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
  452. compat_urllib_parse.urlencode(param)
  453. raw_data = self._download_json(api_url, video_id)
  454. return raw_data
  455. def get_enc_key(self, swf_url, video_id):
  456. # TODO: automatic key extraction
  457. # last update at 2016-01-22 for Zombie::bite
  458. enc_key = '6ab6d0280511493ba85594779759d4ed'
  459. return enc_key
  460. def _extract_playlist(self, webpage):
  461. PAGE_SIZE = 50
  462. links = re.findall(
  463. r'<a[^>]+class="site-piclist_pic_link"[^>]+href="(http://www\.iqiyi\.com/.+\.html)"',
  464. webpage)
  465. if not links:
  466. return
  467. album_id = self._search_regex(
  468. r'albumId\s*:\s*(\d+),', webpage, 'album ID')
  469. album_title = self._search_regex(
  470. r'data-share-title="([^"]+)"', webpage, 'album title', fatal=False)
  471. entries = list(map(self.url_result, links))
  472. # Start from 2 because links in the first page are already on webpage
  473. for page_num in itertools.count(2):
  474. pagelist_page = self._download_webpage(
  475. 'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE),
  476. album_id,
  477. note='Download playlist page %d' % page_num,
  478. errnote='Failed to download playlist page %d' % page_num)
  479. pagelist = self._parse_json(
  480. remove_start(pagelist_page, 'var tvInfoJs='), album_id)
  481. vlist = pagelist['data']['vlist']
  482. for item in vlist:
  483. entries.append(self.url_result(item['vurl']))
  484. if len(vlist) < PAGE_SIZE:
  485. break
  486. return self.playlist_result(entries, album_id, album_title)
  487. def _real_extract(self, url):
  488. webpage = self._download_webpage(
  489. url, 'temp_id', note='download video page')
  490. # There's no simple way to determine whether an URL is a playlist or not
  491. # So detect it
  492. playlist_result = self._extract_playlist(webpage)
  493. if playlist_result:
  494. return playlist_result
  495. tvid = self._search_regex(
  496. r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
  497. video_id = self._search_regex(
  498. r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
  499. swf_url = self._search_regex(
  500. r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
  501. _uuid = uuid.uuid4().hex
  502. enc_key = self.get_enc_key(swf_url, video_id)
  503. raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
  504. if raw_data['code'] != 'A000000':
  505. raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
  506. data = raw_data['data']
  507. title = data['vi']['vn']
  508. # generate video_urls_dict
  509. video_urls_dict = self.construct_video_urls(
  510. data, video_id, _uuid, tvid)
  511. # construct info
  512. entries = []
  513. for format_id in video_urls_dict:
  514. video_urls = video_urls_dict[format_id]
  515. for i, video_url_info in enumerate(video_urls):
  516. if len(entries) < i + 1:
  517. entries.append({'formats': []})
  518. entries[i]['formats'].append(
  519. {
  520. 'url': video_url_info[0],
  521. 'filesize': video_url_info[-1],
  522. 'format_id': format_id,
  523. 'preference': int(self.get_bid(format_id))
  524. }
  525. )
  526. for i in range(len(entries)):
  527. self._sort_formats(entries[i]['formats'])
  528. entries[i].update(
  529. {
  530. 'id': '%s_part%d' % (video_id, i + 1),
  531. 'title': title,
  532. }
  533. )
  534. if len(entries) > 1:
  535. info = {
  536. '_type': 'multi_video',
  537. 'id': video_id,
  538. 'title': title,
  539. 'entries': entries,
  540. }
  541. else:
  542. info = entries[0]
  543. info['id'] = video_id
  544. info['title'] = title
  545. return info