jsinterp.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557
  1. from __future__ import unicode_literals
  2. import json
  3. import operator
  4. import re
  5. from .utils import (
  6. ExtractorError,
  7. remove_quotes,
  8. )
  9. from .compat import (
  10. compat_collections_abc
  11. )
# Mapping ABC taken from the compat layer; LocalNameSpace subclasses it.
MutableMapping = compat_collections_abc.MutableMapping
  13. class Nonlocal:
  14. pass
  15. _OPERATORS = [
  16. ('|', operator.or_),
  17. ('^', operator.xor),
  18. ('&', operator.and_),
  19. ('>>', operator.rshift),
  20. ('<<', operator.lshift),
  21. ('-', operator.sub),
  22. ('+', operator.add),
  23. ('%', operator.mod),
  24. ('/', operator.truediv),
  25. ('*', operator.mul),
  26. ]
  27. _ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS]
  28. _ASSIGN_OPERATORS.append(('=', (lambda cur, right: right)))
  29. _NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*'
  30. class JS_Break(ExtractorError):
  31. def __init__(self):
  32. ExtractorError.__init__(self, 'Invalid break')
  33. class JS_Continue(ExtractorError):
  34. def __init__(self):
  35. ExtractorError.__init__(self, 'Invalid continue')
  36. class LocalNameSpace(MutableMapping):
  37. def __init__(self, *stack):
  38. self.stack = tuple(stack)
  39. def __getitem__(self, key):
  40. for scope in self.stack:
  41. if key in scope:
  42. return scope[key]
  43. raise KeyError(key)
  44. def __setitem__(self, key, value):
  45. for scope in self.stack:
  46. if key in scope:
  47. scope[key] = value
  48. break
  49. else:
  50. self.stack[0][key] = value
  51. return value
  52. def __delitem__(self, key):
  53. raise NotImplementedError('Deleting is not supported')
  54. def __iter__(self):
  55. for scope in self.stack:
  56. for scope_item in iter(scope):
  57. yield scope_item
  58. def __len__(self, key):
  59. return len(iter(self))
  60. def __repr__(self):
  61. return 'LocalNameSpace%s' % (self.stack, )
  62. class JSInterpreter(object):
  63. def __init__(self, code, objects=None):
  64. if objects is None:
  65. objects = {}
  66. self.code = code
  67. self._functions = {}
  68. self._objects = objects
  69. self.__named_object_counter = 0
  70. def _named_object(self, namespace, obj):
  71. self.__named_object_counter += 1
  72. name = '__youtube_dl_jsinterp_obj%s' % (self.__named_object_counter, )
  73. namespace[name] = obj
  74. return name
  75. @staticmethod
  76. def _separate(expr, delim=',', max_split=None):
  77. if not expr:
  78. return
  79. parens = {'(': 0, '{': 0, '[': 0, ']': 0, '}': 0, ')': 0}
  80. start, splits, pos, max_pos = 0, 0, 0, len(delim) - 1
  81. for idx, char in enumerate(expr):
  82. if char in parens:
  83. parens[char] += 1
  84. is_in_parens = (parens['['] - parens[']']
  85. or parens['('] - parens[')']
  86. or parens['{'] - parens['}'])
  87. if char == delim[pos] and not is_in_parens:
  88. if pos == max_pos:
  89. pos = 0
  90. yield expr[start: idx - max_pos]
  91. start = idx + 1
  92. splits += 1
  93. if max_split and splits >= max_split:
  94. break
  95. else:
  96. pos += 1
  97. else:
  98. pos = 0
  99. yield expr[start:]
  100. @staticmethod
  101. def _separate_at_paren(expr, delim):
  102. separated = list(JSInterpreter._separate(expr, delim, 1))
  103. if len(separated) < 2:
  104. raise ExtractorError('No terminating paren {0} in {1}'.format(delim, expr))
  105. return separated[0][1:].strip(), separated[1].strip()
  106. def interpret_statement(self, stmt, local_vars, allow_recursion=100):
  107. if allow_recursion < 0:
  108. raise ExtractorError('Recursion limit reached')
  109. sub_statements = list(self._separate(stmt, ';'))
  110. stmt = (sub_statements or ['']).pop()
  111. for sub_stmt in sub_statements:
  112. ret, should_abort = self.interpret_statement(sub_stmt, local_vars, allow_recursion - 1)
  113. if should_abort:
  114. return ret
  115. should_abort = False
  116. stmt = stmt.lstrip()
  117. stmt_m = re.match(r'var\s', stmt)
  118. if stmt_m:
  119. expr = stmt[len(stmt_m.group(0)):]
  120. else:
  121. return_m = re.match(r'return(?:\s+|$)', stmt)
  122. if return_m:
  123. expr = stmt[len(return_m.group(0)):]
  124. should_abort = True
  125. else:
  126. # Try interpreting it as an expression
  127. expr = stmt
  128. v = self.interpret_expression(expr, local_vars, allow_recursion)
  129. return v, should_abort
    def interpret_expression(self, expr, local_vars, allow_recursion):
        """Evaluate a JS expression or supported control-flow construct.

        The forms are tried in this order: empty text, ``{...}`` block,
        ``(...)`` group, ``[...]`` array literal, ``try``, ``catch`` / ``for``
        / ``switch``, comma-separated expressions, ``++`` / ``--``,
        assignments, integer literal, ``break`` / ``continue``, variable
        lookup, JSON literal, indexing, binary operators, member access or
        method call, and finally a plain function call.

        @param expr             expression text
        @param local_vars       namespace the expression reads and updates
        @param allow_recursion  remaining recursion budget, handed on to
                                interpret_statement
        @returns the value of the expression (None for an empty expression)
        Raises ExtractorError for unsupported input, and JS_Break /
        JS_Continue for a bare ``break`` / ``continue``.
        """
        expr = expr.strip()
        if expr == '':  # Empty expression
            return None

        # Braced block: run its contents as statements.  If nothing follows
        # the block, or it executed a return, its value is the result;
        # otherwise the value is put back as JSON in front of the rest.
        if expr.startswith('{'):
            inner, outer = self._separate_at_paren(expr, '}')
            inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion - 1)
            if not outer or should_abort:
                return inner
            else:
                expr = json.dumps(inner) + outer

        # Parenthesised group: evaluate it and substitute the JSON-encoded
        # value when more text follows.
        if expr.startswith('('):
            inner, outer = self._separate_at_paren(expr, ')')
            inner = self.interpret_expression(inner, local_vars, allow_recursion)
            if not outer:
                return inner
            else:
                expr = json.dumps(inner) + outer

        # Array literal: evaluate the items, store the list under a
        # generated name and continue with that name in place of the literal.
        if expr.startswith('['):
            inner, outer = self._separate_at_paren(expr, ']')
            name = self._named_object(local_vars, [
                self.interpret_expression(item, local_vars, allow_recursion)
                for item in self._separate(inner)])
            expr = name + outer

        # NOTE(review): 'try\s*' has no word boundary, so it also matches
        # identifiers that merely start with "try", and expr[m.end()] raises
        # IndexError when nothing follows the keyword -- confirm intended.
        m = re.match(r'try\s*', expr)
        if m:
            if expr[m.end()] == '{':
                try_expr, expr = self._separate_at_paren(expr[m.end():], '}')
            else:
                try_expr, expr = expr[m.end() - 1:], ''
            ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion - 1)
            if should_abort:
                return ret
            # Whatever follows the try body is interpreted next
            return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]

        m = re.match(r'(?:(?P<catch>catch)|(?P<for>for)|(?P<switch>switch))\s*\(', expr)
        md = m.groupdict() if m else {}
        if md.get('catch'):
            # We ignore the catch block
            _, expr = self._separate_at_paren(expr, '}')
            return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]

        elif md.get('for'):
            def raise_constructor_error(c):
                raise ExtractorError(
                    'Premature return in the initialization of a for loop in {0!r}'.format(c))

            # constructor is the text between the parentheses of for(...)
            constructor, remaining = self._separate_at_paren(expr[m.end() - 1:], ')')
            if remaining.startswith('{'):
                body, expr = self._separate_at_paren(remaining, '}')
            else:
                # Unbraced body: a switch statement is re-assembled so that
                # the whole switch is the loop body; anything else is taken
                # as the body in full.
                m = re.match(r'switch\s*\(', remaining)  # FIXME
                if m:
                    switch_val, remaining = self._separate_at_paren(remaining[m.end() - 1:], ')')
                    body, expr = self._separate_at_paren(remaining, '}')
                    body = 'switch(%s){%s}' % (switch_val, body)
                else:
                    body, expr = remaining, ''
            # Unpacking requires exactly three ';'-separated parts
            start, cndn, increment = self._separate(constructor, ';')
            if self.interpret_statement(start, local_vars, allow_recursion - 1)[1]:
                raise_constructor_error(constructor)
            while True:
                if not self.interpret_expression(cndn, local_vars, allow_recursion):
                    break
                try:
                    ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion - 1)
                    if should_abort:
                        return ret
                except JS_Break:
                    break
                except JS_Continue:
                    # Fall through to the increment step
                    pass
                if self.interpret_statement(increment, local_vars, allow_recursion - 1)[1]:
                    raise_constructor_error(constructor)
            return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]

        elif md.get('switch'):
            switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')')
            switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion)
            body, expr = self._separate_at_paren(remaining, '}')
            # Treat 'default:' like a case label so one split handles both
            items = body.replace('default:', 'case default:').split('case ')[1:]
            # Two passes: first look for a case equal to switch_val, then
            # (only if none matched) for 'default'.  Once an item matched,
            # the following items run too (fall-through) until a break.
            for default in (False, True):
                matched = False
                for item in items:
                    case, stmt = [i.strip() for i in self._separate(item, ':', 1)]
                    if default:
                        matched = matched or case == 'default'
                    elif not matched:
                        matched = (case != 'default'
                                   and switch_val == self.interpret_expression(case, local_vars, allow_recursion))
                    if not matched:
                        continue
                    try:
                        ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion - 1)
                        if should_abort:
                            return ret
                    except JS_Break:
                        break
                if matched:
                    break
            return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]

        # Comma separated statements: all but the last are evaluated only
        # for their side effects; the last one provides the value.
        sub_expressions = list(self._separate(expr))
        expr = sub_expressions.pop().strip() if sub_expressions else ''
        for sub_expr in sub_expressions:
            self.interpret_expression(sub_expr, local_vars, allow_recursion)

        # ++/--: update the variable and substitute the JSON-encoded value
        # (new value for the prefix form, old value for the postfix form).
        # NOTE(review): the match offsets come from the string as it was
        # before the first substitution; with several matches in one
        # expression the later offsets may no longer line up -- confirm.
        for m in re.finditer(r'''(?x)
                (?P<pre_sign>\+\+|--)(?P<var1>%(_NAME_RE)s)|
                (?P<var2>%(_NAME_RE)s)(?P<post_sign>\+\+|--)''' % globals(), expr):
            var = m.group('var1') or m.group('var2')
            start, end = m.span()
            sign = m.group('pre_sign') or m.group('post_sign')
            ret = local_vars[var]
            local_vars[var] += 1 if sign[0] == '+' else -1
            if m.group('pre_sign'):
                ret = local_vars[var]
            expr = expr[:start] + json.dumps(ret) + expr[end:]

        # Assignments: 'name <op>= expr' or 'name[index] <op>= expr'
        for op, opfunc in _ASSIGN_OPERATORS:
            m = re.match(r'''(?x)
                (?P<out>%s)(?:\[(?P<index>[^\]]+?)\])?
                \s*%s
                (?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr)
            if not m:
                continue
            right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion)

            if m.groupdict().get('index'):
                # Element assignment on an existing container
                lvar = local_vars[m.group('out')]
                idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion)
                if not isinstance(idx, int):
                    raise ExtractorError('List indices must be integers: %s' % (idx, ))
                cur = lvar[idx]
                val = opfunc(cur, right_val)
                lvar[idx] = val
                return val
            else:
                # A name that is not defined yet reads as None
                cur = local_vars.get(m.group('out'))
                val = opfunc(cur, right_val)
                local_vars[m.group('out')] = val
                return val

        if expr.isdigit():
            return int(expr)

        if expr == 'break':
            raise JS_Break()
        elif expr == 'continue':
            raise JS_Continue()

        # Plain variable reference; the listed keyword prefixes are excluded
        var_m = re.match(
            r'(?!if|return|true|false|null)(?P<name>%s)$' % _NAME_RE,
            expr)
        if var_m:
            return local_vars[var_m.group('name')]

        # Anything that parses as JSON is taken as a literal
        try:
            return json.loads(expr)
        except ValueError:
            pass

        # Indexing: name[expr]
        m = re.match(
            r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
        if m:
            val = local_vars[m.group('in')]
            idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion)
            return val[idx]

        def raise_expr_error(where, op, exp):
            raise ExtractorError('Premature {0} return of {1} in {2!r}'.format(where, op, exp))

        # Binary operators: split at the last occurrence of the first
        # operator (in _OPERATORS order) found outside brackets.
        for op, opfunc in _OPERATORS:
            separated = list(self._separate(expr, op))
            if len(separated) < 2:
                continue
            right_val = separated.pop()
            left_val = op.join(separated)
            left_val, should_abort = self.interpret_statement(
                left_val, local_vars, allow_recursion - 1)
            if should_abort:
                raise_expr_error('left-side', op, expr)
            right_val, should_abort = self.interpret_statement(
                right_val, local_vars, allow_recursion - 1)
            if should_abort:
                raise_expr_error('right-side', op, expr)
            # A falsy left operand (e.g. None from an empty left side) is
            # treated as 0
            return opfunc(left_val or 0, right_val)

        # Member access or method call: name.member / name[member],
        # optionally followed by an argument list.
        m = re.match(
            r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*' % _NAME_RE,
            expr)
        if m:
            variable = m.group('var')
            nl = Nonlocal()

            nl.member = remove_quotes(m.group('member') or m.group('member2'))
            arg_str = expr[m.end():]
            if arg_str.startswith('('):
                arg_str, remaining = self._separate_at_paren(arg_str, ')')
            else:
                # No argument list: arg_str None marks plain member access
                arg_str, remaining = None, arg_str

            def assertion(cndn, msg):
                """ assert, but without risk of getting optimized out """
                if not cndn:
                    raise ExtractorError('{0} {1}: {2}'.format(nl.member, msg, expr))

            def eval_method():
                # nonlocal member
                member = nl.member
                # Resolve the receiver: the String builtin, a local
                # variable, or an object extracted from the JS code (cached)
                if variable == 'String':
                    obj = str
                elif variable in local_vars:
                    obj = local_vars[variable]
                else:
                    if variable not in self._objects:
                        self._objects[variable] = self.extract_object(variable)
                    obj = self._objects[variable]

                if arg_str is None:
                    # Member access
                    if member == 'length':
                        return len(obj)
                    return obj[member]

                # Function call
                argvals = [
                    self.interpret_expression(v, local_vars, allow_recursion)
                    for v in self._separate(arg_str)]

                if obj == str:
                    if member == 'fromCharCode':
                        assertion(argvals, 'takes one or more arguments')
                        return ''.join(map(chr, argvals))
                    raise ExtractorError('Unsupported string method %s' % (member, ))

                if member == 'split':
                    assertion(argvals, 'takes one or more arguments')
                    assertion(argvals == [''], 'with arguments is not implemented')
                    return list(obj)
                elif member == 'join':
                    assertion(isinstance(obj, list), 'must be applied on a list')
                    assertion(len(argvals) == 1, 'takes exactly one argument')
                    return argvals[0].join(obj)
                elif member == 'reverse':
                    assertion(not argvals, 'does not take any arguments')
                    obj.reverse()
                    return obj
                elif member == 'slice':
                    assertion(isinstance(obj, list), 'must be applied on a list')
                    assertion(len(argvals) == 1, 'takes exactly one argument')
                    return obj[argvals[0]:]
                elif member == 'splice':
                    assertion(isinstance(obj, list), 'must be applied on a list')
                    assertion(argvals, 'takes one or more arguments')
                    # howMany defaults to len(obj) when only an index is given
                    index, howMany = map(int, (argvals + [len(obj)])[:2])
                    if index < 0:
                        index += len(obj)
                    add_items = argvals[2:]
                    res = []
                    # Remove the items (always popping at index), then
                    # insert the replacements at the same position
                    for i in range(index, min(index + howMany, len(obj))):
                        res.append(obj.pop(index))
                    for i, item in enumerate(add_items):
                        obj.insert(index + i, item)
                    return res
                elif member == 'unshift':
                    assertion(isinstance(obj, list), 'must be applied on a list')
                    assertion(argvals, 'takes one or more arguments')
                    for item in reversed(argvals):
                        obj.insert(0, item)
                    return obj
                elif member == 'pop':
                    assertion(isinstance(obj, list), 'must be applied on a list')
                    assertion(not argvals, 'does not take any arguments')
                    if not obj:
                        return
                    return obj.pop()
                elif member == 'push':
                    assertion(argvals, 'takes one or more arguments')
                    obj.extend(argvals)
                    return obj
                elif member == 'forEach':
                    assertion(argvals, 'takes one or more arguments')
                    assertion(len(argvals) <= 2, 'takes at-most 2 arguments')
                    # Optional second argument is passed on as this=
                    f, this = (argvals + [''])[:2]
                    return [f((item, idx, obj), this=this) for idx, item in enumerate(obj)]
                elif member == 'indexOf':
                    assertion(argvals, 'takes one or more arguments')
                    assertion(len(argvals) <= 2, 'takes at-most 2 arguments')
                    idx, start = (argvals + [0])[:2]
                    try:
                        return obj.index(idx, start)
                    except ValueError:
                        return -1

                # Not a built-in method: call the member itself; list
                # members are addressed by integer index
                if isinstance(obj, list):
                    member = int(member)
                    nl.member = member
                return obj[member](argvals)

            if remaining:
                # More text follows (e.g. a chained call): bind the result
                # to a generated name and keep interpreting
                return self.interpret_expression(
                    self._named_object(local_vars, eval_method()) + remaining,
                    local_vars, allow_recursion)
            else:
                return eval_method()

        # Plain function call whose arguments are integers or variable names
        m = re.match(r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr)
        if m:
            fname = m.group('func')
            argvals = tuple([
                int(v) if v.isdigit() else local_vars[v]
                for v in self._separate(m.group('args'))])
            if fname in local_vars:
                return local_vars[fname](argvals)
            elif fname not in self._functions:
                # Extract the function from the JS code once and cache it
                self._functions[fname] = self.extract_function(fname)
            return self._functions[fname](argvals)

        if expr:
            raise ExtractorError('Unsupported JS expression %r' % expr)
  425. def extract_object(self, objname):
  426. _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')'''
  427. obj = {}
  428. obj_m = re.search(
  429. r'''(?x)
  430. (?<!this\.)%s\s*=\s*{\s*
  431. (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*)
  432. }\s*;
  433. ''' % (re.escape(objname), _FUNC_NAME_RE),
  434. self.code)
  435. fields = obj_m.group('fields')
  436. # Currently, it only supports function definitions
  437. fields_m = re.finditer(
  438. r'''(?x)
  439. (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}
  440. ''' % _FUNC_NAME_RE,
  441. fields)
  442. for f in fields_m:
  443. argnames = f.group('args').split(',')
  444. obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code'))
  445. return obj
  446. def extract_function_code(self, funcname):
  447. """ @returns argnames, code """
  448. func_m = re.search(
  449. r'''(?x)
  450. (?:function\s+%(f_n)s|[{;,]\s*%(f_n)s\s*=\s*function|var\s+%(f_n)s\s*=\s*function)\s*
  451. \((?P<args>[^)]*)\)\s*
  452. (?P<code>\{(?:(?!};)[^"]|"([^"]|\\")*")+\})''' % {'f_n': re.escape(funcname), },
  453. self.code)
  454. code, _ = self._separate_at_paren(func_m.group('code'), '}') # refine the match
  455. if func_m is None:
  456. raise ExtractorError('Could not find JS function %r' % funcname)
  457. return func_m.group('args').split(','), code
  458. def extract_function(self, funcname):
  459. return self.extract_function_from_code(*self.extract_function_code(funcname))
  460. def extract_function_from_code(self, argnames, code, *global_stack):
  461. local_vars = {}
  462. while True:
  463. mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code)
  464. if mobj is None:
  465. break
  466. start, body_start = mobj.span()
  467. body, remaining = self._separate_at_paren(code[body_start - 1:], '}')
  468. name = self._named_object(
  469. local_vars,
  470. self.extract_function_from_code(
  471. [str.strip(x) for x in mobj.group('args').split(',')],
  472. body, local_vars, *global_stack))
  473. code = code[:start] + name + remaining
  474. return self.build_function(argnames, code, local_vars, *global_stack)
  475. def call_function(self, funcname, *args):
  476. return self.extract_function(funcname)(args)
  477. def build_function(self, argnames, code, *global_stack):
  478. global_stack = list(global_stack) or [{}]
  479. local_vars = global_stack.pop(0)
  480. def resf(args, **kwargs):
  481. local_vars.update(dict(zip(argnames, args)))
  482. local_vars.update(kwargs)
  483. var_stack = LocalNameSpace(local_vars, *global_stack)
  484. for stmt in self._separate(code.replace('\n', ''), ';'):
  485. ret, should_abort = self.interpret_statement(stmt, var_stack)
  486. if should_abort:
  487. break
  488. return ret
  489. return resf