瀏覽代碼

[JSInterp] Add tests and relevant functionality from yt-dlp
* thx seproDev, bashonly: yt-dlp/yt-dlp#12760, yt-dlp/yt-dlp#12761:
- Improve nested attribute support
- Pass global stack when extracting objects
- interpret_statement: Match attribute before indexing
- Fix assignment to array elements with nested brackets
- Add new signature tests
- Invalidate JS function cache
- Avoid testdata dupes now that we cache by URL

* rework nsig function name search
* fully fixes #33102
* update cache required versions
* update program version

dirkf 4 月之前
父節點
當前提交
c052a16f72
共有 5 個文件被更改,包括 129 次插入和 22 次删除
  1. 16 0
      test/test_jsinterp.py
  2. 62 2
      test/test_youtube_signature.py
  3. 28 5
      youtube_dl/extractor/youtube.py
  4. 22 14
      youtube_dl/jsinterp.py
  5. 1 1
      youtube_dl/version.py

+ 16 - 0
test/test_jsinterp.py

@@ -180,6 +180,7 @@ class TestJSInterpreter(unittest.TestCase):
         self._test('function f(){var x = 20; x = 30 + 1; return x;}', 31)
         self._test('function f(){var x = 20; x += 30 + 1; return x;}', 51)
         self._test('function f(){var x = 20; x -= 30 + 1; return x;}', -11)
+        self._test('function f(){var x = 2; var y = ["a", "b"]; y[x%y["length"]]="z"; return y}', ['z', 'b'])
 
     def test_comments(self):
         self._test('''
@@ -552,6 +553,8 @@ class TestJSInterpreter(unittest.TestCase):
         test_result = list('test')
         tests = [
             'function f(a, b){return a.split(b)}',
+            'function f(a, b){return a["split"](b)}',
+            'function f(a, b){let x = ["split"]; return a[x[0]](b)}',
             'function f(a, b){return String.prototype.split.call(a, b)}',
             'function f(a, b){return String.prototype.split.apply(a, [b])}',
         ]
@@ -602,6 +605,9 @@ class TestJSInterpreter(unittest.TestCase):
         self._test('function f(){return "012345678".slice(-1, 1)}', '')
         self._test('function f(){return "012345678".slice(-3, -1)}', '67')
 
+    def test_splice(self):
+        self._test('function f(){var T = ["0", "1", "2"]; T["splice"](2, 1, "0")[0]; return T }', ['0', '1', '0'])
+
     def test_pop(self):
         # pop
         self._test('function f(){var a = [0, 1, 2, 3, 4, 5, 6, 7, 8]; return [a.pop(), a]}',
@@ -636,6 +642,16 @@ class TestJSInterpreter(unittest.TestCase):
                    'return [ret.length, ret[0][0], ret[1][1], ret[0][2]]}',
                    [2, 4, 1, [4, 2]])
 
+    def test_extract_function(self):
+        jsi = JSInterpreter('function a(b) { return b + 1; }')
+        func = jsi.extract_function('a')
+        self.assertEqual(func([2]), 3)
+
+    def test_extract_function_with_global_stack(self):
+        jsi = JSInterpreter('function c(d) { return d + e + f + g; }')
+        func = jsi.extract_function('c', {'e': 10}, {'f': 100, 'g': 1000})
+        self.assertEqual(func([1]), 1111)
+
 
 if __name__ == '__main__':
     unittest.main()

+ 62 - 2
test/test_youtube_signature.py

@@ -94,11 +94,51 @@ _SIG_TESTS = [
         '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
         '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
     ),
+    (
+        'https://www.youtube.com/s/player/363db69b/player_ias_tce.vflset/en_US/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+    ),
     (
         'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js',
         '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
         'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0',
     ),
+    (
+        'https://www.youtube.com/s/player/4fcd6e4a/player_ias_tce.vflset/en_US/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0',
+    ),
+    (
+        'https://www.youtube.com/s/player/20830619/player_ias.vflset/en_US/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw',
+    ),
+    (
+        'https://www.youtube.com/s/player/20830619/player_ias_tce.vflset/en_US/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw',
+    ),
+    (
+        'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw',
+    ),
+    (
+        'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw',
+    ),
+    (
+        'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0',
+    ),
+    (
+        'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js',
+        '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA',
+        'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0',
+    ),
 ]
 
 _NSIG_TESTS = [
@@ -272,7 +312,7 @@ _NSIG_TESTS = [
     ),
     (
         'https://www.youtube.com/s/player/643afba4/player_ias.vflset/en_US/base.js',
-        'W9HJZKktxuYoDTqW', 'larxUlagTRAcSw',
+        'ir9-V6cdbCiyKxhr', '2PL7ZDYAALMfmA',
     ),
     (
         'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js',
@@ -286,6 +326,26 @@ _NSIG_TESTS = [
         'https://www.youtube.com/s/player/4fcd6e4a/tv-player-ias.vflset/tv-player-ias.js',
         'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A',
     ),
+    (
+        'https://www.youtube.com/s/player/20830619/tv-player-ias.vflset/tv-player-ias.js',
+        'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4',
+    ),
+    (
+        'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js',
+        'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4',
+    ),
+    (
+        'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js',
+        'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4',
+    ),
+    (
+        'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js',
+        'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE',
+    ),
+    (
+        'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js',
+        'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE',
+    ),
 ]
 
 
@@ -335,7 +395,7 @@ def t_factory(name, sig_func, url_pattern):
         test_id = re.sub(r'[/.-]', '_', m.group('id') or m.group('compat_id'))
 
         def test_func(self):
-            basename = 'player-{0}-{1}.js'.format(name, test_id)
+            basename = 'player-{0}.js'.format(test_id)
             fn = os.path.join(self.TESTDATA_DIR, basename)
 
             if not os.path.exists(fn):

+ 28 - 5
youtube_dl/extractor/youtube.py

@@ -1652,7 +1652,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         assert os.path.basename(func_id) == func_id
 
         self.write_debug('Extracting signature function {0}'.format(func_id))
-        cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None
+        cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.04.07'), None
 
         if not cache_spec:
             code = self._load_player(video_id, player_url, player_id)
@@ -1813,6 +1813,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         return ret
 
     def _extract_n_function_name(self, jscode):
+        func_name, idx = None, None
+        # these special cases are redundant and probably obsolete (2025-04):
+        # they make the tests run ~10% faster without fallback warnings
+        r"""
         func_name, idx = self._search_regex(
             # (y=NuD(),Mw(k),q=k.Z[y]||null)&&(q=narray[idx](q),k.set(y,q),k.V||NuD(''))}};
             # (R="nn"[+J.Z],mW(J),N=J.K[R]||null)&&(N=narray[idx](N),J.set(R,N))}};
@@ -1839,9 +1843,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     \(\s*[\w$]+\s*\)
             ''', jscode, 'Initial JS player n function name', group=('nfunc', 'idx'),
             default=(None, None))
+        """
+
+        if not func_name:
+            # nfunc=function(x){...}|function nfunc(x); ...
+            # ... var y=[nfunc]|y[idx]=nfunc);
+            # obvious REs hang, so use a two-stage tactic
+            for m in re.finditer(r'''(?x)
+                    [\n;]var\s(?:(?:(?!,).)+,|\s)*?(?!\d)[\w$]+(?:\[(?P<idx>\d+)\])?\s*=\s*
+                        (?(idx)|\[\s*)(?P<nfunc>(?!\d)[\w$]+)(?(idx)|\s*\])
+                    \s*?[;\n]
+                    ''', jscode):
+                func_name = self._search_regex(
+                    r'[;,]\s*(function\s+)?({0})(?(1)|\s*=\s*function)\s*\((?!\d)[\w$]+\)\s*\{1}(?!\s*return\s)'.format(
+                        re.escape(m.group('nfunc')), '{'),
+                    jscode, 'Initial JS player n function name (2)', group=2, default=None)
+                if func_name:
+                    idx = m.group('idx')
+                    break
+
         # thx bashonly: yt-dlp/yt-dlp/pull/10611
         if not func_name:
-            self.report_warning('Falling back to generic n function search')
+            self.report_warning('Falling back to generic n function search', only_once=True)
             return self._search_regex(
                 r'''(?xs)
                     (?:(?<=[^\w$])|^)       # instead of \b, which ignores $
@@ -1855,14 +1878,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             return func_name
 
         return self._search_json(
-            r'var\s+{0}\s*='.format(re.escape(func_name)), jscode,
+            r'(?<![\w-])var\s(?:(?:(?!,).)+,|\s)*?{0}\s*='.format(re.escape(func_name)), jscode,
             'Initial JS player n function list ({0}.{1})'.format(func_name, idx),
-            func_name, contains_pattern=r'\[[\s\S]+\]', end_pattern='[,;]',
+            func_name, contains_pattern=r'\[.+\]', end_pattern='[,;]',
             transform_source=js_to_json)[int(idx)]
 
     def _extract_n_function_code(self, video_id, player_url):
         player_id = self._extract_player_info(player_url)
-        func_code = self.cache.load('youtube-nsig', player_id)
+        func_code = self.cache.load('youtube-nsig', player_id, min_ver='2025.04.07')
         jscode = func_code or self._load_player(video_id, player_url)
         jsi = JSInterpreter(jscode)
 

+ 22 - 14
youtube_dl/jsinterp.py

@@ -303,8 +303,6 @@ _UNARY_OPERATORS_X = (
     ('!', _js_unary_op(lambda x: _js_ternary(x, if_true=False, if_false=True))),
 )
 
-_OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS))
-
 _COMP_OPERATORS = (
     ('===', _js_id_op(operator.is_)),
     ('!==', _js_id_op(operator.is_not)),
@@ -316,9 +314,12 @@ _COMP_OPERATORS = (
     ('>', _js_comp_op(operator.gt)),
 )
 
+_OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS + _SC_OPERATORS))
+
 _NAME_RE = r'[a-zA-Z_$][\w$]*'
 _MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]')))
 _QUOTES = '\'"/'
+_NESTED_BRACKETS = r'[^[\]]+(?:\[[^[\]]+(?:\[[^\]]+\])?\])?'
 
 
 class JS_Break(ExtractorError):
@@ -1088,15 +1089,18 @@ class JSInterpreter(object):
 
         m = re.match(r'''(?x)
             (?P<assign>
-                (?P<out>{_NAME_RE})(?:\[(?P<out_idx>(?:.+?\]\s*\[)*.+?)\])?\s*
+                (?P<out>{_NAME_RE})(?P<out_idx>(?:\[{_NESTED_BRACKETS}\])+)?\s*
                 (?P<op>{_OPERATOR_RE})?
                 =(?!=)(?P<expr>.*)$
             )|(?P<return>
                 (?!if|return|true|false|null|undefined|NaN|Infinity)(?P<name>{_NAME_RE})$
-            )|(?P<indexing>
-                (?P<in>{_NAME_RE})\[(?P<in_idx>(?:.+?\]\s*\[)*.+?)\]$
             )|(?P<attribute>
-                (?P<var>{_NAME_RE})(?:(?P<nullish>\?)?\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s*
+                (?P<var>{_NAME_RE})(?:
+                    (?P<nullish>\?)?\.(?P<member>[^(]+)|
+                    \[(?P<member2>{_NESTED_BRACKETS})\]
+                )\s*
+            )|(?P<indexing>
+                (?P<in>{_NAME_RE})(?P<in_idx>\[.+\])$
             )|(?P<function>
                 (?P<fname>{_NAME_RE})\((?P<args>.*)\)$
             )'''.format(**globals()), expr)
@@ -1111,10 +1115,11 @@ class JSInterpreter(object):
             elif left_val in (None, JS_Undefined):
                 raise self.Exception('Cannot index undefined variable ' + m.group('out'), expr=expr)
 
-            indexes = re.split(r'\]\s*\[', m.group('out_idx'))
-            for i, idx in enumerate(indexes, 1):
+            indexes = md['out_idx']
+            while indexes:
+                idx, indexes = self._separate_at_paren(indexes)
                 idx = self.interpret_expression(idx, local_vars, allow_recursion)
-                if i < len(indexes):
+                if indexes:
                     left_val = self._index(left_val, idx)
             if isinstance(idx, float):
                 idx = int(idx)
@@ -1159,7 +1164,9 @@ class JSInterpreter(object):
 
         if md.get('indexing'):
             val = local_vars[m.group('in')]
-            for idx in re.split(r'\]\s*\[', m.group('in_idx')):
+            indexes = m.group('in_idx')
+            while indexes:
+                idx, indexes = self._separate_at_paren(indexes)
                 idx = self.interpret_expression(idx, local_vars, allow_recursion)
                 val = self._index(val, idx)
             return val, should_return
@@ -1204,7 +1211,7 @@ class JSInterpreter(object):
                 if obj is JS_Undefined:
                     try:
                         if variable not in self._objects:
-                            self._objects[variable] = self.extract_object(variable)
+                            self._objects[variable] = self.extract_object(variable, local_vars)
                         obj = self._objects[variable]
                     except self.Exception:
                         if not nullish:
@@ -1215,7 +1222,7 @@ class JSInterpreter(object):
 
                 # Member access
                 if arg_str is None:
-                    return self._index(obj, member)
+                    return self._index(obj, member, nullish)
 
                 # Function call
                 argvals = [
@@ -1400,7 +1407,7 @@ class JSInterpreter(object):
         for v in self._separate(list_txt):
             yield self.interpret_expression(v, local_vars, allow_recursion)
 
-    def extract_object(self, objname):
+    def extract_object(self, objname, *global_stack):
         _FUNC_NAME_RE = r'''(?:{n}|"{n}"|'{n}')'''.format(n=_NAME_RE)
         obj = {}
         fields = next(filter(None, (
@@ -1421,7 +1428,8 @@ class JSInterpreter(object):
                 fields):
             argnames = self.build_arglist(f.group('args'))
             name = remove_quotes(f.group('key'))
-            obj[name] = function_with_repr(self.build_function(argnames, f.group('code')), 'F<{0}>'.format(name))
+            obj[name] = function_with_repr(
+                self.build_function(argnames, f.group('code'), *global_stack), 'F<{0}>'.format(name))
 
         return obj
 

+ 1 - 1
youtube_dl/version.py

@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2021.12.17'
+__version__ = '2025.04.07'