浏览代码

[jsinterp] Fix and improve split/join
* improve split/join edge cases
* correctly implement regex split (not like re.split)

dirkf 8 月之前
父节点
当前提交
94fd774608
共有 2 个文件被更改,包括 62 次插入和 5 次删除
  1. 19 0
      test/test_jsinterp.py
  2. 43 5
      youtube_dl/jsinterp.py

+ 19 - 0
test/test_jsinterp.py

@@ -483,6 +483,13 @@ class TestJSInterpreter(unittest.TestCase):
             self._test(jsi, 't-e-s-t', args=[test_input, '-'])
             self._test(jsi, '', args=[[], '-'])
 
+        self._test('function f(){return '
+                   '[1, 1.0, "abc", {a: 1}, null, undefined, Infinity, NaN].join()}',
+                   '1,1,abc,[object Object],,,Infinity,NaN')
+        self._test('function f(){return '
+                   '[1, 1.0, "abc", {a: 1}, null, undefined, Infinity, NaN].join("~")}',
+                   '1~1~abc~[object Object]~~~Infinity~NaN')
+
     def test_split(self):
         test_result = list('test')
         tests = [
@@ -496,6 +503,18 @@ class TestJSInterpreter(unittest.TestCase):
             self._test(jsi, test_result, args=['t-e-s-t', '-'])
             self._test(jsi, [''], args=['', '-'])
             self._test(jsi, [], args=['', ''])
+        # RegExp split
+        self._test('function f(){return "test".split(/(?:)/)}',
+                   ['t', 'e', 's', 't'])
+        self._test('function f(){return "t-e-s-t".split(/[es-]+/)}',
+                   ['t', 't'])
+        # from MDN: surrogate pairs aren't handled: case 1 fails
+        # self._test('function f(){return "😄😄".split(/(?:)/)}',
+        #            ['\ud83d', '\ude04', '\ud83d', '\ude04'])
+        # case 2 beats Py3.2: it gets the case 1 result
+        if sys.version_info >= (2, 6) and not ((3, 0) <= sys.version_info < (3, 3)):
+            self._test('function f(){return "😄😄".split(/(?:)/u)}',
+                       ['😄', '😄'])
 
     def test_slice(self):
         self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice()}', [0, 1, 2, 3, 4, 5, 6, 7, 8])

+ 43 - 5
youtube_dl/jsinterp.py

@@ -397,6 +397,9 @@ class JSInterpreter(object):
         RE_FLAGS = {
             # special knowledge: Python's re flags are bitmask values, current max 128
             # invent new bitmask values well above that for literal parsing
+            # JS 'u' flag is effectively always set (surrogate pairs aren't seen,
+            # but \u{...} and \p{...} escapes aren't handled); no additional JS 'v'
+            # features are supported
             # TODO: execute matches with these flags (remaining: d, y)
             'd': 1024,  # Generate indices for substring matches
             'g': 2048,  # Global search
@@ -404,6 +407,7 @@ class JSInterpreter(object):
             'm': re.M,  # Multi-line search
             's': re.S,  # Allows . to match newline characters
             'u': re.U,  # Treat a pattern as a sequence of unicode code points
+            'v': re.U,  # Like 'u' with extended character class and \p{} syntax
             'y': 4096,  # Perform a "sticky" search that matches starting at the current position in the target string
         }
 
@@ -1047,13 +1051,47 @@ class JSInterpreter(object):
                     raise self.Exception('Unsupported Math method ' + member, expr=expr)
 
                 if member == 'split':
-                    assertion(argvals, 'takes one or more arguments')
-                    assertion(len(argvals) == 1, 'with limit argument is not implemented')
-                    return obj.split(argvals[0]) if argvals[0] else list(obj)
+                    assertion(len(argvals) <= 2, 'takes at most two arguments')
+                    if len(argvals) > 1:
+                        limit = argvals[1]
+                        assertion(isinstance(limit, int) and limit >= 0, 'integer limit >= 0')
+                        if limit == 0:
+                            return []
+                    else:
+                        limit = 0
+                    if len(argvals) == 0:
+                        argvals = [JS_Undefined]
+                    elif isinstance(argvals[0], self.JS_RegExp):
+                        # avoid re.split(): it is similar to JS regex split, but not similar enough
+
+                        def where():
+                            for m in argvals[0].finditer(obj):
+                                yield m.span(0)
+                            yield (None, None)
+
+                        def splits(limit=limit):
+                            i = 0
+                            for j, jj in where():
+                                if j == jj == 0:
+                                    continue
+                                if j is None and i >= len(obj):
+                                    break
+                                yield obj[i:j]
+                                if jj is None or limit == 1:
+                                    break
+                                limit -= 1
+                                i = jj
+
+                        return list(splits())
+                    return (
+                        obj.split(argvals[0], limit - 1) if argvals[0] and argvals[0] != JS_Undefined
+                        else list(obj)[:limit or None])
                 elif member == 'join':
                     assertion(isinstance(obj, list), 'must be applied on a list')
-                    assertion(len(argvals) == 1, 'takes exactly one argument')
-                    return argvals[0].join(obj)
+                    assertion(len(argvals) <= 1, 'takes at most one argument')
+                    return (',' if len(argvals) == 0 else argvals[0]).join(
+                        ('' if x in (None, JS_Undefined) else _js_toString(x))
+                        for x in obj)
                 elif member == 'reverse':
                     assertion(not argvals, 'does not take any arguments')
                     obj.reverse()