浏览代码

implement text_to_json / binary_to_json, see #6151

binary bytes:
- json_key = <key>_b64
- json_value = base64(value)

text (potentially with surrogate escapes):
- json_key1 = <key>
- json_value1 = value_text (surrogate escapes replaced by ?)
- json_key2 = <key>_b64
- json_value2 = base64(value_binary)

json_key2/json_value2 are only present if value_text required
replacement of surrogate escapes (and thus does not represent
the original value, but is just an approximation).
value_binary then gives the original bytes value (e.g. a
non-utf8 bytes sequence).
Thomas Waldmann 2 年之前
父节点
当前提交
32d430a1b0
共有 3 个文件被更改,包括 80 次插入和 1 次删除
  1. 2 1
      src/borg/helpers/__init__.py
  2. 36 0
      src/borg/helpers/parseformat.py
  3. 42 0
      src/borg/testsuite/helpers.py

+ 2 - 1
src/borg/helpers/__init__.py

@@ -19,7 +19,8 @@ from .fs import HardLinkManager
 from .misc import sysinfo, log_multi, consume
 from .misc import sysinfo, log_multi, consume
 from .misc import ChunkIteratorFileWrapper, open_item, chunkit, iter_separated, ErrorIgnoringTextIOWrapper
 from .misc import ChunkIteratorFileWrapper, open_item, chunkit, iter_separated, ErrorIgnoringTextIOWrapper
 from .parseformat import bin_to_hex, safe_encode, safe_decode
 from .parseformat import bin_to_hex, safe_encode, safe_decode
-from .parseformat import remove_surrogates, eval_escapes, decode_dict, positive_int_validator, interval
+from .parseformat import text_to_json, binary_to_json, remove_surrogates
+from .parseformat import eval_escapes, decode_dict, positive_int_validator, interval
 from .parseformat import SortBySpec, ChunkerParams, FilesCacheMode, partial_format, DatetimeWrapper
 from .parseformat import SortBySpec, ChunkerParams, FilesCacheMode, partial_format, DatetimeWrapper
 from .parseformat import format_file_size, parse_file_size, FileSize, parse_storage_quota
 from .parseformat import format_file_size, parse_file_size, FileSize, parse_storage_quota
 from .parseformat import sizeof_fmt, sizeof_fmt_iec, sizeof_fmt_decimal
 from .parseformat import sizeof_fmt, sizeof_fmt_iec, sizeof_fmt_decimal

+ 36 - 0
src/borg/helpers/parseformat.py

@@ -1,4 +1,5 @@
 import argparse
 import argparse
+import base64
 import hashlib
 import hashlib
 import json
 import json
 import os
 import os
@@ -50,6 +51,41 @@ def remove_surrogates(s, errors="replace"):
     return s.encode("utf-8", errors).decode("utf-8")
     return s.encode("utf-8", errors).decode("utf-8")
 
 
 
 
+def binary_to_json(key, value):
+    """Return {<key>_b64: <base64 text of value>}, a dict that can be fed safely into a JSON encoder."""
+    assert isinstance(key, str) and isinstance(value, bytes)
+    return {key + "_b64": str(base64.b64encode(value), "ascii")}
+
+
+def text_to_json(key, value):
+    """
+    Return a dict made from key/value that can be fed safely into a JSON encoder.
+
+    JSON can only contain pure, valid unicode (but not: unicode with surrogate escapes).
+
+    But sometimes we have to deal with such values and we do it like this:
+    - <key>: value as pure unicode text (surrogate escapes, if any, replaced by ?)
+    - <key>_b64: value as base64 encoded binary representation (only set if value has surrogate-escapes)
+    """
+    coding = "utf-8"
+    assert isinstance(key, str)
+    assert isinstance(value, str)  # str might contain surrogate escapes
+    data = {}
+    try:
+        value.encode(coding, errors="strict")  # check if pure unicode
+    except UnicodeEncodeError:
+        # value has surrogate escape sequences:
+        # the text entry is only an approximation, the b64 entry carries the original bytes.
+        data[key] = remove_surrogates(value)  # same "replace by ?" round trip, done by the module's helper
+        value_bytes = value.encode(coding, errors="surrogateescape")
+        data.update(binary_to_json(key, value_bytes))
+    else:
+        # value is pure unicode
+        data[key] = value
+        # we do not give the b64 representation, not needed
+    return data
+
+
 def eval_escapes(s):
 def eval_escapes(s):
     """Evaluate literal escape sequences in a string (eg `\\n` -> `\n`)."""
     """Evaluate literal escape sequences in a string (eg `\\n` -> `\n`)."""
     return s.encode("ascii", "backslashreplace").decode("unicode-escape")
     return s.encode("ascii", "backslashreplace").decode("unicode-escape")

+ 42 - 0
src/borg/testsuite/helpers.py

@@ -1,3 +1,4 @@
+import base64
 import errno
 import errno
 import getpass
 import getpass
 import hashlib
 import hashlib
@@ -42,6 +43,7 @@ from ..helpers import dash_open
 from ..helpers import iter_separated
 from ..helpers import iter_separated
 from ..helpers import eval_escapes
 from ..helpers import eval_escapes
 from ..helpers import safe_unlink
 from ..helpers import safe_unlink
+from ..helpers import text_to_json, binary_to_json
 from ..helpers.passphrase import Passphrase, PasswordRetriesExceeded
 from ..helpers.passphrase import Passphrase, PasswordRetriesExceeded
 from ..platform import is_cygwin
 from ..platform import is_cygwin
 
 
@@ -53,6 +55,46 @@ def test_bin_to_hex():
     assert bin_to_hex(b"\x00\x01\xff") == "0001ff"
     assert bin_to_hex(b"\x00\x01\xff") == "0001ff"
 
 
 
 
+@pytest.mark.parametrize(
+    "key,value",
+    [("key", bytes(range(length))) for length in (4, 3, 2, 1, 0)],  # b"\x00\x01\x02\x03" down to b""
+)
+def test_binary_to_json(key, value):
+    """The dict from binary_to_json has a <key>_b64 entry that base64-decodes back to value."""
+    encoded = binary_to_json(key, value)
+    assert key + "_b64" in encoded
+    assert base64.b64decode(encoded[key + "_b64"]) == value
+
+
+@pytest.mark.parametrize(
+    "key,value,strict",
+    [
+        ("key", "abc", True),
+        ("key", "äöü", True),
+        ("key", "", True),
+        ("key", b"\x00\xff".decode("utf-8", errors="surrogateescape"), False),
+        ("key", "äöü".encode("latin1").decode("utf-8", errors="surrogateescape"), False),
+    ],
+)
+def test_text_to_json(key, value, strict):
+    """Check the <key> / <key>_b64 entries of text_to_json; strict marks values without surrogate escapes."""
+    b64_key = key + "_b64"
+    result = text_to_json(key, value)
+    original_bytes = value.encode("utf-8", errors="surrogateescape")
+    assert key in result
+    if not strict:  # surrogate escapes: text has replacement chars, base64 representation is present.
+        replaced_bytes = value.encode("utf-8", errors="replace")
+        assert result[key] == replaced_bytes.decode("utf-8", errors="strict")
+        assert result[key].encode("utf-8", errors="strict") == replaced_bytes
+        assert b64_key in result
+        assert base64.b64decode(result[b64_key]) == original_bytes
+    else:
+        # no surrogate escapes, just unicode text: it round-trips exactly and needs no base64 entry.
+        assert result[key] == original_bytes.decode("utf-8", errors="strict")
+        assert result[key].encode("utf-8", errors="strict") == original_bytes
+        assert b64_key not in result
+
+
 class TestLocationWithoutEnv:
 class TestLocationWithoutEnv:
     @pytest.fixture
     @pytest.fixture
     def keys_dir(self, tmpdir, monkeypatch):
     def keys_dir(self, tmpdir, monkeypatch):