
Merge pull request #7232 from ThomasWaldmann/json_b64

implement and use (text|binary)_to_json
TW 2 years ago
parent
commit
d49665526c

+ 36 - 2
docs/internals/frontends.rst

@@ -29,6 +29,42 @@ On POSIX systems, you can usually set environment vars to choose a UTF-8 locale:
    export LC_CTYPE=en_US.UTF-8


+Dealing with non-unicode byte sequences and JSON limitations
+------------------------------------------------------------
+
+Paths on POSIX systems can contain arbitrary bytes (except 0x00, which is used as the string terminator in C).
+
+Nowadays, UTF-8 encoded paths (which decode to valid unicode) are the norm, but a lot of systems still have
+paths from the past, when other, non-unicode encodings were used. Old Samba shares especially often contain
+wild mixtures of miscellaneous encodings, sometimes even completely broken byte sequences.
+
+Borg deals with such non-unicode paths ("with funny/broken characters") by decoding the byte sequence with
+UTF-8 using the "surrogateescape" error handler, which maps invalid bytes to special unicode code points
+(surrogate escapes). When such a unicode string is encoded back to a byte sequence with the same error
+handler, the original byte sequence is reproduced exactly.
+
+JSON must only contain valid unicode text without any surrogate escapes, so we cannot directly put a
+surrogate-escaped path into JSON ("path" is only one example; this also affects other text-like content).
+
+Since borg 2.0, this situation is handled as follows:
+
+For a valid unicode path (no surrogate escapes), the JSON will only contain "path": path.
+
+For a non-unicode path (with surrogate escapes), the JSON will contain two entries:
+
+- "path": path_approximation (pure valid unicode, all invalid bytes shown as "?")
+- "path_b64": path_bytes_base64_encoded (decoding the base64 yields the original path byte string)
+
+JSON consumers should pick whatever suits their needs best. The suggested procedure (shown for "path") is:
+
+- Check whether there is a "path_b64" key.
+- If it is present, the original path bytes did not cleanly UTF-8-decode into unicode (they contain some
+  invalid bytes), and the string under the "path" key is only an approximation, not the precise path. If you
+  need precision, base64-decode the value of "path_b64" and deal with the arbitrary byte string you get; if
+  an approximation is fine, use the value of the "path" key.
+- If it is not present, the value of the "path" key is all you need (the original path bytes are its UTF-8
+  encoding).
+
+
Logging
-------

@@ -40,8 +76,6 @@ where each line is a JSON object. The *type* key of the object determines its ot
    parsing error will be printed in plain text, because logging set-up happens after all arguments are
    parsed.

-Since JSON can only encode text, any string representing a file system path may miss non-text parts.
-
The following types are in use. Progress information is governed by the usual rules for progress information,
it is not produced unless ``--progress`` is specified.


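The consumer-side procedure suggested above could look roughly like this in Python (a minimal sketch; the ``decode_path`` helper and the example JSON objects are illustrative, not part of borg):

    import base64

    def decode_path(json_object):
        """Recover the original path bytes from a borg 2.0 JSON object."""
        if "path_b64" in json_object:
            # "path" is only an approximation (invalid bytes shown as "?");
            # the exact original bytes are base64-encoded in "path_b64".
            return base64.b64decode(json_object["path_b64"])
        # no "path_b64" key: "path" is pure valid unicode, so its UTF-8
        # encoding is exactly the original byte string.
        return json_object["path"].encode("utf-8")

    print(decode_path({"path": "etc/motd"}))                 # b'etc/motd'
    print(decode_path({"path": "b?d", "path_b64": "Yv9k"}))  # b'b\xffd'
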
+ 3 - 2
src/borg/archive.py

@@ -32,7 +32,7 @@ from .helpers import Error, IntegrityError, set_ec
from .platform import uid2user, user2uid, gid2group, group2gid
from .helpers import parse_timestamp, archive_ts_now
from .helpers import OutputTimestamp, format_timedelta, format_file_size, file_status, FileSize
-from .helpers import safe_encode, make_path_safe, remove_surrogates
+from .helpers import safe_encode, make_path_safe, remove_surrogates, text_to_json
from .helpers import StableDict
from .helpers import bin_to_hex
from .helpers import safe_ns
@@ -165,7 +165,8 @@ Bytes sent to remote: {stats.tx_bytes}
            if self.output_json:
                if not final:
                    data = self.as_dict()
-                    data["path"] = remove_surrogates(item.path if item else "")
+                    if item:
+                        data.update(text_to_json("path", item.path))
                else:
                    data = {}
                data.update({"time": time.time(), "type": "archive_progress", "finished": final})

+ 4 - 5
src/borg/archiver/__init__.py

@@ -26,7 +26,7 @@ try:
    from ..helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR, EXIT_SIGNAL_BASE
    from ..helpers import Error, set_ec
    from ..helpers import format_file_size
-    from ..helpers import remove_surrogates
+    from ..helpers import remove_surrogates, text_to_json
    from ..helpers import DatetimeWrapper, replace_placeholders
    from ..helpers import check_python, check_extension_modules
    from ..helpers import is_slow_msgpack, is_supported_msgpack, sysinfo
@@ -139,10 +139,9 @@ class Archiver(
        # if we get called with status == None, the final file status was already printed
        if self.output_list and status is not None and (self.output_filter is None or status in self.output_filter):
            if self.log_json:
-                print(
-                    json.dumps({"type": "file_status", "status": status, "path": remove_surrogates(path)}),
-                    file=sys.stderr,
-                )
+                json_data = {"type": "file_status", "status": status}
+                json_data.update(text_to_json("path", path))
+                print(json.dumps(json_data), file=sys.stderr)
            else:
                logging.getLogger("borg.output.list").info("%1s %s", status, remove_surrogates(path))


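For illustration, with ``--log-json`` a ``file_status`` line written to stderr now carries both keys when the path contains invalid bytes. A minimal sketch with a made-up path, assuming a checkout where ``borg.helpers`` re-exports ``text_to_json`` (see the src/borg/helpers/__init__.py change below):

    import json
    from borg.helpers import text_to_json

    # a path with one invalid byte (0xff), decoded the way borg reads paths
    funny_path = b"b\xffd".decode("utf-8", errors="surrogateescape")

    json_data = {"type": "file_status", "status": "A"}
    json_data.update(text_to_json("path", funny_path))
    print(json.dumps(json_data))
    # {"type": "file_status", "status": "A", "path": "b?d", "path_b64": "Yv9k"}
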
+ 1 - 1
src/borg/cache.py

@@ -863,7 +863,7 @@ class LocalCache(CacheStatsMixin):
                )
                archive_ids_to_names = get_archive_ids_to_names(archive_ids)
                for archive_id, archive_name in archive_ids_to_names.items():
-                    pi.show(info=[remove_surrogates(archive_name)])
+                    pi.show(info=[remove_surrogates(archive_name)])  # legacy. borg2 always has pure unicode arch names.
                    if self.do_cache:
                        if archive_id in cached_ids:
                            archive_chunk_idx = read_archive_index(archive_id, archive_name)

+ 2 - 1
src/borg/helpers/__init__.py

@@ -19,7 +19,8 @@ from .fs import HardLinkManager
from .misc import sysinfo, log_multi, consume
from .misc import ChunkIteratorFileWrapper, open_item, chunkit, iter_separated, ErrorIgnoringTextIOWrapper
from .parseformat import bin_to_hex, safe_encode, safe_decode
-from .parseformat import remove_surrogates, eval_escapes, decode_dict, positive_int_validator, interval
+from .parseformat import text_to_json, binary_to_json, remove_surrogates
+from .parseformat import eval_escapes, decode_dict, positive_int_validator, interval
from .parseformat import SortBySpec, ChunkerParams, FilesCacheMode, partial_format, DatetimeWrapper
from .parseformat import format_file_size, parse_file_size, FileSize, parse_storage_quota
from .parseformat import sizeof_fmt, sizeof_fmt_iec, sizeof_fmt_decimal

+ 53 - 16
src/borg/helpers/parseformat.py

@@ -1,4 +1,5 @@
import argparse
+import base64
import hashlib
import json
import os
@@ -50,6 +51,41 @@ def remove_surrogates(s, errors="replace"):
     return s.encode("utf-8", errors).decode("utf-8")
     return s.encode("utf-8", errors).decode("utf-8")
 
 
 
 
+def binary_to_json(key, value):
+    assert isinstance(key, str)
+    assert isinstance(value, bytes)
+    return {key + "_b64": base64.b64encode(value).decode("ascii")}
+
+
+def text_to_json(key, value):
+    """
+    Return a dict made from key/value that can be fed safely into a JSON encoder.
+
+    JSON can only contain pure, valid unicode (but not: unicode with surrogate escapes).
+
+    But sometimes we have to deal with such values and we do it like this:
+    - <key>: value as pure unicode text (surrogate escapes, if any, replaced by ?)
+    - <key>_b64: value as base64 encoded binary representation (only set if value has surrogate-escapes)
+    """
+    coding = "utf-8"
+    assert isinstance(key, str)
+    assert isinstance(value, str)  # str might contain surrogate escapes
+    data = {}
+    try:
+        value.encode(coding, errors="strict")  # check if pure unicode
+    except UnicodeEncodeError:
+        # value has surrogate escape sequences
+        value_replace_encoded = value.encode(coding, errors="replace")
+        data[key] = value_replace_encoded.decode(coding, errors="strict")
+        value_bytes = value.encode(coding, errors="surrogateescape")
+        data.update(binary_to_json(key, value_bytes))
+    else:
+        # value is pure unicode
+        data[key] = value
+        # we do not give the b64 representation, not needed
+    return data
+
+
def eval_escapes(s):
    """Evaluate literal escape sequences in a string (eg `\\n` -> `\n`)."""
    return s.encode("ascii", "backslashreplace").decode("unicode-escape")
@@ -681,7 +717,7 @@ class ArchiveFormatter(BaseFormatter):
        self.call_keys = {
            "hostname": partial(self.get_meta, "hostname", rs=True),
            "username": partial(self.get_meta, "username", rs=True),
-            "comment": partial(self.get_meta, "comment", rs=True),
+            "comment": partial(self.get_meta, "comment", rs=False),
             "end": self.get_ts_end,
             "end": self.get_ts_end,
             "command_line": self.get_cmdline,
             "command_line": self.get_cmdline,
         }
         }
@@ -702,8 +738,8 @@ class ArchiveFormatter(BaseFormatter):
        item_data.update(self.item_data)
        item_data.update(
            {
-                "name": remove_surrogates(archive_info.name),
-                "archive": remove_surrogates(archive_info.name),
+                "name": archive_info.name,
+                "archive": archive_info.name,
                 "id": bin_to_hex(archive_info.id),
                 "id": bin_to_hex(archive_info.id),
                 "time": self.format_time(archive_info.ts),
                 "time": self.format_time(archive_info.ts),
                 "start": self.format_time(archive_info.ts),
                 "start": self.format_time(archive_info.ts),
@@ -840,31 +876,32 @@ class ItemFormatter(BaseFormatter):
    def get_item_data(self, item):
        item_data = {}
        item_data.update(self.item_data)
-        mode = stat.filemode(item.mode)
-        item_type = mode[0]
 
 
+        item_data.update(text_to_json("path", item.path))
         source = item.get("source", "")
         source = item.get("source", "")
-        extra = ""
-        if source:
-            source = remove_surrogates(source)
-            extra = " -> %s" % source
+        item_data.update(text_to_json("source", source))
+        item_data.update(text_to_json("linktarget", source))
+        if not self.json_lines:
+            item_data["extra"] = "" if not source else f" -> {item_data['source']}"
+
         hlid = item.get("hlid")
         hlid = item.get("hlid")
         hlid = bin_to_hex(hlid) if hlid else ""
         hlid = bin_to_hex(hlid) if hlid else ""
+        item_data["hlid"] = hlid
+
+        mode = stat.filemode(item.mode)
+        item_type = mode[0]
         item_data["type"] = item_type
         item_data["type"] = item_type
         item_data["mode"] = mode
         item_data["mode"] = mode
-        item_data["user"] = item.get("user", str(item.uid))
-        item_data["group"] = item.get("group", str(item.gid))
+
+        item_data.update(text_to_json("user", item.get("user", str(item.uid))))
+        item_data.update(text_to_json("group", item.get("group", str(item.gid))))
         item_data["uid"] = item.uid
         item_data["uid"] = item.uid
         item_data["gid"] = item.gid
         item_data["gid"] = item.gid
-        item_data["path"] = remove_surrogates(item.path)
+
        if self.json_lines:
            item_data["healthy"] = "chunks_healthy" not in item
        else:
-            item_data["extra"] = extra
             item_data["health"] = "broken" if "chunks_healthy" in item else "healthy"
             item_data["health"] = "broken" if "chunks_healthy" in item else "healthy"
-        item_data["source"] = source
-        item_data["linktarget"] = source
-        item_data["hlid"] = hlid
         item_data["flags"] = item.get("bsdflags")  # int if flags known, else (if flags unknown) None
         item_data["flags"] = item.get("bsdflags")  # int if flags known, else (if flags unknown) None
         for key in self.used_call_keys:
         for key in self.used_call_keys:
             item_data[key] = self.call_keys[key](item)
             item_data[key] = self.call_keys[key](item)

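To illustrate what the new helpers return (a minimal sketch; the byte strings are made up, and the import assumes a development checkout containing this commit):

    import base64
    from borg.helpers.parseformat import text_to_json, binary_to_json

    raw = b"caf\xe9"  # latin-1 encoded "café", not valid UTF-8
    s = raw.decode("utf-8", errors="surrogateescape")          # 'caf\udce9'
    assert s.encode("utf-8", errors="surrogateescape") == raw  # roundtrip is lossless

    d = text_to_json("path", s)
    assert d == {"path": "caf?", "path_b64": "Y2Fm6Q=="}
    assert base64.b64decode(d["path_b64"]) == raw

    # a pure-unicode value only yields the plain key
    assert text_to_json("path", "café") == {"path": "café"}

    # binary_to_json works on arbitrary bytes directly
    assert binary_to_json("chunk", b"\x00\x01") == {"chunk_b64": "AAE="}
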
+ 42 - 0
src/borg/testsuite/helpers.py

@@ -1,3 +1,4 @@
+import base64
import errno
import getpass
import hashlib
@@ -42,6 +43,7 @@ from ..helpers import dash_open
from ..helpers import iter_separated
from ..helpers import eval_escapes
from ..helpers import safe_unlink
+from ..helpers import text_to_json, binary_to_json
from ..helpers.passphrase import Passphrase, PasswordRetriesExceeded
from ..platform import is_cygwin

@@ -53,6 +55,46 @@ def test_bin_to_hex():
    assert bin_to_hex(b"\x00\x01\xff") == "0001ff"


+@pytest.mark.parametrize(
+    "key,value",
+    [("key", b"\x00\x01\x02\x03"), ("key", b"\x00\x01\x02"), ("key", b"\x00\x01"), ("key", b"\x00"), ("key", b"")],
+)
+def test_binary_to_json(key, value):
+    key_b64 = key + "_b64"
+    d = binary_to_json(key, value)
+    assert key_b64 in d
+    assert base64.b64decode(d[key_b64]) == value
+
+
+@pytest.mark.parametrize(
+    "key,value,strict",
+    [
+        ("key", "abc", True),
+        ("key", "äöü", True),
+        ("key", "", True),
+        ("key", b"\x00\xff".decode("utf-8", errors="surrogateescape"), False),
+        ("key", "äöü".encode("latin1").decode("utf-8", errors="surrogateescape"), False),
+    ],
+)
+def test_text_to_json(key, value, strict):
+    key_b64 = key + "_b64"
+    d = text_to_json(key, value)
+    value_b = value.encode("utf-8", errors="surrogateescape")
+    if strict:
+        # no surrogate-escapes, just unicode text
+        assert key in d
+        assert d[key] == value_b.decode("utf-8", errors="strict")
+        assert d[key].encode("utf-8", errors="strict") == value_b
+        assert key_b64 not in d  # not needed. pure valid unicode.
+    else:
+        # requiring surrogate-escapes. text has replacement chars, base64 representation is present.
+        assert key in d
+        assert d[key] == value.encode("utf-8", errors="replace").decode("utf-8", errors="strict")
+        assert d[key].encode("utf-8", errors="strict") == value.encode("utf-8", errors="replace")
+        assert key_b64 in d
+        assert base64.b64decode(d[key_b64]) == value_b
+
+
class TestLocationWithoutEnv:
    @pytest.fixture
    def keys_dir(self, tmpdir, monkeypatch):