
Merge pull request #7232 from ThomasWaldmann/json_b64

implement and use (text|binary)_to_json
TW 2 years ago
parent
commit
d49665526c

+ 36 - 2
docs/internals/frontends.rst

@@ -29,6 +29,42 @@ On POSIX systems, you can usually set environment vars to choose a UTF-8 locale:
     export LC_CTYPE=en_US.UTF-8
 
 
+Dealing with non-unicode byte sequences and JSON limitations
+------------------------------------------------------------
+
+Paths on POSIX systems can contain arbitrary bytes (except 0x00, which is used as the string terminator in C).
+
+Nowadays, UTF-8 encoded paths (which decode to valid unicode) are the norm, but many systems still have paths
+from the past, when other, non-unicode encodings were used. Old Samba shares in particular often contain wild
+mixtures of different encodings, sometimes even completely broken byte sequences.
+
+borg deals with such non-unicode paths ("with funny/broken characters") by decoding the byte sequence with the
+UTF-8 codec in "surrogateescape" error handling mode, which maps invalid bytes to special unicode code points
+(surrogate escapes). When such a unicode string is encoded back to a byte sequence, the original byte sequence
+is reproduced exactly.
+
+JSON must only contain valid unicode text without any surrogate escapes, so we cannot put a surrogate-escaped
+path directly into JSON ("path" is only one example; this also affects other text-like content).
+
+Since borg 2.0, borg deals with this situation as follows:
+
+For a valid unicode path (no surrogate escapes), the JSON will only have "path": path.
+
+For a non-unicode path (with surrogate escapes), the JSON will have two entries:
+
+- "path": path_approximation (pure valid unicode, all invalid bytes will show up as "?")
+- "path_b64": path_bytes_base64_encoded (if you decode the base64, you get the original path byte string)
+
+JSON consumers need to pick whatever suits their needs best. The suggested procedure (shown for "path") is:
+
+- Check whether there is a "path_b64" key.
+- If it is present, the original path bytes did not cleanly UTF-8-decode into unicode (they contain some
+  invalid bytes) and the string given by the "path" key is only an approximation, not the precise path. If
+  you need precision, base64-decode the value of "path_b64" and deal with the arbitrary byte string you get.
+  If an approximation is fine, use the value of the "path" key.
+- If it is absent, the value of the "path" key is all you need (the original path bytes are its UTF-8 encoding).
+
+
 Logging
 -------
 
@@ -40,8 +76,6 @@ where each line is a JSON object. The *type* key of the object determines its ot
     parsing error will be printed in plain text, because logging set-up happens after all arguments are
     parsed.
 
-Since JSON can only encode text, any string representing a file system path may miss non-text parts.
-
 The following types are in use. Progress information is governed by the usual rules for progress information,
 it is not produced unless ``--progress`` is specified.
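
As a rough illustration of the procedure documented above (not part of this PR; the helper name json_text_field is made up), a JSON consumer could recover both the approximate text and the exact bytes like this:

import base64

def json_text_field(obj, key="path"):
    """Return (approximate_text, exact_bytes) for a text field of a borg 2.0 JSON object."""
    text = obj[key]
    b64 = obj.get(key + "_b64")
    if b64 is not None:
        # original bytes were not valid UTF-8; the plain key is only an approximation
        return text, base64.b64decode(b64)
    # no _b64 key: the original bytes are simply the UTF-8 encoding of the text
    return text, text.encode("utf-8")

# example: a log line with a non-UTF-8 path (values are examples only)
text, raw = json_text_field({"path": "caf?", "path_b64": "Y2Fm6Q=="})
assert raw == b"caf\xe9" and text == "caf?"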
 

+ 3 - 2
src/borg/archive.py

@@ -32,7 +32,7 @@ from .helpers import Error, IntegrityError, set_ec
 from .platform import uid2user, user2uid, gid2group, group2gid
 from .helpers import parse_timestamp, archive_ts_now
 from .helpers import OutputTimestamp, format_timedelta, format_file_size, file_status, FileSize
-from .helpers import safe_encode, make_path_safe, remove_surrogates
+from .helpers import safe_encode, make_path_safe, remove_surrogates, text_to_json
 from .helpers import StableDict
 from .helpers import bin_to_hex
 from .helpers import safe_ns
@@ -165,7 +165,8 @@ Bytes sent to remote: {stats.tx_bytes}
             if self.output_json:
                 if not final:
                     data = self.as_dict()
-                    data["path"] = remove_surrogates(item.path if item else "")
+                    if item:
+                        data.update(text_to_json("path", item.path))
                 else:
                     data = {}
                 data.update({"time": time.time(), "type": "archive_progress", "finished": final})

+ 4 - 5
src/borg/archiver/__init__.py

@@ -26,7 +26,7 @@ try:
     from ..helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR, EXIT_SIGNAL_BASE
     from ..helpers import Error, set_ec
     from ..helpers import format_file_size
-    from ..helpers import remove_surrogates
+    from ..helpers import remove_surrogates, text_to_json
     from ..helpers import DatetimeWrapper, replace_placeholders
     from ..helpers import check_python, check_extension_modules
     from ..helpers import is_slow_msgpack, is_supported_msgpack, sysinfo
@@ -139,10 +139,9 @@ class Archiver(
         # if we get called with status == None, the final file status was already printed
         if self.output_list and status is not None and (self.output_filter is None or status in self.output_filter):
             if self.log_json:
-                print(
-                    json.dumps({"type": "file_status", "status": status, "path": remove_surrogates(path)}),
-                    file=sys.stderr,
-                )
+                json_data = {"type": "file_status", "status": status}
+                json_data.update(text_to_json("path", path))
+                print(json.dumps(json_data), file=sys.stderr)
             else:
                 logging.getLogger("borg.output.list").info("%1s %s", status, remove_surrogates(path))
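
For illustration only (not part of the diff): with the change above, a file_status log line for a path that does not decode cleanly from UTF-8 would look roughly like this; the path value is a made-up example.

import json
from borg.helpers import text_to_json  # helper added by this PR

path = b"caf\xe9".decode("utf-8", errors="surrogateescape")  # example non-UTF-8 path
json_data = {"type": "file_status", "status": "A"}
json_data.update(text_to_json("path", path))
print(json.dumps(json_data))
# {"type": "file_status", "status": "A", "path": "caf?", "path_b64": "Y2Fm6Q=="}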
 

+ 1 - 1
src/borg/cache.py

@@ -863,7 +863,7 @@ class LocalCache(CacheStatsMixin):
                 )
                 archive_ids_to_names = get_archive_ids_to_names(archive_ids)
                 for archive_id, archive_name in archive_ids_to_names.items():
-                    pi.show(info=[remove_surrogates(archive_name)])
+                    pi.show(info=[remove_surrogates(archive_name)])  # legacy. borg2 always has pure unicode arch names.
                     if self.do_cache:
                         if archive_id in cached_ids:
                             archive_chunk_idx = read_archive_index(archive_id, archive_name)

+ 2 - 1
src/borg/helpers/__init__.py

@@ -19,7 +19,8 @@ from .fs import HardLinkManager
 from .misc import sysinfo, log_multi, consume
 from .misc import ChunkIteratorFileWrapper, open_item, chunkit, iter_separated, ErrorIgnoringTextIOWrapper
 from .parseformat import bin_to_hex, safe_encode, safe_decode
-from .parseformat import remove_surrogates, eval_escapes, decode_dict, positive_int_validator, interval
+from .parseformat import text_to_json, binary_to_json, remove_surrogates
+from .parseformat import eval_escapes, decode_dict, positive_int_validator, interval
 from .parseformat import SortBySpec, ChunkerParams, FilesCacheMode, partial_format, DatetimeWrapper
 from .parseformat import format_file_size, parse_file_size, FileSize, parse_storage_quota
 from .parseformat import sizeof_fmt, sizeof_fmt_iec, sizeof_fmt_decimal

+ 53 - 16
src/borg/helpers/parseformat.py

@@ -1,4 +1,5 @@
 import argparse
+import base64
 import hashlib
 import json
 import os
@@ -50,6 +51,41 @@ def remove_surrogates(s, errors="replace"):
     return s.encode("utf-8", errors).decode("utf-8")
 
 
+def binary_to_json(key, value):
+    assert isinstance(key, str)
+    assert isinstance(value, bytes)
+    return {key + "_b64": base64.b64encode(value).decode("ascii")}
+
+
+def text_to_json(key, value):
+    """
+    Return a dict made from key/value that can be fed safely into a JSON encoder.
+
+    JSON can only contain pure, valid unicode (but not: unicode with surrogate escapes).
+
+    But sometimes we have to deal with such values and we do it like this:
+    - <key>: value as pure unicode text (surrogate escapes, if any, replaced by ?)
+    - <key>_b64: value as base64 encoded binary representation (only set if value has surrogate-escapes)
+    """
+    coding = "utf-8"
+    assert isinstance(key, str)
+    assert isinstance(value, str)  # str might contain surrogate escapes
+    data = {}
+    try:
+        value.encode(coding, errors="strict")  # check if pure unicode
+    except UnicodeEncodeError:
+        # value has surrogate escape sequences
+        value_replace_encoded = value.encode(coding, errors="replace")
+        data[key] = value_replace_encoded.decode(coding, errors="strict")
+        value_bytes = value.encode(coding, errors="surrogateescape")
+        data.update(binary_to_json(key, value_bytes))
+    else:
+        # value is pure unicode
+        data[key] = value
+        # we do not give the b64 representation, not needed
+    return data
+
+
 def eval_escapes(s):
     """Evaluate literal escape sequences in a string (eg `\\n` -> `\n`)."""
     return s.encode("ascii", "backslashreplace").decode("unicode-escape")
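
The contract of the two new helpers can be summarized by this small sketch (illustration only; the concrete values are examples, not taken from the PR):

from borg.helpers import text_to_json, binary_to_json

# pure valid unicode: only the plain key is emitted
assert text_to_json("comment", "hello") == {"comment": "hello"}

# bytes always go to a "<key>_b64" entry
assert binary_to_json("chunk", b"\x00\xff") == {"chunk_b64": "AP8="}

# text containing surrogate escapes: approximate text plus exact base64 form
broken = b"\xe9t\xe9".decode("utf-8", errors="surrogateescape")
assert text_to_json("path", broken) == {"path": "?t?", "path_b64": "6XTp"}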
@@ -681,7 +717,7 @@ class ArchiveFormatter(BaseFormatter):
         self.call_keys = {
             "hostname": partial(self.get_meta, "hostname", rs=True),
             "username": partial(self.get_meta, "username", rs=True),
-            "comment": partial(self.get_meta, "comment", rs=True),
+            "comment": partial(self.get_meta, "comment", rs=False),
             "end": self.get_ts_end,
             "command_line": self.get_cmdline,
         }
@@ -702,8 +738,8 @@ class ArchiveFormatter(BaseFormatter):
         item_data.update(self.item_data)
         item_data.update(
             {
-                "name": remove_surrogates(archive_info.name),
-                "archive": remove_surrogates(archive_info.name),
+                "name": archive_info.name,
+                "archive": archive_info.name,
                 "id": bin_to_hex(archive_info.id),
                 "time": self.format_time(archive_info.ts),
                 "start": self.format_time(archive_info.ts),
@@ -840,31 +876,32 @@ class ItemFormatter(BaseFormatter):
     def get_item_data(self, item):
         item_data = {}
         item_data.update(self.item_data)
-        mode = stat.filemode(item.mode)
-        item_type = mode[0]
 
+        item_data.update(text_to_json("path", item.path))
         source = item.get("source", "")
-        extra = ""
-        if source:
-            source = remove_surrogates(source)
-            extra = " -> %s" % source
+        item_data.update(text_to_json("source", source))
+        item_data.update(text_to_json("linktarget", source))
+        if not self.json_lines:
+            item_data["extra"] = "" if not source else f" -> {item_data['source']}"
+
         hlid = item.get("hlid")
         hlid = bin_to_hex(hlid) if hlid else ""
+        item_data["hlid"] = hlid
+
+        mode = stat.filemode(item.mode)
+        item_type = mode[0]
         item_data["type"] = item_type
         item_data["mode"] = mode
-        item_data["user"] = item.get("user", str(item.uid))
-        item_data["group"] = item.get("group", str(item.gid))
+
+        item_data.update(text_to_json("user", item.get("user", str(item.uid))))
+        item_data.update(text_to_json("group", item.get("group", str(item.gid))))
         item_data["uid"] = item.uid
         item_data["gid"] = item.gid
-        item_data["path"] = remove_surrogates(item.path)
+
         if self.json_lines:
             item_data["healthy"] = "chunks_healthy" not in item
         else:
-            item_data["extra"] = extra
             item_data["health"] = "broken" if "chunks_healthy" in item else "healthy"
-        item_data["source"] = source
-        item_data["linktarget"] = source
-        item_data["hlid"] = hlid
         item_data["flags"] = item.get("bsdflags")  # int if flags known, else (if flags unknown) None
         for key in self.used_call_keys:
             item_data[key] = self.call_keys[key](item)

+ 42 - 0
src/borg/testsuite/helpers.py

@@ -1,3 +1,4 @@
+import base64
 import errno
 import getpass
 import hashlib
@@ -42,6 +43,7 @@ from ..helpers import dash_open
 from ..helpers import iter_separated
 from ..helpers import eval_escapes
 from ..helpers import safe_unlink
+from ..helpers import text_to_json, binary_to_json
 from ..helpers.passphrase import Passphrase, PasswordRetriesExceeded
 from ..platform import is_cygwin
 
@@ -53,6 +55,46 @@ def test_bin_to_hex():
     assert bin_to_hex(b"\x00\x01\xff") == "0001ff"
 
 
+@pytest.mark.parametrize(
+    "key,value",
+    [("key", b"\x00\x01\x02\x03"), ("key", b"\x00\x01\x02"), ("key", b"\x00\x01"), ("key", b"\x00"), ("key", b"")],
+)
+def test_binary_to_json(key, value):
+    key_b64 = key + "_b64"
+    d = binary_to_json(key, value)
+    assert key_b64 in d
+    assert base64.b64decode(d[key_b64]) == value
+
+
+@pytest.mark.parametrize(
+    "key,value,strict",
+    [
+        ("key", "abc", True),
+        ("key", "äöü", True),
+        ("key", "", True),
+        ("key", b"\x00\xff".decode("utf-8", errors="surrogateescape"), False),
+        ("key", "äöü".encode("latin1").decode("utf-8", errors="surrogateescape"), False),
+    ],
+)
+def test_text_to_json(key, value, strict):
+    key_b64 = key + "_b64"
+    d = text_to_json(key, value)
+    value_b = value.encode("utf-8", errors="surrogateescape")
+    if strict:
+        # no surrogate-escapes, just unicode text
+        assert key in d
+        assert d[key] == value_b.decode("utf-8", errors="strict")
+        assert d[key].encode("utf-8", errors="strict") == value_b
+        assert key_b64 not in d  # not needed. pure valid unicode.
+    else:
+        # requiring surrogate-escapes. text has replacement chars, base64 representation is present.
+        assert key in d
+        assert d[key] == value.encode("utf-8", errors="replace").decode("utf-8", errors="strict")
+        assert d[key].encode("utf-8", errors="strict") == value.encode("utf-8", errors="replace")
+        assert key_b64 in d
+        assert base64.b64decode(d[key_b64]) == value_b
+
+
 class TestLocationWithoutEnv:
     @pytest.fixture
     def keys_dir(self, tmpdir, monkeypatch):