|
@@ -1,4 +1,5 @@
|
|
|
import argparse
|
|
|
+import base64
|
|
|
import hashlib
|
|
|
import json
|
|
|
import os
|
|
@@ -50,6 +51,41 @@ def remove_surrogates(s, errors="replace"):
|
|
|
return s.encode("utf-8", errors).decode("utf-8")
|
|
|
|
|
|
|
|
|
def binary_to_json(key, value):
    """
    Return a one-entry dict that carries binary data in a JSON-safe form.

    The single entry is stored under ``<key>_b64`` and holds the base64 encoding
    of value as ASCII text.

    key must be a str and value must be bytes; both are checked with assertions.
    """
    assert isinstance(key, str)
    assert isinstance(value, bytes)
    # base64 output is pure ASCII, so it can always be turned into text
    b64_text = str(base64.b64encode(value), "ascii")
    return {f"{key}_b64": b64_text}
|
|
|
+
|
|
|
+
|
|
|
def text_to_json(key, value):
    """
    Return a dict made from key/value that can be fed safely into a JSON encoder.

    JSON can only contain pure, valid unicode (but not: unicode with surrogates).

    But sometimes we have to deal with such values and we do it like this:
    - <key>: value as pure unicode text (surrogates, if any, replaced by ?)
    - <key>_b64: value as base64 encoded binary representation (only set if value has surrogates)

    Surrogate escapes (U+DC80..U+DCFF) are converted back to the raw bytes they stand for.
    Any other lone surrogate has no raw byte behind it; it is encoded with the
    "surrogatepass" error handler instead, so such a value does not raise UnicodeEncodeError.

    key and value must both be str; this is checked with assertions.
    """
    coding = "utf-8"
    assert isinstance(key, str)
    assert isinstance(value, str)  # str might contain surrogate escapes
    data = {}
    try:
        value.encode(coding, errors="strict")  # check if pure unicode
    except UnicodeEncodeError:
        # value contains surrogates
        value_replace_encoded = value.encode(coding, errors="replace")
        data[key] = value_replace_encoded.decode(coding, errors="strict")
        try:
            value_bytes = value.encode(coding, errors="surrogateescape")
        except UnicodeEncodeError:
            # "surrogateescape" only handles U+DC80..U+DCFF and raises for every other
            # surrogate. Encode char by char, so real escapes still become their raw
            # byte while the remaining surrogates are passed through.
            value_bytes = b"".join(
                ch.encode(coding, errors="surrogateescape" if "\udc80" <= ch <= "\udcff" else "surrogatepass")
                for ch in value
            )
        data.update(binary_to_json(key, value_bytes))
    else:
        # value is pure unicode
        data[key] = value
        # we do not give the b64 representation, not needed
    return data
|
|
|
+
|
|
|
+
|
|
|
def eval_escapes(s):
    """
    Return s with literal escape sequences replaced by the characters they denote.

    Example: the two characters backslash + n become a single newline character.
    """
    # non-ASCII characters are first turned into backslash escapes, so that the
    # unicode-escape decoding step below restores them unchanged
    ascii_escaped = s.encode("ascii", "backslashreplace")
    return ascii_escaped.decode("unicode-escape")
|
|
@@ -681,7 +717,7 @@ class ArchiveFormatter(BaseFormatter):
|
|
|
self.call_keys = {
|
|
|
"hostname": partial(self.get_meta, "hostname", rs=True),
|
|
|
"username": partial(self.get_meta, "username", rs=True),
|
|
|
- "comment": partial(self.get_meta, "comment", rs=True),
|
|
|
+ "comment": partial(self.get_meta, "comment", rs=False),
|
|
|
"end": self.get_ts_end,
|
|
|
"command_line": self.get_cmdline,
|
|
|
}
|
|
@@ -702,8 +738,8 @@ class ArchiveFormatter(BaseFormatter):
|
|
|
item_data.update(self.item_data)
|
|
|
item_data.update(
|
|
|
{
|
|
|
- "name": remove_surrogates(archive_info.name),
|
|
|
- "archive": remove_surrogates(archive_info.name),
|
|
|
+ "name": archive_info.name,
|
|
|
+ "archive": archive_info.name,
|
|
|
"id": bin_to_hex(archive_info.id),
|
|
|
"time": self.format_time(archive_info.ts),
|
|
|
"start": self.format_time(archive_info.ts),
|
|
@@ -840,31 +876,32 @@ class ItemFormatter(BaseFormatter):
|
|
|
def get_item_data(self, item):
|
|
|
item_data = {}
|
|
|
item_data.update(self.item_data)
|
|
|
- mode = stat.filemode(item.mode)
|
|
|
- item_type = mode[0]
|
|
|
|
|
|
+ item_data.update(text_to_json("path", item.path))
|
|
|
source = item.get("source", "")
|
|
|
- extra = ""
|
|
|
- if source:
|
|
|
- source = remove_surrogates(source)
|
|
|
- extra = " -> %s" % source
|
|
|
+ item_data.update(text_to_json("source", source))
|
|
|
+ item_data.update(text_to_json("linktarget", source))
|
|
|
+ if not self.json_lines:
|
|
|
+ item_data["extra"] = "" if not source else f" -> {item_data['source']}"
|
|
|
+
|
|
|
hlid = item.get("hlid")
|
|
|
hlid = bin_to_hex(hlid) if hlid else ""
|
|
|
+ item_data["hlid"] = hlid
|
|
|
+
|
|
|
+ mode = stat.filemode(item.mode)
|
|
|
+ item_type = mode[0]
|
|
|
item_data["type"] = item_type
|
|
|
item_data["mode"] = mode
|
|
|
- item_data["user"] = item.get("user", str(item.uid))
|
|
|
- item_data["group"] = item.get("group", str(item.gid))
|
|
|
+
|
|
|
+ item_data.update(text_to_json("user", item.get("user", str(item.uid))))
|
|
|
+ item_data.update(text_to_json("group", item.get("group", str(item.gid))))
|
|
|
item_data["uid"] = item.uid
|
|
|
item_data["gid"] = item.gid
|
|
|
- item_data["path"] = remove_surrogates(item.path)
|
|
|
+
|
|
|
if self.json_lines:
|
|
|
item_data["healthy"] = "chunks_healthy" not in item
|
|
|
else:
|
|
|
- item_data["extra"] = extra
|
|
|
item_data["health"] = "broken" if "chunks_healthy" in item else "healthy"
|
|
|
- item_data["source"] = source
|
|
|
- item_data["linktarget"] = source
|
|
|
- item_data["hlid"] = hlid
|
|
|
item_data["flags"] = item.get("bsdflags") # int if flags known, else (if flags unknown) None
|
|
|
for key in self.used_call_keys:
|
|
|
item_data[key] = self.call_keys[key](item)
|