parseformat.py

import argparse
import hashlib
import json
import os
import os.path
import re
import shlex
import socket
import stat
import uuid
from binascii import hexlify
from collections import Counter, OrderedDict
from datetime import datetime, timezone
from functools import partial
from string import Formatter

from ..logger import create_logger
logger = create_logger()

from .errors import Error
from .fs import get_keys_dir
from .msgpack import Timestamp
from .time import OutputTimestamp, format_time, to_localtime, safe_timestamp, safe_s
from .. import __version__ as borg_version
from .. import __version_tuple__ as borg_version_tuple
from ..constants import *  # NOQA
from ..platformflags import is_win32


def bin_to_hex(binary):
    return hexlify(binary).decode('ascii')


def safe_decode(s, coding='utf-8', errors='surrogateescape'):
    """decode bytes to str, with round-tripping "invalid" bytes"""
    if s is None:
        return None
    return s.decode(coding, errors)


def safe_encode(s, coding='utf-8', errors='surrogateescape'):
    """encode str to bytes, with round-tripping "invalid" bytes"""
    if s is None:
        return None
    return s.encode(coding, errors)


def remove_surrogates(s, errors='replace'):
    """Replace surrogates generated by fsdecode with '?'"""
    return s.encode('utf-8', errors).decode('utf-8')


def eval_escapes(s):
    """Evaluate literal escape sequences in a string (eg `\\n` -> `\n`)."""
    return s.encode('ascii', 'backslashreplace').decode('unicode-escape')


def decode_dict(d, keys, encoding='utf-8', errors='surrogateescape'):
    for key in keys:
        if isinstance(d.get(key), bytes):
            d[key] = d[key].decode(encoding, errors)
    return d
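

# Illustrative sketch (not part of the original module): the surrogateescape
# round-trip keeps undecodable filename bytes intact, while remove_surrogates
# produces a display-safe form.
#
#     raw = b'caf\xe9'                       # latin-1 bytes, invalid as UTF-8
#     s = safe_decode(raw)                   # 'caf\udce9' (surrogate escape)
#     assert safe_encode(s) == raw           # lossless round-trip
#     assert remove_surrogates(s) == 'caf?'  # surrogate replaced by '?'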


def positive_int_validator(value):
    """argparse type for positive integers"""
    int_value = int(value)
    if int_value <= 0:
        raise argparse.ArgumentTypeError('A positive integer is required: %s' % value)
    return int_value


def interval(s):
    """Convert a string representing a valid interval to a number of hours."""
    multiplier = {'H': 1, 'd': 24, 'w': 24 * 7, 'm': 24 * 31, 'y': 24 * 365}

    if s.endswith(tuple(multiplier.keys())):
        number = s[:-1]
        suffix = s[-1]
    else:
        # range suffixes in ascending multiplier order
        ranges = [k for k, v in sorted(multiplier.items(), key=lambda t: t[1])]
        raise argparse.ArgumentTypeError(
            f'Unexpected interval time unit "{s[-1]}": expected one of {ranges!r}')

    try:
        hours = int(number) * multiplier[suffix]
    except ValueError:
        hours = -1

    if hours <= 0:
        raise argparse.ArgumentTypeError(
            'Unexpected interval number "%s": expected an integer greater than 0' % number)
    return hours
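

# Illustrative examples (using the multipliers defined above):
#   interval('2d') == 48, interval('1w') == 168, interval('1y') == 8760
#   interval('0d') and interval('2x') raise argparse.ArgumentTypeError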


def ChunkerParams(s):
    params = s.strip().split(',')
    count = len(params)
    if count == 0:
        raise ValueError('no chunker params given')
    algo = params[0].lower()
    if algo == CH_FIXED and 2 <= count <= 3:  # fixed, block_size[, header_size]
        block_size = int(params[1])
        header_size = int(params[2]) if count == 3 else 0
        if block_size < 64:
            # we are only disallowing the most extreme cases of abuse here - this does NOT imply
            # that cutting chunks of the minimum allowed size is efficient concerning storage
            # or in-memory chunk management.
            # choose the block (chunk) size wisely: if you have a lot of data and you cut
            # it into very small chunks, you are asking for trouble!
            raise ValueError('block_size must not be less than 64 Bytes')
        if block_size > MAX_DATA_SIZE or header_size > MAX_DATA_SIZE:
            raise ValueError('block_size and header_size must not exceed MAX_DATA_SIZE [%d]' % MAX_DATA_SIZE)
        return algo, block_size, header_size
    if algo == 'default' and count == 1:  # default
        return CHUNKER_PARAMS
    # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
    if algo == CH_BUZHASH and count == 5 or count == 4:  # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
        chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4:])
        if not (chunk_min <= chunk_mask <= chunk_max):
            raise ValueError('required: chunk_min <= chunk_mask <= chunk_max')
        if chunk_min < 6:
            # see comment in 'fixed' algo check
            raise ValueError('min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)')
        if chunk_max > 23:
            raise ValueError('max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)')
        return CH_BUZHASH, chunk_min, chunk_max, chunk_mask, window_size
    raise ValueError('invalid chunker params')
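

# Illustrative examples (assuming CH_BUZHASH == 'buzhash' and CH_FIXED == 'fixed'
# as defined in borg's constants):
#   ChunkerParams('buzhash,19,23,21,4095') -> ('buzhash', 19, 23, 21, 4095)
#   ChunkerParams('19,23,21,4095')         -> ('buzhash', 19, 23, 21, 4095)  # old-style compat
#   ChunkerParams('fixed,4194304')         -> ('fixed', 4194304, 0)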


def FilesCacheMode(s):
    ENTRIES_MAP = dict(ctime='c', mtime='m', size='s', inode='i', rechunk='r', disabled='d')
    VALID_MODES = ('cis', 'ims', 'cs', 'ms', 'cr', 'mr', 'd', 's')  # letters in alpha order
    entries = set(s.strip().split(','))
    if not entries <= set(ENTRIES_MAP):
        raise ValueError('cache mode must be a comma-separated list of: %s' % ','.join(sorted(ENTRIES_MAP)))
    short_entries = {ENTRIES_MAP[entry] for entry in entries}
    mode = ''.join(sorted(short_entries))
    if mode not in VALID_MODES:
        raise ValueError('cache mode short must be one of: %s' % ','.join(VALID_MODES))
    return mode
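

# Illustrative examples (using the mode table above):
#   FilesCacheMode('ctime,size,inode') == 'cis'   # borg's default files cache mode
#   FilesCacheMode('disabled')         == 'd'
#   FilesCacheMode('ctime,mtime') raises ValueError ('cm' is not a valid combination)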


def partial_format(format, mapping):
    """
    Apply format.format_map(mapping) while preserving unknown keys

    Does not support attribute access, indexing and ![rsa] conversions
    """
    for key, value in mapping.items():
        key = re.escape(key)
        format = re.sub(fr'(?<!\{{)((\{{{key}\}})|(\{{{key}:[^\}}]*\}}))',
                        lambda match: match.group(1).format_map(mapping),
                        format)
    return format
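

# Illustrative example: known keys are substituted, unknown placeholders survive
# untouched so they can be filled in later.
#   partial_format('{LF}{unknown}', {'LF': '\n'}) == '\n{unknown}'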


class DatetimeWrapper:
    def __init__(self, dt):
        self.dt = dt

    def __format__(self, format_spec):
        if format_spec == '':
            format_spec = ISO_FORMAT_NO_USECS
        return self.dt.__format__(format_spec)


class PlaceholderError(Error):
    """Formatting Error: "{}".format({}): {}({})"""


class InvalidPlaceholder(PlaceholderError):
    """Invalid placeholder "{}" in string: {}"""


def format_line(format, data):
    for _, key, _, conversion in Formatter().parse(format):
        if not key:
            continue
        if conversion or key not in data:
            raise InvalidPlaceholder(key, format)
    try:
        return format.format_map(data)
    except Exception as e:
        raise PlaceholderError(format, data, e.__class__.__name__, str(e))


def replace_placeholders(text, overrides={}):
    """Replace placeholders in text with their values."""
    from ..platform import fqdn, hostname, getosusername
    current_time = datetime.now(timezone.utc)
    data = {
        'pid': os.getpid(),
        'fqdn': fqdn,
        'reverse-fqdn': '.'.join(reversed(fqdn.split('.'))),
        'hostname': hostname,
        'now': DatetimeWrapper(current_time.astimezone(None)),
        'utcnow': DatetimeWrapper(current_time),
        'user': getosusername(),
        'uuid4': str(uuid.uuid4()),
        'borgversion': borg_version,
        'borgmajor': '%d' % borg_version_tuple[:1],
        'borgminor': '%d.%d' % borg_version_tuple[:2],
        'borgpatch': '%d.%d.%d' % borg_version_tuple[:3],
        **overrides,
    }
    return format_line(text, data)


PrefixSpec = replace_placeholders

GlobSpec = replace_placeholders

CommentSpec = replace_placeholders
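

# Illustrative example: on a machine whose hostname is 'myhost', something like
#   replace_placeholders('backup-{hostname}-{now:%Y-%m-%d}')
# might yield 'backup-myhost-2024-01-31'; unknown placeholders raise
# InvalidPlaceholder via format_line().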


def SortBySpec(text):
    from .manifest import AI_HUMAN_SORT_KEYS
    for token in text.split(','):
        if token not in AI_HUMAN_SORT_KEYS:
            raise ValueError('Invalid sort key: %s' % token)
    return text.replace('timestamp', 'ts')


def format_file_size(v, precision=2, sign=False, iec=False):
    """Format file size into a human friendly format"""
    fn = sizeof_fmt_iec if iec else sizeof_fmt_decimal
    return fn(v, suffix='B', sep=' ', precision=precision, sign=sign)


class FileSize(int):
    def __new__(cls, value, iec=False):
        obj = int.__new__(cls, value)
        obj.iec = iec
        return obj

    def __format__(self, format_spec):
        return format_file_size(int(self), iec=self.iec).__format__(format_spec)


def parse_file_size(s):
    """Return int from file size (1234, 55G, 1.7T)."""
    if not s:
        return int(s)  # will raise
    suffix = s[-1]
    power = 1000
    try:
        factor = {
            'K': power,
            'M': power**2,
            'G': power**3,
            'T': power**4,
            'P': power**5,
        }[suffix]
        s = s[:-1]
    except KeyError:
        factor = 1
    return int(float(s) * factor)
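

# Illustrative examples (decimal suffixes, as implemented above):
#   parse_file_size('1234') == 1234
#   parse_file_size('55G')  == 55 * 1000**3
#   parse_file_size('1.7T') == int(1.7 * 1000**4)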


def sizeof_fmt(num, suffix='B', units=None, power=None, sep='', precision=2, sign=False):
    sign = '+' if sign and num > 0 else ''
    fmt = '{0:{1}.{2}f}{3}{4}{5}'
    prec = 0
    for unit in units[:-1]:
        if abs(round(num, precision)) < power:
            break
        num /= float(power)
        prec = precision
    else:
        unit = units[-1]
    return fmt.format(num, sign, prec, sep, unit, suffix)


def sizeof_fmt_iec(num, suffix='B', sep='', precision=2, sign=False):
    return sizeof_fmt(num, suffix=suffix, sep=sep, precision=precision, sign=sign,
                      units=['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'], power=1024)


def sizeof_fmt_decimal(num, suffix='B', sep='', precision=2, sign=False):
    return sizeof_fmt(num, suffix=suffix, sep=sep, precision=precision, sign=sign,
                      units=['', 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'], power=1000)
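

# Illustrative examples:
#   sizeof_fmt_decimal(1234, suffix='B', sep=' ') == '1.23 kB'
#   sizeof_fmt_iec(1024, suffix='B', sep=' ')     == '1.00 KiB'
#   sizeof_fmt_decimal(999, suffix='B', sep=' ')  == '999 B'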


def format_archive(archive):
    return '%-36s %s [%s]' % (
        archive.name,
        format_time(to_localtime(archive.ts)),
        bin_to_hex(archive.id),
    )


def parse_stringified_list(s):
    l = re.split(" *, *", s)
    return [item for item in l if item != '']


class Location:
    """Object representing a repository / archive location
    """
    proto = user = _host = port = path = archive = None

    # user must not contain "@", ":" or "/".
    # Quoting adduser error message:
    # "To avoid problems, the username should consist only of letters, digits,
    # underscores, periods, at signs and dashes, and not start with a dash
    # (as defined by IEEE Std 1003.1-2001)."
    # We use "@" as separator between username and hostname, so we must
    # disallow it within the pure username part.
    optional_user_re = r"""
        (?:(?P<user>[^@:/]+)@)?
    """

    # path must not contain :: (it ends at :: or string end), but may contain single colons.
    # to avoid ambiguities with other regexes, it must also not start with ":" nor with "//" nor with "ssh://".
    local_path_re = r"""
        (?!(:|//|ssh://))           # not starting with ":" or // or ssh://
        (?P<path>([^:]|(:(?!:)))+)  # any chars, but no "::"
    """

    # file_path must not contain :: (it ends at :: or string end), but may contain single colons.
    # it must start with a / and that slash is part of the path.
    file_path_re = r"""
        (?P<path>(([^/]*)/([^:]|(:(?!:)))+))  # start opt. servername, then /, then any chars, but no "::"
    """

    # abs_path must not contain :: (it ends at :: or string end), but may contain single colons.
    # it must start with a / and that slash is part of the path.
    abs_path_re = r"""
        (?P<path>(/([^:]|(:(?!:)))+))  # start with /, then any chars, but no "::"
    """

    # optional ::archive_name at the end, archive name must not contain "/".
    # borg mount's FUSE filesystem creates one level of directories from
    # the archive names and of course "/" is not valid in a directory name.
    optional_archive_re = r"""
        (?:
            ::                  # "::" as separator
            (?P<archive>[^/]+)  # archive name must not contain "/"
        )?$"""                  # must match until the end

    # host NAME, or host IP ADDRESS (v4 or v6, v6 must be in square brackets)
    host_re = r"""
        (?P<host>(
            (?!\[)[^:/]+(?<!\])  # hostname or v4 addr, not containing : or / (does not match v6 addr: no brackets!)
            |
            \[[0-9a-fA-F:.]+\])  # ipv6 address in brackets
        )
    """

    # regexes for misc. kinds of supported location specifiers:
    ssh_re = re.compile(r"""
        (?P<proto>ssh)://                       # ssh://
        """ + optional_user_re + host_re + r""" # user@ (optional), host name or address
        (?::(?P<port>\d+))?                     # :port (optional)
        """ + abs_path_re + optional_archive_re, re.VERBOSE)  # path or path::archive

    file_re = re.compile(r"""
        (?P<proto>file)://                      # file://
        """ + file_path_re + optional_archive_re, re.VERBOSE)  # servername/path, path or path::archive

    local_re = re.compile(
        local_path_re + optional_archive_re, re.VERBOSE)  # local path with optional archive

    # get the repo from BORG_REPO env and the optional archive from param.
    # if the syntax requires giving REPOSITORY (see "borg mount"),
    # use "::" to let it use the env var.
    # if REPOSITORY argument is optional, it'll automatically use the env.
    env_re = re.compile(r"""  # the repo part is fetched from BORG_REPO
        (?:::$)               # just "::" is ok (when a pos. arg is required, no archive)
        |                     # or
        """ + optional_archive_re, re.VERBOSE)  # archive name (optional, may be empty)

    win_file_re = re.compile(r"""
        (?:file://)?        # optional file protocol
        (?P<path>
            (?:[a-zA-Z]:)?  # Drive letter followed by a colon (optional)
            (?:[^:]+)       # Anything which does not contain a :, at least one character
        )
        """ + optional_archive_re, re.VERBOSE)  # archive name (optional, may be empty)

    def __init__(self, text='', overrides={}, other=False):
        self.repo_env_var = 'BORG_OTHER_REPO' if other else 'BORG_REPO'
        if not self.parse(text, overrides):
            raise ValueError('Invalid location format: "%s"' % self.processed)

    def parse(self, text, overrides={}):
        self.raw = text  # as given by user, might contain placeholders
        self.processed = text = replace_placeholders(text, overrides)  # after placeholder replacement
        valid = self._parse(text)
        if valid:
            return True
        m = self.env_re.match(text)
        if not m:
            return False
        repo_raw = os.environ.get(self.repo_env_var)
        if repo_raw is None:
            return False
        repo = replace_placeholders(repo_raw, overrides)
        valid = self._parse(repo)
        self.archive = m.group('archive')
        self.raw = repo_raw if not self.archive else repo_raw + self.raw
        self.processed = repo if not self.archive else f'{repo}::{self.archive}'
        return valid

    def _parse(self, text):
        def normpath_special(p):
            # avoid that normpath strips away our relative path hack and even makes p absolute
            relative = p.startswith('/./')
            p = os.path.normpath(p)
            return ('/.' + p) if relative else p

        if is_win32:
            m = self.win_file_re.match(text)
            if m:
                self.proto = 'file'
                self.path = m.group('path')
                self.archive = m.group('archive')
                return True
            # On windows we currently only support windows paths
            return False

        m = self.ssh_re.match(text)
        if m:
            self.proto = m.group('proto')
            self.user = m.group('user')
            self._host = m.group('host')
            self.port = m.group('port') and int(m.group('port')) or None
            self.path = normpath_special(m.group('path'))
            self.archive = m.group('archive')
            return True
        m = self.file_re.match(text)
        if m:
            self.proto = m.group('proto')
            self.path = normpath_special(m.group('path'))
            self.archive = m.group('archive')
            return True
        m = self.local_re.match(text)
        if m:
            self.path = normpath_special(m.group('path'))
            self.archive = m.group('archive')
            self.proto = 'file'
            return True
        return False

    def __str__(self):
        items = [
            'proto=%r' % self.proto,
            'user=%r' % self.user,
            'host=%r' % self.host,
            'port=%r' % self.port,
            'path=%r' % self.path,
            'archive=%r' % self.archive,
        ]
        return ', '.join(items)

    def to_key_filename(self):
        name = re.sub(r'[^\w]', '_', self.path).strip('_')
        if self.proto != 'file':
            name = re.sub(r'[^\w]', '_', self.host) + '__' + name
        if len(name) > 100:
            # Limit file names to some reasonable length. Most file systems
            # limit them to 255 [unit of choice]; due to variations in unicode
            # handling we truncate to 100 *characters*.
            name = name[:100]
        return os.path.join(get_keys_dir(), name)

    def __repr__(self):
        return "Location(%s)" % self

    @property
    def host(self):
        # strip square brackets used for IPv6 addrs
        if self._host is not None:
            return self._host.lstrip('[').rstrip(']')

    def canonical_path(self):
        if self.proto == 'file':
            return self.path
        else:
            if self.path and self.path.startswith('~'):
                path = '/' + self.path  # /~/x = path x relative to home dir
            elif self.path and not self.path.startswith('/'):
                path = '/./' + self.path  # /./x = path x relative to cwd
            else:
                path = self.path
            return 'ssh://{}{}{}{}'.format(f'{self.user}@' if self.user else '',
                                           self._host,  # needed for ipv6 addrs
                                           f':{self.port}' if self.port else '',
                                           path)

    def with_timestamp(self, timestamp):
        return Location(self.raw, overrides={
            'now': DatetimeWrapper(timestamp.astimezone(None)),
            'utcnow': DatetimeWrapper(timestamp),
        })

    def omit_archive(self):
        loc = Location(self.raw)
        loc.archive = None
        loc.raw = loc.raw.split("::")[0]
        loc.processed = loc.processed.split("::")[0]
        return loc
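

# Illustrative examples of the location syntaxes matched above:
#   Location('ssh://user@host:2222/path/to/repo::archive')
#       -> proto='ssh', user='user', host='host', port=2222,
#          path='/path/to/repo', archive='archive'
#   Location('/path/to/repo')
#       -> proto='file', path='/path/to/repo', archive=None
#   Location('::archive') takes the repository part from the BORG_REPO env var.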


def location_validator(archive=None, proto=None, other=False):
    def validator(text):
        try:
            loc = Location(text, other=other)
        except ValueError as err:
            raise argparse.ArgumentTypeError(str(err)) from None
        if archive is True and not loc.archive:
            raise argparse.ArgumentTypeError('"%s": No archive specified' % text)
        elif archive is False and loc.archive:
            raise argparse.ArgumentTypeError('"%s": No archive can be specified' % text)
        if proto is not None and loc.proto != proto:
            if proto == 'file':
                raise argparse.ArgumentTypeError('"%s": Repository must be local' % text)
            else:
                raise argparse.ArgumentTypeError('"%s": Repository must be remote' % text)
        return loc
    return validator


def archivename_validator():
    def validator(text):
        text = replace_placeholders(text)
        if '/' in text or '::' in text or not text:
            raise argparse.ArgumentTypeError('Invalid archive name: "%s"' % text)
        return text
    return validator


class BaseFormatter:
    FIXED_KEYS = {
        # Formatting aids
        'LF': '\n',
        'SPACE': ' ',
        'TAB': '\t',
        'CR': '\r',
        'NUL': '\0',
        'NEWLINE': os.linesep,
        'NL': os.linesep,
    }

    def get_item_data(self, item):
        raise NotImplementedError

    def format_item(self, item):
        return self.format.format_map(self.get_item_data(item))

    @staticmethod
    def keys_help():
        return "- NEWLINE: OS dependent line separator\n" \
               "- NL: alias of NEWLINE\n" \
               "- NUL: NUL character for creating print0 / xargs -0 like output, see barchive and bpath keys below\n" \
               "- SPACE\n" \
               "- TAB\n" \
               "- CR\n" \
               "- LF"


class ArchiveFormatter(BaseFormatter):
    KEY_DESCRIPTIONS = {
        'archive': 'archive name interpreted as text (might be missing non-text characters, see barchive)',
        'name': 'alias of "archive"',
        'barchive': 'verbatim archive name, can contain any character except NUL',
        'comment': 'archive comment interpreted as text (might be missing non-text characters, see bcomment)',
        'bcomment': 'verbatim archive comment, can contain any character except NUL',
        # *start* is the key used by borg-info for this timestamp, this makes the formats more compatible
        'start': 'time (start) of creation of the archive',
        'time': 'alias of "start"',
        'end': 'time (end) of creation of the archive',
        'command_line': 'command line which was used to create the archive',
        'id': 'internal ID of the archive',
        'hostname': 'hostname of host on which this archive was created',
        'username': 'username of user who created this archive',
    }
    KEY_GROUPS = (
        ('archive', 'name', 'barchive', 'comment', 'bcomment', 'id'),
        ('start', 'time', 'end', 'command_line'),
        ('hostname', 'username'),
    )

    @classmethod
    def available_keys(cls):
        from .manifest import ArchiveInfo
        fake_archive_info = ArchiveInfo('archivename', b'\1' * 32, datetime(1970, 1, 1, tzinfo=timezone.utc))
        formatter = cls('', None, None, None)
        keys = []
        keys.extend(formatter.call_keys.keys())
        keys.extend(formatter.get_item_data(fake_archive_info).keys())
        return keys

    @classmethod
    def keys_help(cls):
        help = []
        keys = cls.available_keys()
        for key in cls.FIXED_KEYS:
            keys.remove(key)
        for group in cls.KEY_GROUPS:
            for key in group:
                keys.remove(key)
                text = "- " + key
                if key in cls.KEY_DESCRIPTIONS:
                    text += ": " + cls.KEY_DESCRIPTIONS[key]
                help.append(text)
            help.append("")
        assert not keys, str(keys)
        return "\n".join(help)

    def __init__(self, format, repository, manifest, key, *, json=False, iec=False):
        self.repository = repository
        self.manifest = manifest
        self.key = key
        self.name = None
        self.id = None
        self._archive = None
        self.json = json
        self.iec = iec
        static_keys = {}  # here could be stuff on repo level, above archive level
        static_keys.update(self.FIXED_KEYS)
        self.format = partial_format(format, static_keys)
        self.format_keys = {f[1] for f in Formatter().parse(format)}
        self.call_keys = {
            'hostname': partial(self.get_meta, 'hostname', rs=True),
            'username': partial(self.get_meta, 'username', rs=True),
            'comment': partial(self.get_meta, 'comment', rs=True),
            'bcomment': partial(self.get_meta, 'comment', rs=False),
            'end': self.get_ts_end,
            'command_line': self.get_cmdline,
        }
        self.used_call_keys = set(self.call_keys) & self.format_keys
        if self.json:
            self.item_data = {}
            self.format_item = self.format_item_json
        else:
            self.item_data = static_keys

    def format_item_json(self, item):
        return json.dumps(self.get_item_data(item), cls=BorgJsonEncoder) + '\n'

    def get_item_data(self, archive_info):
        self.name = archive_info.name
        self.id = archive_info.id
        item_data = {}
        item_data.update(self.item_data)
        item_data.update({
            'name': remove_surrogates(archive_info.name),
            'archive': remove_surrogates(archive_info.name),
            'barchive': archive_info.name,
            'id': bin_to_hex(archive_info.id),
            'time': self.format_time(archive_info.ts),
            'start': self.format_time(archive_info.ts),
        })
        for key in self.used_call_keys:
            item_data[key] = self.call_keys[key]()
        return item_data

    @property
    def archive(self):
        """lazy load / update loaded archive"""
        if self._archive is None or self._archive.id != self.id:
            from ..archive import Archive
            self._archive = Archive(self.repository, self.key, self.manifest, self.name, iec=self.iec)
        return self._archive

    def get_meta(self, key, rs):
        value = self.archive.metadata.get(key, '')
        return remove_surrogates(value) if rs else value

    def get_cmdline(self):
        cmdline = map(remove_surrogates, self.archive.metadata.get('cmdline', []))
        if self.json:
            return list(cmdline)
        else:
            return ' '.join(map(shlex.quote, cmdline))

    def get_ts_end(self):
        return self.format_time(self.archive.ts_end)

    def format_time(self, ts):
        return OutputTimestamp(ts)


class ItemFormatter(BaseFormatter):
    # we provide the hash algos from python stdlib (except shake_*) and additionally xxh64.
    # shake_* is not provided because it uses an incompatible .digest() method to support variable length.
    hash_algorithms = hashlib.algorithms_guaranteed.union({'xxh64'}).difference({'shake_128', 'shake_256'})
    KEY_DESCRIPTIONS = {
        'bpath': 'verbatim POSIX path, can contain any character except NUL',
        'path': 'path interpreted as text (might be missing non-text characters, see bpath)',
        'source': 'link target for symlinks (identical to linktarget)',
        'hlid': 'hard link identity (same if hardlinking same fs object)',
        'extra': 'prepends {source} with " -> " for soft links and " link to " for hard links',
        'dsize': 'deduplicated size',
        'num_chunks': 'number of chunks in this file',
        'unique_chunks': 'number of unique chunks in this file',
        'xxh64': 'XXH64 checksum of this file (note: this is NOT a cryptographic hash!)',
        'health': 'either "healthy" (file ok) or "broken" (if file has all-zero replacement chunks)',
    }
    KEY_GROUPS = (
        ('type', 'mode', 'uid', 'gid', 'user', 'group', 'path', 'bpath', 'source', 'linktarget', 'hlid', 'flags'),
        ('size', 'dsize', 'num_chunks', 'unique_chunks'),
        ('mtime', 'ctime', 'atime', 'isomtime', 'isoctime', 'isoatime'),
        tuple(sorted(hash_algorithms)),
        ('archiveid', 'archivename', 'extra'),
        ('health', )
    )
    KEYS_REQUIRING_CACHE = (
        'dsize', 'unique_chunks',
    )

    @classmethod
    def available_keys(cls):
        class FakeArchive:
            fpr = name = ""

        from ..item import Item
        fake_item = Item(mode=0, path='', user='', group='', mtime=0, uid=0, gid=0)
        formatter = cls(FakeArchive, "")
        keys = []
        keys.extend(formatter.call_keys.keys())
        keys.extend(formatter.get_item_data(fake_item).keys())
        return keys

    @classmethod
    def keys_help(cls):
        help = []
        keys = cls.available_keys()
        for key in cls.FIXED_KEYS:
            keys.remove(key)
        for group in cls.KEY_GROUPS:
            for key in group:
                keys.remove(key)
                text = "- " + key
                if key in cls.KEY_DESCRIPTIONS:
                    text += ": " + cls.KEY_DESCRIPTIONS[key]
                help.append(text)
            help.append("")
        assert not keys, str(keys)
        return "\n".join(help)

    @classmethod
    def format_needs_cache(cls, format):
        format_keys = {f[1] for f in Formatter().parse(format)}
        return any(key in cls.KEYS_REQUIRING_CACHE for key in format_keys)

    def __init__(self, archive, format, *, json_lines=False):
        from ..checksums import StreamingXXH64
        self.xxh64 = StreamingXXH64
        self.archive = archive
        self.json_lines = json_lines
        static_keys = {
            'archivename': archive.name,
            'archiveid': archive.fpr,
        }
        static_keys.update(self.FIXED_KEYS)
        if self.json_lines:
            self.item_data = {}
            self.format_item = self.format_item_json
        else:
            self.item_data = static_keys
        self.format = partial_format(format, static_keys)
        self.format_keys = {f[1] for f in Formatter().parse(format)}
        self.call_keys = {
            'size': self.calculate_size,
            'dsize': partial(self.sum_unique_chunks_metadata, lambda chunk: chunk.size),
            'num_chunks': self.calculate_num_chunks,
            'unique_chunks': partial(self.sum_unique_chunks_metadata, lambda chunk: 1),
            'isomtime': partial(self.format_iso_time, 'mtime'),
            'isoctime': partial(self.format_iso_time, 'ctime'),
            'isoatime': partial(self.format_iso_time, 'atime'),
            'mtime': partial(self.format_time, 'mtime'),
            'ctime': partial(self.format_time, 'ctime'),
            'atime': partial(self.format_time, 'atime'),
        }
        for hash_function in self.hash_algorithms:
            self.call_keys[hash_function] = partial(self.hash_item, hash_function)
        self.used_call_keys = set(self.call_keys) & self.format_keys

    def format_item_json(self, item):
        return json.dumps(self.get_item_data(item), cls=BorgJsonEncoder) + '\n'

    def get_item_data(self, item):
        item_data = {}
        item_data.update(self.item_data)
        mode = stat.filemode(item.mode)
        item_type = mode[0]

        source = item.get('source', '')
        extra = ''
        if source:
            source = remove_surrogates(source)
            extra = ' -> %s' % source
        hlid = item.get('hlid')
        hlid = bin_to_hex(hlid) if hlid else ''
        item_data['type'] = item_type
        item_data['mode'] = mode
        item_data['user'] = item.get('user', str(item.uid))
        item_data['group'] = item.get('group', str(item.gid))
        item_data['uid'] = item.uid
        item_data['gid'] = item.gid
        item_data['path'] = remove_surrogates(item.path)
        if self.json_lines:
            item_data['healthy'] = 'chunks_healthy' not in item
        else:
            item_data['bpath'] = item.path
            item_data['extra'] = extra
            item_data['health'] = 'broken' if 'chunks_healthy' in item else 'healthy'
        item_data['source'] = source
        item_data['linktarget'] = source
        item_data['hlid'] = hlid
        item_data['flags'] = item.get('bsdflags')
        for key in self.used_call_keys:
            item_data[key] = self.call_keys[key](item)
        return item_data

    def sum_unique_chunks_metadata(self, metadata_func, item):
        """
        sum unique chunks metadata, a unique chunk is a chunk which is referenced globally as often as it is in the
        item

        item: The item to sum its unique chunks' metadata
        metadata_func: A function that takes a parameter of type ChunkIndexEntry and returns a number, used to return
                       the metadata needed from the chunk
        """
        chunk_index = self.archive.cache.chunks
        chunks = item.get('chunks', [])
        chunks_counter = Counter(c.id for c in chunks)
        return sum(metadata_func(c) for c in chunks if chunk_index[c.id].refcount == chunks_counter[c.id])

    def calculate_num_chunks(self, item):
        return len(item.get('chunks', []))

    def calculate_size(self, item):
        # note: does not support hardlink slaves, they will be size 0
        return item.get_size()

    def hash_item(self, hash_function, item):
        if 'chunks' not in item:
            return ""
        if hash_function == 'xxh64':
            hash = self.xxh64()
        elif hash_function in self.hash_algorithms:
            hash = hashlib.new(hash_function)
        for data in self.archive.pipeline.fetch_many([c.id for c in item.chunks]):
            hash.update(data)
        return hash.hexdigest()

    def format_time(self, key, item):
        return OutputTimestamp(safe_timestamp(item.get(key) or item.mtime))

    def format_iso_time(self, key, item):
        return self.format_time(key, item).isoformat()


def file_status(mode):
    if stat.S_ISREG(mode):
        return 'A'
    elif stat.S_ISDIR(mode):
        return 'd'
    elif stat.S_ISBLK(mode):
        return 'b'
    elif stat.S_ISCHR(mode):
        return 'c'
    elif stat.S_ISLNK(mode):
        return 's'
    elif stat.S_ISFIFO(mode):
        return 'f'
    return '?'
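

# Illustrative examples: regular files map to 'A' (added), directories to 'd',
# symlinks to 's', FIFOs to 'f', unknown file types to '?', e.g.:
#   file_status(os.stat('/etc/hosts').st_mode) == 'A'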


def clean_lines(lines, lstrip=None, rstrip=None, remove_empty=True, remove_comments=True):
    """
    clean lines (usually read from a config file):

    1. strip whitespace (left and right), 2. remove empty lines, 3. remove comments.

    note: only "pure comment lines" are supported, no support for "trailing comments".

    :param lines: input line iterator (e.g. list or open text file) that gives unclean input lines
    :param lstrip: lstrip call arguments or False, if lstripping is not desired
    :param rstrip: rstrip call arguments or False, if rstripping is not desired
    :param remove_comments: remove comment lines (lines starting with "#")
    :param remove_empty: remove empty lines
    :return: yields processed lines
    """
    for line in lines:
        if lstrip is not False:
            line = line.lstrip(lstrip)
        if rstrip is not False:
            line = line.rstrip(rstrip)
        if remove_empty and not line:
            continue
        if remove_comments and line.startswith('#'):
            continue
        yield line
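

# Illustrative example:
#   list(clean_lines(['# a comment', '', '  keep me  '])) == ['keep me']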


def swidth_slice(string, max_width):
    """
    Return a slice of *max_width* cells from *string*.

    Negative *max_width* means from the end of string.

    *max_width* is in units of character cells (or "columns").
    Latin characters are usually one cell wide, many CJK characters are two cells wide.
    """
    from ..platform import swidth
    reverse = max_width < 0
    max_width = abs(max_width)
    if reverse:
        string = reversed(string)
    current_swidth = 0
    result = []
    for character in string:
        current_swidth += swidth(character)
        if current_swidth > max_width:
            break
        result.append(character)
    if reverse:
        result.reverse()
    return ''.join(result)
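

# Illustrative examples (Latin characters are one cell wide):
#   swidth_slice('borgbackup', 4)  == 'borg'
#   swidth_slice('borgbackup', -6) == 'backup'
# CJK characters count as two cells, so fewer of them fit into the same width.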


def ellipsis_truncate(msg, space):
    """
    shorten a long string by replacing its middle part with an ellipsis, example:
    this_is_a_very_long_string -------> this_is..._string
    """
    from ..platform import swidth
    ellipsis_width = swidth('...')
    msg_width = swidth(msg)
    if space < 8:
        # if there is very little space, just show ...
        return '...' + ' ' * (space - ellipsis_width)
    if space < ellipsis_width + msg_width:
        return '{}...{}'.format(swidth_slice(msg, space // 2 - ellipsis_width),
                                swidth_slice(msg, -space // 2))
    return msg + ' ' * (space - msg_width)


class BorgJsonEncoder(json.JSONEncoder):
    def default(self, o):
        from ..repository import Repository
        from ..remote import RemoteRepository
        from ..archive import Archive
        from ..cache import LocalCache, AdHocCache
        if isinstance(o, Repository) or isinstance(o, RemoteRepository):
            return {
                'id': bin_to_hex(o.id),
                'location': o._location.canonical_path(),
            }
        if isinstance(o, Archive):
            return o.info()
        if isinstance(o, LocalCache):
            return {
                'path': o.path,
                'stats': o.stats(),
            }
        if isinstance(o, AdHocCache):
            return {
                'stats': o.stats(),
            }
        if callable(getattr(o, 'to_json', None)):
            return o.to_json()
        return super().default(o)


def basic_json_data(manifest, *, cache=None, extra=None):
    key = manifest.key
    data = extra or {}
    data.update({
        'repository': BorgJsonEncoder().default(manifest.repository),
        'encryption': {
            'mode': key.ARG_NAME,
        },
    })
    data['repository']['last_modified'] = OutputTimestamp(manifest.last_timestamp.replace(tzinfo=timezone.utc))
    if key.NAME.startswith('key file'):
        data['encryption']['keyfile'] = key.find_key()
    if cache:
        data['cache'] = cache
    return data


def json_dump(obj):
    """Dump using BorgJSONEncoder."""
    return json.dumps(obj, sort_keys=True, indent=4, cls=BorgJsonEncoder)


def json_print(obj):
    print(json_dump(obj))


def prepare_dump_dict(d):
    def decode_bytes(value):
        # this should somehow be reversible later, but usual strings should
        # look nice and chunk ids should mostly show in hex. Use a special
        # inband signaling character (ASCII DEL) to distinguish between
        # decoded and hex mode.
        if not value.startswith(b'\x7f'):
            try:
                value = value.decode()
                return value
            except UnicodeDecodeError:
                pass
        return '\u007f' + bin_to_hex(value)

    def decode_tuple(t):
        res = []
        for value in t:
            if isinstance(value, dict):
                value = decode(value)
            elif isinstance(value, tuple) or isinstance(value, list):
                value = decode_tuple(value)
            elif isinstance(value, bytes):
                value = decode_bytes(value)
            res.append(value)
        return res

    def decode(d):
        res = OrderedDict()
        for key, value in d.items():
            if isinstance(value, dict):
                value = decode(value)
            elif isinstance(value, (tuple, list)):
                value = decode_tuple(value)
            elif isinstance(value, bytes):
                value = decode_bytes(value)
            elif isinstance(value, Timestamp):
                value = value.to_unix_nano()
            if isinstance(key, bytes):
                key = key.decode()
            res[key] = value
        return res

    return decode(d)
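

# Illustrative example: decodable bytes become plain strings, undecodable bytes
# are hex-encoded and tagged with ASCII DEL ('\x7f') so the two forms stay
# distinguishable:
#   prepare_dump_dict({b'name': b'foo', b'id': b'\xff\xfe'})
#       == OrderedDict([('name', 'foo'), ('id', '\x7f' + 'fffe')])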