@@ -25,7 +25,7 @@ from .locking import Lock, LockError, LockErrorT
 from .logger import create_logger
 from .lrucache import LRUCache
 from .platform import SaveFile, SyncFile, sync_dir, safe_fadvise
-from .checksums import crc32
+from .checksums import crc32, StreamingXXH64
 from .crypto.file_integrity import IntegrityCheckedFile, FileIntegrityError
 
 logger = create_logger(__name__)
@@ -34,9 +34,11 @@ MAGIC = b'BORG_SEG'
 MAGIC_LEN = len(MAGIC)
 ATTIC_MAGIC = b'ATTICSEG'
 assert len(ATTIC_MAGIC) == MAGIC_LEN
+
 TAG_PUT = 0
 TAG_DELETE = 1
 TAG_COMMIT = 2
+TAG_PUT2 = 3
 
 # Highest ID usable as TAG_* value
 #
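The new TAG_PUT2 entry differs from TAG_PUT by an 8-byte entry hash between the key and the data, and by what the crc32 covers. A minimal sketch of the two on-disk layouts, assuming the 9-byte entry header is crc32/size/tag (an assumption consistent with HEADER_ID_SIZE = header_fmt.size + 32 = 41 and ENTRY_HASH_SIZE = 8 further down in this patch):

    # TAG_PUT:  crc32(4) | size(4) | tag(1) | key(32) | data
    #           crc32 covers size, tag, key and all of the data.
    # TAG_PUT2: crc32(4) | size(4) | tag(1) | key(32) | entry_hash(8) | data
    #           crc32 covers only size, tag, key and entry_hash;
    #           the xxh64 entry hash covers size, tag, key and all of the data.

Because the PUT2 crc32 no longer depends on the data, an entry header can be validated without reading the (possibly large) data field.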
@@ -788,7 +790,7 @@ class Repository:
                     continue
                 in_index = self.index.get(key)
                 is_index_object = in_index == (segment, offset)
-                if tag == TAG_PUT and is_index_object:
+                if tag in (TAG_PUT2, TAG_PUT) and is_index_object:
                     try:
                         new_segment, offset = self.io.write_put(key, data, raise_full=True)
                     except LoggedIO.SegmentFull:
@@ -798,7 +800,10 @@ class Repository:
                     segments.setdefault(new_segment, 0)
                     segments[new_segment] += 1
                     segments[segment] -= 1
-                elif tag == TAG_PUT and not is_index_object:
+                    if tag == TAG_PUT:
+                        # the old entry is PUT, but the new one will be PUT2 and use a bit more storage
+                        self.storage_quota_use += self.io.ENTRY_HASH_SIZE
+                elif tag in (TAG_PUT2, TAG_PUT) and not is_index_object:
                     # If this is a PUT shadowed by a later tag, then it will be gone when this segment is deleted after
                     # this loop. Therefore it is removed from the shadow index.
                     try:
@@ -807,7 +812,10 @@ class Repository:
                         # do not remove entry with empty shadowed_segments list here,
                         # it is needed for shadowed_put_exists code (see below)!
                         pass
-                    self.storage_quota_use -= len(data) + self.io.HEADER_ID_SIZE
+                    if tag == TAG_PUT2:
+                        self.storage_quota_use -= len(data) + self.io.HEADER_ID_SIZE + self.io.ENTRY_HASH_SIZE
+                    elif tag == TAG_PUT:
+                        self.storage_quota_use -= len(data) + self.io.HEADER_ID_SIZE
                 elif tag == TAG_DELETE and not in_index:
                     # If the shadow index doesn't contain this key, then we can't say if there's a shadowed older tag,
                     # therefore we do not drop the delete, but write it to a current segment.
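These two hunks keep the quota accounting symmetric: compaction rewrites every surviving PUT as a PUT2 (charging ENTRY_HASH_SIZE extra), and freeing an entry must subtract exactly what its tag originally charged. A quick worked example, using the sizes from this patch (HEADER_ID_SIZE = 41, ENTRY_HASH_SIZE = 8) and a hypothetical 1000-byte chunk:

    # PUT entry:  1000 + 41     = 1041 bytes charged against the quota
    # PUT2 entry: 1000 + 41 + 8 = 1049 bytes
    # rewriting a PUT as a PUT2 during compaction therefore adds 8 bytes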
@@ -830,7 +838,7 @@
                     # Consider the following series of operations if we would not do this, ie. this entire if:
                     # would be removed.
                     # Columns are segments, lines are different keys (line 1 = some key, line 2 = some other key)
-                    # Legend: P=TAG_PUT, D=TAG_DELETE, c=commit, i=index is written for latest commit
+                    # Legend: P=TAG_PUT/TAG_PUT2, D=TAG_DELETE, c=commit, i=index is written for latest commit
                     #
                     # Segment | 1     | 2   | 3
                     # --------+-------+-----+------
@@ -899,7 +907,7 @@
         """some code shared between replay_segments and check"""
         self.segments[segment] = 0
         for tag, key, offset, size in objects:
-            if tag == TAG_PUT:
+            if tag in (TAG_PUT2, TAG_PUT):
                 try:
                     # If this PUT supersedes an older PUT, mark the old segment for compaction and count the free space
                     s, _ = self.index[key]
@@ -950,7 +958,7 @@
 
             self.compact[segment] = 0
             for tag, key, offset, size in self.io.iter_objects(segment, read_data=False):
-                if tag == TAG_PUT:
+                if tag in (TAG_PUT2, TAG_PUT):
                     if self.index.get(key, (-1, -1)) != (segment, offset):
                         # This PUT is superseded later
                         self.compact[segment] += size
@@ -1169,7 +1177,7 @@
                     # also, for the next segment, we need to start at offset 0.
                     start_offset = 0
                     continue
-                if tag == TAG_PUT and (segment, offset) == self.index.get(id):
+                if tag in (TAG_PUT2, TAG_PUT) and (segment, offset) == self.index.get(id):
                     # we have found an existing and current object
                     result.append(id)
                     if len(result) == limit:
@@ -1208,7 +1216,7 @@
             # be in the repo index (and we won't need it in the shadow_index).
             self._delete(id, segment, offset, update_shadow_index=False)
         segment, offset = self.io.write_put(id, data)
-        self.storage_quota_use += len(data) + self.io.HEADER_ID_SIZE
+        self.storage_quota_use += len(data) + self.io.HEADER_ID_SIZE + self.io.ENTRY_HASH_SIZE
         self.segments.setdefault(segment, 0)
         self.segments[segment] += 1
         self.index[id] = segment, offset
@@ -1278,6 +1286,7 @@ class LoggedIO:
     COMMIT = crc_fmt.pack(crc32(_commit)) + _commit
 
     HEADER_ID_SIZE = header_fmt.size + 32
+    ENTRY_HASH_SIZE = 8
 
     def __init__(self, path, limit, segments_per_dir, capacity=90):
         self.path = path
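ENTRY_HASH_SIZE = 8 is the digest size of XXH64, which StreamingXXH64 (imported above from .checksums) computes incrementally. A sketch of the equivalent computation, using the third-party xxhash package as a stand-in for borg's own wrapper (an assumption; the real hashing goes through StreamingXXH64):

    import xxhash

    h = xxhash.xxh64()
    for part in (b'size+tag', b'key', b'data'):  # hypothetical entry fields
        h.update(part)
    assert len(h.digest()) == 8  # == ENTRY_HASH_SIZE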
@@ -1475,7 +1484,8 @@
         Return object iterator for *segment*.
 
         If read_data is False then include_data must be False as well.
-        Integrity checks are skipped: all data obtained from the iterator must be considered informational.
+
+        See the _read() docstring about confidence in the returned data.
 
         The iterator returns four-tuples of (tag, key, offset, data|size).
         """
@@ -1491,7 +1501,7 @@
             header = fd.read(self.header_fmt.size)
             while header:
                 size, tag, key, data = self._read(fd, header, segment, offset,
-                                                  (TAG_PUT, TAG_DELETE, TAG_COMMIT),
+                                                  (TAG_PUT2, TAG_DELETE, TAG_COMMIT, TAG_PUT),
                                                   read_data=read_data)
                 if include_data:
                     yield tag, key, offset, data
@@ -1528,8 +1538,25 @@
         dst_fd.write(MAGIC)
         while len(d) >= self.header_fmt.size:
             crc, size, tag = self.header_fmt.unpack(d[:self.header_fmt.size])
-            if size > MAX_OBJECT_SIZE or tag > MAX_TAG_ID or size < self.header_fmt.size \
-               or size > len(d) or crc32(d[4:size]) & 0xffffffff != crc:
+            size_invalid = size > MAX_OBJECT_SIZE or size < self.header_fmt.size or size > len(d)
+            if size_invalid or tag > MAX_TAG_ID:
+                d = d[1:]
+                continue
+            if tag == TAG_PUT2:
+                c_offset = self.HEADER_ID_SIZE + self.ENTRY_HASH_SIZE
+                # skip if header is invalid
+                if crc32(d[4:c_offset]) & 0xffffffff != crc:
+                    d = d[1:]
+                    continue
+                # skip if content is invalid
+                if self.entry_hash(d[4:self.HEADER_ID_SIZE], d[c_offset:size]) != d[self.HEADER_ID_SIZE:c_offset]:
+                    d = d[1:]
+                    continue
+            elif tag in (TAG_DELETE, TAG_COMMIT, TAG_PUT):
+                if crc32(d[4:size]) & 0xffffffff != crc:
+                    d = d[1:]
+                    continue
+            else:  # tag unknown
                 d = d[1:]
                 continue
             dst_fd.write(d[:size])
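recover_segment() can now vet a PUT2 candidate without trusting its data: with HEADER_ID_SIZE = 41 and ENTRY_HASH_SIZE = 8, c_offset = 49, so the crc32 is checked over d[4:49] (size, tag, key, stored digest) and only then is the digest recomputed over the header tail plus the data. A sketch of the byte windows involved (offsets derived from the constants in this patch; d is the candidate entry as in the code above):

    c_offset = 41 + 8                            # HEADER_ID_SIZE + ENTRY_HASH_SIZE == 49
    crc_covers = d[4:c_offset]                   # size, tag, key, stored digest
    digest_covers = (d[4:41], d[c_offset:size])  # size, tag, key, then the data
    stored_digest = d[41:c_offset]               # the 8 xxh64 bytes being checked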
@@ -1538,72 +1565,108 @@
         del d
         data.release()
 
+    def entry_hash(self, *data):
+        h = StreamingXXH64()
+        for d in data:
+            h.update(d)
+        return h.digest()
+
     def read(self, segment, offset, id, read_data=True):
         """
         Read entry from *segment* at *offset* with *id*.
+        If read_data is False the size of the entry is returned instead.
 
-        If read_data is False the size of the entry is returned instead and integrity checks are skipped.
-        The return value should thus be considered informational.
+        See the _read() docstring about confidence in the returned data.
         """
         if segment == self.segment and self._write_fd:
             self._write_fd.sync()
         fd = self.get_fd(segment)
         fd.seek(offset)
         header = fd.read(self.header_fmt.size)
-        size, tag, key, data = self._read(fd, header, segment, offset, (TAG_PUT,), read_data)
+        size, tag, key, data = self._read(fd, header, segment, offset, (TAG_PUT2, TAG_PUT), read_data)
         if id != key:
             raise IntegrityError('Invalid segment entry header, is not for wanted id [segment {}, offset {}]'.format(
                 segment, offset))
         return data if read_data else size
 
     def _read(self, fd, header, segment, offset, acceptable_tags, read_data=True):
-        # some code shared by read() and iter_objects()
+        """
+        Code shared by read() and iter_objects().
+
+        Confidence in returned data:
+        PUT2 tags, read_data == True: crc32 check (header) plus digest check (header+data)
+        PUT2 tags, read_data == False: crc32 check (header)
+        PUT tags, read_data == True: crc32 check (header+data)
+        PUT tags, read_data == False: crc32 check can not be done, all data obtained must be considered informational
+        """
+        def check_crc32(wanted, header, *data):
+            result = crc32(memoryview(header)[4:])  # skip first 32 bits of the header, they contain the crc.
+            for d in data:
+                result = crc32(d, result)
+            if result & 0xffffffff != wanted:
+                raise IntegrityError(f'Segment entry header checksum mismatch [segment {segment}, offset {offset}]')
+
         # See comment on MAX_TAG_ID for details
         assert max(acceptable_tags) <= MAX_TAG_ID, 'Exceeding MAX_TAG_ID will break backwards compatibility'
+        key = data = None
         fmt = self.header_fmt
         try:
             hdr_tuple = fmt.unpack(header)
         except struct.error as err:
-            raise IntegrityError('Invalid segment entry header [segment {}, offset {}]: {}'.format(
-                segment, offset, err)) from None
+            raise IntegrityError(f'Invalid segment entry header [segment {segment}, offset {offset}]: {err}') from None
         crc, size, tag = hdr_tuple
         length = size - fmt.size  # we already read the header
         if size > MAX_OBJECT_SIZE:
             # if you get this on an archive made with borg < 1.0.7 and millions of files and
             # you need to restore it, you can disable this check by using "if False:" above.
-            raise IntegrityError('Invalid segment entry size {} - too big [segment {}, offset {}]'.format(
-                size, segment, offset))
+            raise IntegrityError(f'Invalid segment entry size {size} - too big [segment {segment}, offset {offset}]')
         if size < fmt.size:
-            raise IntegrityError('Invalid segment entry size {} - too small [segment {}, offset {}]'.format(
-                size, segment, offset))
-        if tag in (TAG_PUT, TAG_DELETE):
+            raise IntegrityError(f'Invalid segment entry size {size} - too small [segment {segment}, offset {offset}]')
+        if tag not in (TAG_PUT2, TAG_DELETE, TAG_COMMIT, TAG_PUT):
+            raise IntegrityError(f'Invalid segment entry header, did not get a known tag '
+                                 f'[segment {segment}, offset {offset}]')
+        if tag not in acceptable_tags:
+            raise IntegrityError(f'Invalid segment entry header, did not get an acceptable tag '
+                                 f'[segment {segment}, offset {offset}]')
+        if tag == TAG_COMMIT:
+            check_crc32(crc, header)
+            # that's all for COMMITs.
+        else:
+            # all other tags (TAG_PUT2, TAG_DELETE, TAG_PUT) have a key
             key = fd.read(32)
             length -= 32
             if len(key) != 32:
-                raise IntegrityError(
-                    'Segment entry key short read [segment {}, offset {}]: expected {}, got {} bytes'.format(
-                        segment, offset, 32, len(key)))
-        else:
-            key = None
-        if read_data and tag == TAG_PUT:
-            data = fd.read(length)
-            if len(data) != length:
-                raise IntegrityError('Segment entry data short read [segment {}, offset {}]: expected {}, got {} bytes'.format(
-                    segment, offset, length, len(data)))
-            if crc32(data, crc32(key, crc32(memoryview(header)[4:]))) & 0xffffffff != crc:
-                raise IntegrityError('Segment entry checksum mismatch [segment {}, offset {}]'.format(
-                    segment, offset))
-        else:
-            data = None
-            if length > 0:
-                oldpos = fd.tell()
-                seeked = fd.seek(length, os.SEEK_CUR) - oldpos
-                if seeked != length:
-                    raise IntegrityError('Segment entry data short seek [segment {}, offset {}]: expected {}, got {} bytes'.format(
-                        segment, offset, length, seeked))
-        if tag not in acceptable_tags:
-            raise IntegrityError('Invalid segment entry header, did not get acceptable tag [segment {}, offset {}]'.format(
-                segment, offset))
+                raise IntegrityError(f'Segment entry key short read [segment {segment}, offset {offset}]: '
+                                     f'expected {32}, got {len(key)} bytes')
+            if tag == TAG_DELETE:
+                check_crc32(crc, header, key)
+                # that's all for DELETEs.
+            else:
+                # we can not do a crc32 check here for TAG_PUT, because its crc32 is computed over header+data!
+                # for that check, see the code below that runs when read_data is True.
+                if tag == TAG_PUT2:
+                    entry_hash = fd.read(self.ENTRY_HASH_SIZE)
+                    length -= self.ENTRY_HASH_SIZE
+                    if len(entry_hash) != self.ENTRY_HASH_SIZE:
+                        raise IntegrityError(f'Segment entry hash short read [segment {segment}, offset {offset}]: '
+                                             f'expected {self.ENTRY_HASH_SIZE}, got {len(entry_hash)} bytes')
+                    check_crc32(crc, header, key, entry_hash)
+                if not read_data:  # seek over data
+                    oldpos = fd.tell()
+                    seeked = fd.seek(length, os.SEEK_CUR) - oldpos
+                    if seeked != length:
+                        raise IntegrityError(f'Segment entry data short seek [segment {segment}, offset {offset}]: '
+                                             f'expected {length}, got {seeked} bytes')
+                else:  # read data!
+                    data = fd.read(length)
+                    if len(data) != length:
+                        raise IntegrityError(f'Segment entry data short read [segment {segment}, offset {offset}]: '
+                                             f'expected {length}, got {len(data)} bytes')
+                    if tag == TAG_PUT2:
+                        if self.entry_hash(memoryview(header)[4:], key, data) != entry_hash:
+                            raise IntegrityError(f'Segment entry hash mismatch [segment {segment}, offset {offset}]')
+                    elif tag == TAG_PUT:
+                        check_crc32(crc, header, key, data)
         return size, tag, key, data
 
     def write_put(self, id, data, raise_full=False):
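To make the _read() trust levels concrete, here is a self-contained sketch that verifies one complete PUT2 entry held in memory. Beyond what the patch states, it assumes the '<IIB' header layout (crc32, size, tag; consistent with HEADER_ID_SIZE == 41), that zlib.crc32 behaves like borg's crc32, and that the xxhash package stands in for StreamingXXH64:

    import struct
    import xxhash
    from zlib import crc32

    HEADER_FMT = struct.Struct('<IIB')  # crc32, size, tag -> 9 bytes (assumed)

    def verify_put2(entry: bytes):
        crc, size, tag = HEADER_FMT.unpack(entry[:9])
        key, stored_hash, data = entry[9:41], entry[41:49], entry[49:size]
        # level 1 (read_data=False): crc32 over size, tag, key and digest only
        if crc32(entry[4:49]) & 0xffffffff != crc:
            raise ValueError('header checksum mismatch')
        # level 2 (read_data=True): xxh64 over size, tag, key and the data
        h = xxhash.xxh64()
        for part in (entry[4:9], key, data):
            h.update(part)
        if h.digest() != stored_hash:
            raise ValueError('entry hash mismatch')
        return key, data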
@@ -1612,11 +1675,13 @@
             # this would push the segment entry size beyond MAX_OBJECT_SIZE.
             raise IntegrityError(f'More than allowed put data [{data_size} > {MAX_DATA_SIZE}]')
         fd = self.get_write_fd(want_new=(id == Manifest.MANIFEST_ID), raise_full=raise_full)
-        size = data_size + self.HEADER_ID_SIZE
+        size = data_size + self.HEADER_ID_SIZE + self.ENTRY_HASH_SIZE
         offset = self.offset
-        header = self.header_no_crc_fmt.pack(size, TAG_PUT)
-        crc = self.crc_fmt.pack(crc32(data, crc32(id, crc32(header))) & 0xffffffff)
-        fd.write(b''.join((crc, header, id, data)))
+        header = self.header_no_crc_fmt.pack(size, TAG_PUT2)
+        entry_hash = self.entry_hash(header, id, data)
+        crc = self.crc_fmt.pack(crc32(entry_hash, crc32(id, crc32(header))) & 0xffffffff)
+        fd.write(b''.join((crc, header, id, entry_hash)))
+        fd.write(data)
         self.offset += size
         return self.segment, offset
 
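write_put() is the mirror image: it hashes the crc-less header, id and data, then checksums header, id and digest, leaving the data out of the crc32. A standalone sketch under the same assumptions as the previous one, plus '<IB' for header_no_crc_fmt (size, tag) and TAG_PUT2 == 3:

    import struct
    import xxhash
    from zlib import crc32

    def build_put2(id: bytes, data: bytes) -> bytes:
        size = len(data) + 41 + 8              # data + HEADER_ID_SIZE + ENTRY_HASH_SIZE
        header = struct.pack('<IB', size, 3)   # size, TAG_PUT2 -- no crc yet
        h = xxhash.xxh64()
        for part in (header, id, data):
            h.update(part)
        entry_hash = h.digest()
        crc = crc32(entry_hash, crc32(id, crc32(header))) & 0xffffffff
        return struct.pack('<I', crc) + header + id + entry_hash + data

Storing the digest before the (potentially huge) data is what lets _read() validate it together with the header, before any data is read.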
@@ -1641,4 +1706,4 @@ class LoggedIO:
         return self.segment - 1  # close_segment() increments it
 
 
-assert LoggedIO.HEADER_ID_SIZE == 41  # see constants.MAX_OBJECT_SIZE
+assert LoggedIO.HEADER_ID_SIZE + LoggedIO.ENTRY_HASH_SIZE == 41 + 8  # see constants.MAX_OBJECT_SIZE