@@ -536,6 +536,10 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
         archive indexes.
         """
         archive_path = os.path.join(self.path, 'chunks.archive.d')
+        # Instrumentation
+        processed_item_metadata_bytes = 0
+        processed_item_metadata_chunks = 0
+        compact_chunks_archive_saved_space = 0
 
         def mkpath(id, suffix=''):
             id_hex = bin_to_hex(id)
@@ -545,8 +549,10 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
         def cached_archives():
             if self.do_cache:
                 fns = os.listdir(archive_path)
-                # filenames with 64 hex digits == 256bit
-                return set(unhexlify(fn) for fn in fns if len(fn) == 64)
+                # filenames with 64 hex digits == 256bit,
+                # or compact indices which are 64 hex digits + ".compact"
+                return set(unhexlify(fn) for fn in fns if len(fn) == 64) | \
+                       set(unhexlify(fn[:64]) for fn in fns if len(fn) == 72 and fn.endswith('.compact'))
             else:
                 return set()
 
@@ -557,14 +563,23 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
             for id in ids:
                 cleanup_cached_archive(id)
 
-        def cleanup_cached_archive(id):
-            os.unlink(mkpath(id))
+        def cleanup_cached_archive(id, cleanup_compact=True):
             try:
+                os.unlink(mkpath(id))
                 os.unlink(mkpath(id) + '.integrity')
             except FileNotFoundError:
                 pass
+            if not cleanup_compact:
+                return
+            try:
+                os.unlink(mkpath(id, suffix='.compact'))
+                os.unlink(mkpath(id, suffix='.compact') + '.integrity')
+            except FileNotFoundError:
+                pass
 
         def fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx):
+            nonlocal processed_item_metadata_bytes
+            nonlocal processed_item_metadata_chunks
             csize, data = decrypted_repository.get(archive_id)
             chunk_idx.add(archive_id, 1, len(data), csize)
             archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
@@ -573,18 +588,54 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
             sync = CacheSynchronizer(chunk_idx)
             for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)):
                 chunk_idx.add(item_id, 1, len(data), csize)
+                processed_item_metadata_bytes += len(data)
+                processed_item_metadata_chunks += 1
                 sync.feed(data)
             if self.do_cache:
-                fn = mkpath(archive_id)
-                fn_tmp = mkpath(archive_id, suffix='.tmp')
+                write_archive_index(archive_id, chunk_idx)
+
+        def write_archive_index(archive_id, chunk_idx):
+            nonlocal compact_chunks_archive_saved_space
+            compact_chunks_archive_saved_space += chunk_idx.compact()
+            fn = mkpath(archive_id, suffix='.compact')
+            fn_tmp = mkpath(archive_id, suffix='.tmp')
+            try:
+                with DetachedIntegrityCheckedFile(path=fn_tmp, write=True,
+                                                  filename=bin_to_hex(archive_id) + '.compact') as fd:
+                    chunk_idx.write(fd)
+            except Exception:
+                os.unlink(fn_tmp)
+            else:
+                os.rename(fn_tmp, fn)
+
+        def read_archive_index(archive_id, archive_name):
+            archive_chunk_idx_path = mkpath(archive_id)
+            logger.info("Reading cached archive chunk index for %s ...", archive_name)
+            try:
                 try:
-                    with DetachedIntegrityCheckedFile(path=fn_tmp, write=True,
-                                                      filename=bin_to_hex(archive_id)) as fd:
-                        chunk_idx.write(fd)
-                except Exception:
-                    os.unlink(fn_tmp)
-                else:
-                    os.rename(fn_tmp, fn)
+                    # Attempt to load compact index first
+                    with DetachedIntegrityCheckedFile(path=archive_chunk_idx_path + '.compact', write=False) as fd:
+                        archive_chunk_idx = ChunkIndex.read(fd, permit_compact=True)
+                    # In case a non-compact index exists, delete it.
+                    cleanup_cached_archive(archive_id, cleanup_compact=False)
+                    # Compact index read - return index, no conversion necessary (below).
+                    return archive_chunk_idx
+                except FileNotFoundError:
+                    # No compact index found, load non-compact index, and convert below.
+                    with DetachedIntegrityCheckedFile(path=archive_chunk_idx_path, write=False) as fd:
+                        archive_chunk_idx = ChunkIndex.read(fd)
+            except FileIntegrityError as fie:
+                logger.error('Cached archive chunk index of %s is corrupted: %s', archive_name, fie)
+                # Delete corrupted index, set warning. A new index must be build.
+                cleanup_cached_archive(archive_id)
+                set_ec(EXIT_WARNING)
+                return None
+
+            # Convert to compact index. Delete the existing index first.
+            logger.debug('Found non-compact index for %s, converting to compact.', archive_name)
+            cleanup_cached_archive(archive_id)
+            write_archive_index(archive_id, archive_chunk_idx)
+            return archive_chunk_idx
 
         def get_archive_ids_to_names(archive_ids):
             # Pass once over all archives and build a mapping from ids to names.
@@ -612,7 +663,7 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
             # due to hash table "resonance".
             master_index_capacity = int(len(self.repository) / ChunkIndex.MAX_LOAD_FACTOR)
             if archive_ids:
-                chunk_idx = None
+                chunk_idx = None if not self.do_cache else ChunkIndex(master_index_capacity)
                 pi = ProgressIndicatorPercent(total=len(archive_ids), step=0.1,
                                               msg='%3.0f%% Syncing chunks cache. Processing archive %s',
                                               msgid='cache.sync')
@@ -621,17 +672,9 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
                     pi.show(info=[remove_surrogates(archive_name)])
                     if self.do_cache:
                         if archive_id in cached_ids:
-                            archive_chunk_idx_path = mkpath(archive_id)
-                            logger.info("Reading cached archive chunk index for %s ...", archive_name)
-                            try:
-                                with DetachedIntegrityCheckedFile(path=archive_chunk_idx_path, write=False) as fd:
-                                    archive_chunk_idx = ChunkIndex.read(fd)
-                            except FileIntegrityError as fie:
-                                logger.error('Cached archive chunk index of %s is corrupted: %s', archive_name, fie)
-                                # Delete it and fetch a new index
-                                cleanup_cached_archive(archive_id)
+                            archive_chunk_idx = read_archive_index(archive_id, archive_name)
+                            if archive_chunk_idx is None:
                                 cached_ids.remove(archive_id)
-                                set_ec(EXIT_WARNING)
                         if archive_id not in cached_ids:
                             # Do not make this an else branch; the FileIntegrityError exception handler
                             # above can remove *archive_id* from *cached_ids*.
@@ -639,18 +682,16 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}"""
                             archive_chunk_idx = ChunkIndex()
                             fetch_and_build_idx(archive_id, decrypted_repository, archive_chunk_idx)
                         logger.info("Merging into master chunks index ...")
-                        if chunk_idx is None:
-                            # we just use the first archive's idx as starting point,
-                            # to avoid growing the hash table from 0 size and also
-                            # to save 1 merge call.
-                            chunk_idx = archive_chunk_idx
-                        else:
-                            chunk_idx.merge(archive_chunk_idx)
+                        chunk_idx.merge(archive_chunk_idx)
                     else:
                         chunk_idx = chunk_idx or ChunkIndex(master_index_capacity)
                         logger.info('Fetching archive index for %s ...', archive_name)
                         fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx)
                 pi.finish()
+                logger.debug('Cache sync: processed %s bytes (%d chunks) of metadata',
+                             format_file_size(processed_item_metadata_bytes), processed_item_metadata_chunks)
+                logger.debug('Cache sync: compact chunks.archive.d storage saved %s bytes',
+                             format_file_size(compact_chunks_archive_saved_space))
             logger.info('Done.')
             return chunk_idx
 
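For illustration, a minimal standalone sketch (not part of the patch) of the chunks.archive.d naming scheme the change above relies on: a non-compact per-archive index is stored under the 64 hex digits of the archive id, while a compact index uses the same 64 hex digits plus a '.compact' suffix (72 characters total). The helper name cached_archive_ids is hypothetical; the filename rules mirror cached_archives() in the hunk above.

import os
from binascii import unhexlify

def cached_archive_ids(archive_path):
    # Hypothetical helper mirroring cached_archives() above: accept both
    # plain 64-hex-digit index files (non-compact) and 72-character
    # "<64 hex digits>.compact" files, returning the set of archive ids.
    fns = os.listdir(archive_path)
    plain = set(unhexlify(fn) for fn in fns if len(fn) == 64)
    compact = set(unhexlify(fn[:64])
                  for fn in fns if len(fn) == 72 and fn.endswith('.compact'))
    return plain | compact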