Browse Source

Merge pull request #559 from ThomasWaldmann/metastream-chunker

finer chunker granularity for items metadata stream, fixes #547, fixes #487
TW 9 years ago
parent
commit
f35ba0b577
2 changed files with 11 additions and 6 deletions
  1. 6 3
      borg/archive.py
  2. 5 3
      docs/internals.rst

+ 6 - 3
borg/archive.py

@@ -34,6 +34,9 @@ HASH_MASK_BITS = 16  # results in ~64kiB chunks statistically
 # defaults, use --chunker-params to override
 CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
 
 
+# chunker params for the items metadata stream, finer granularity
+ITEMS_CHUNKER_PARAMS = (12, 16, 14, HASH_WINDOW_SIZE)
+
 utime_supports_fd = os.utime in getattr(os, 'supports_fd', {})
 utime_supports_follow_symlinks = os.utime in getattr(os, 'supports_follow_symlinks', {})
 has_mtime_ns = sys.version >= '3.3'
@@ -75,7 +78,7 @@ class DownloadPipeline:
 class ChunkBuffer:
     BUFFER_SIZE = 1 * 1024 * 1024

-    def __init__(self, key, chunker_params=CHUNKER_PARAMS):
+    def __init__(self, key, chunker_params=ITEMS_CHUNKER_PARAMS):
         self.buffer = BytesIO()
         self.packer = msgpack.Packer(unicode_errors='surrogateescape')
         self.chunks = []
@@ -110,7 +113,7 @@ class ChunkBuffer:
 
 
 class CacheChunkBuffer(ChunkBuffer):

-    def __init__(self, cache, key, stats, chunker_params=CHUNKER_PARAMS):
+    def __init__(self, cache, key, stats, chunker_params=ITEMS_CHUNKER_PARAMS):
         super().__init__(key, chunker_params)
         self.cache = cache
         self.stats = stats
@@ -150,7 +153,7 @@ class Archive:
         self.end = end
         self.pipeline = DownloadPipeline(self.repository, self.key)
         if create:
-            self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats, chunker_params)
+            self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
             self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
             if name in manifest.archives:
                 raise self.AlreadyExists(name)

+ 5 - 3
docs/internals.rst

@@ -190,9 +190,11 @@ Each item represents a file, directory or other fs item and is stored as an
 it and it is reset every time an inode's metadata is changed.

 All items are serialized using msgpack and the resulting byte stream
-is fed into the same chunker used for regular file data and turned
-into deduplicated chunks. The reference to these chunks is then added
-to the archive metadata.
+is fed into the same chunker algorithm as used for regular file data
+and turned into deduplicated chunks. The reference to these chunks is then added
+to the archive metadata. To achieve a finer granularity on this metadata
+stream, we use different chunker params for this chunker, which result in
+smaller chunks.

 A chunk is stored as an object as well, of course.