1 year ago · df4668754d
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,8 @@
 
				  * #860: Fix interaction between environment variable interpolation in constants and shell escaping.
			
 
				  * #863: When color output is disabled (explicitly or implicitly), don't prefix each log line with
			
 
				    the log level.
			
 
				+ * #866: Fix "Argument list too long" error in the "spot" check when checking hundreds of thousands
			
 
				+   of files at once.
			
 
				  * #874: Add the configured repository label as "repository_label" to the interpolated variables
			
 
				    passed to before/after command hooks.
			
 
				  * In the "spot" check, don't try to hash symlinked directories.
			
--- a/borgmatic/actions/check.py
+++ b/borgmatic/actions/check.py
@@ -387,6 +387,9 @@ def collect_spot_check_archive_paths(
 
				     )
			
 
				 
			
 
				 
			
 
				+SAMPLE_PATHS_SUBSET_COUNT = 10000
			
 
				+
			
 
				+
			
 
				 def compare_spot_check_hashes(
			
 
				     repository,
			
 
				     archive,
			
@@ -419,32 +422,57 @@ def compare_spot_check_hashes(
 
				         f'{log_label}: Sampling {sample_count} source paths (~{spot_check_config["data_sample_percentage"]}%) for spot check'
			
 
				     )
			
 
				 
			
 
				-    # Hash each file in the sample paths (if it exists).
			
 
				-    hash_output = borgmatic.execute.execute_command_and_capture_output(
			
 
				-        (spot_check_config.get('xxh64sum_command', 'xxh64sum'),)
			
 
				-        + tuple(path for path in source_sample_paths if path in existing_source_sample_paths)
			
 
				-    )
			
 
				+    source_sample_paths_iterator = iter(source_sample_paths)
			
 
				+    source_hashes = {}
			
 
				+    archive_hashes = {}
			
 
				 
			
 
				-    source_hashes = dict(
			
 
				-        (reversed(line.split('  ', 1)) for line in hash_output.splitlines()),
			
 
				-        **{path: '' for path in source_sample_paths if path not in existing_source_sample_paths},
			
 
				-    )
			
 
				+    # Only hash a few thousand files at a time (a subset of the total paths) to avoid an "Argument
			
 
				+    # list too long" shell error.
			
 
				+    while True:
			
 
				+        # Hash each file in the sample paths (if it exists).
			
 
				+        source_sample_paths_subset = tuple(
			
 
				+            itertools.islice(source_sample_paths_iterator, SAMPLE_PATHS_SUBSET_COUNT)
			
 
				+        )
			
 
				+        if not source_sample_paths_subset:
			
 
				+            break
			
 
				 
			
 
				-    archive_hashes = dict(
			
 
				-        reversed(line.split(' ', 1))
			
 
				-        for line in borgmatic.borg.list.capture_archive_listing(
			
 
				-            repository['path'],
			
 
				-            archive,
			
 
				-            config,
			
 
				-            local_borg_version,
			
 
				-            global_arguments,
			
 
				-            list_paths=source_sample_paths,
			
 
				-            path_format='{xxh64} /{path}{NL}',  # noqa: FS003
			
 
				-            local_path=local_path,
			
 
				-            remote_path=remote_path,
			
 
				+        hash_output = borgmatic.execute.execute_command_and_capture_output(
			
 
				+            (spot_check_config.get('xxh64sum_command', 'xxh64sum'),)
			
 
				+            + tuple(
			
 
				+                path for path in source_sample_paths_subset if path in existing_source_sample_paths
			
 
				+            )
			
 
				+        )
			
 
				+
			
 
				+        source_hashes.update(
			
 
				+            **dict(
			
 
				+                (reversed(line.split('  ', 1)) for line in hash_output.splitlines()),
			
 
				+                # Represent non-existent files as having empty hashes so the comparison below still works.
			
 
				+                **{
			
 
				+                    path: ''
			
 
				+                    for path in source_sample_paths_subset
			
 
				+                    if path not in existing_source_sample_paths
			
 
				+                },
			
 
				+            )
			
 
				+        )
			
 
				+
			
 
				+        # Get the hash for each file in the archive.
			
 
				+        archive_hashes.update(
			
 
				+            **dict(
			
 
				+                reversed(line.split(' ', 1))
			
 
				+                for line in borgmatic.borg.list.capture_archive_listing(
			
 
				+                    repository['path'],
			
 
				+                    archive,
			
 
				+                    config,
			
 
				+                    local_borg_version,
			
 
				+                    global_arguments,
			
 
				+                    list_paths=source_sample_paths_subset,
			
 
				+                    path_format='{xxh64} /{path}{NL}',  # noqa: FS003
			
 
				+                    local_path=local_path,
			
 
				+                    remote_path=remote_path,
			
 
				+                )
			
 
				+                if line
			
 
				+            )
			
 
				         )
			
 
				-        if line
			
 
				-    )
			
 
				 
			
 
				     # Compare the source hashes with the archive hashes to see how many match.
			
 
				     failing_paths = []
			
--- a/tests/unit/actions/test_check.py
+++ b/tests/unit/actions/test_check.py
@@ -770,6 +770,46 @@ def test_compare_spot_check_hashes_considers_non_existent_path_as_not_matching()
 
				     ) == ('/bar',)
			
 
				 
			
 
				 
			
 
				+def test_compare_spot_check_hashes_with_too_many_paths_feeds_them_to_commands_in_chunks():
			
 
				+    flexmock(module).SAMPLE_PATHS_SUBSET_COUNT = 2
			
 
				+    flexmock(module.random).should_receive('sample').replace_with(
			
 
				+        lambda population, count: population[:count]
			
 
				+    )
			
 
				+    flexmock(module.os.path).should_receive('exists').and_return(True)
			
 
				+    flexmock(module.borgmatic.execute).should_receive(
			
 
				+        'execute_command_and_capture_output'
			
 
				+    ).with_args(('xxh64sum', '/foo', '/bar')).and_return('hash1  /foo\nhash2  /bar')
			
 
				+    flexmock(module.borgmatic.execute).should_receive(
			
 
				+        'execute_command_and_capture_output'
			
 
				+    ).with_args(('xxh64sum', '/baz', '/quux')).and_return('hash3  /baz\nhash4  /quux')
			
 
				+    flexmock(module.borgmatic.borg.list).should_receive('capture_archive_listing').and_return(
			
 
				+        ['hash1 /foo', 'hash2 /bar']
			
 
				+    ).and_return(['hash3 /baz', 'nothash4 /quux'])
			
 
				+
			
 
				+    assert module.compare_spot_check_hashes(
			
 
				+        repository={'path': 'repo'},
			
 
				+        archive='archive',
			
 
				+        config={
			
 
				+            'checks': [
			
 
				+                {
			
 
				+                    'name': 'archives',
			
 
				+                    'frequency': '2 weeks',
			
 
				+                },
			
 
				+                {
			
 
				+                    'name': 'spot',
			
 
				+                    'data_sample_percentage': 100,
			
 
				+                },
			
 
				+            ]
			
 
				+        },
			
 
				+        local_borg_version=flexmock(),
			
 
				+        global_arguments=flexmock(),
			
 
				+        local_path=flexmock(),
			
 
				+        remote_path=flexmock(),
			
 
				+        log_label='repo',
			
 
				+        source_paths=('/foo', '/bar', '/baz', '/quux'),
			
 
				+    ) == ('/quux',)
			
 
				+
			
 
				+
			
 
				 def test_spot_check_without_spot_configuration_errors():
			
 
				     with pytest.raises(ValueError):
			
 
				         module.spot_check(