Browse Source

Port the parent directory discovery logic from LVM to ZFS (#80).

Dan Helfman 6 months ago
parent
commit
9b77de3d66

+ 4 - 1
borgmatic/hooks/data_source/lvm.py

@@ -50,6 +50,9 @@ def get_logical_volumes(lsblk_command, source_directories=None):
     except json.JSONDecodeError as error:
         raise ValueError('Invalid {lsblk_command} JSON output: {error}')
 
+
+    candidate_source_directories = set(source_directories or ())
+
     try:
         return tuple(
             (device['name'], device['path'], device['mountpoint'], contained_source_directories)
@@ -57,7 +60,7 @@ def get_logical_volumes(lsblk_command, source_directories=None):
             if device['mountpoint'] and device['type'] == 'lvm'
             for contained_source_directories in (
                 borgmatic.hooks.data_source.snapshot.get_contained_directories(
-                    device['mountpoint'], source_directories
+                    device['mountpoint'], candidate_source_directories
                 ),
             )
             if not source_directories or contained_source_directories

+ 13 - 6
borgmatic/hooks/data_source/snapshot.py

@@ -1,3 +1,4 @@
+import itertools
 import pathlib
 
 
@@ -6,18 +7,24 @@ IS_A_HOOK = False
 
 def get_contained_directories(parent_directory, candidate_contained_directories):
     '''
-    Given a parent directory and a sequence of candiate directories potentially inside it, get the
-    subset of contained directories for which the parent directory is actually the parent, a
-    grandparent, the very same directory, etc. The idea is if, say, /var/log and /var/lib are
-    candidate contained directories, but there's a parent directory (logical volume, dataset,
-    subvolume, etc.) at /var, then /var is what we want to snapshot.
+    Given a parent directory and a set of candidate directories potentially inside it, get the subset
+    of contained directories for which the parent directory is actually the parent, a grandparent,
+    the very same directory, etc. The idea is if, say, /var/log and /var/lib are candidate contained
+    directories, but there's a parent directory (logical volume, dataset, subvolume, etc.) at /var,
+    then /var is what we want to snapshot.
+
+    Also mutate the given set of candidate contained directories to remove any actually contained
+    directories from it.
     '''
     if not candidate_contained_directories:
         return ()
 
-    return tuple(
+    contained = tuple(
         candidate
         for candidate in candidate_contained_directories
         if parent_directory == candidate
         or pathlib.PurePosixPath(parent_directory) in pathlib.PurePath(candidate).parents
     )
+    candidate_contained_directories -= set(contained)
+
+    return contained

+ 69 - 31
borgmatic/hooks/data_source/zfs.py

@@ -1,3 +1,4 @@
+import collections
 import glob
 import logging
 import os
@@ -5,6 +6,7 @@ import shutil
 import subprocess
 
 import borgmatic.config.paths
+import borgmatic.hooks.data_source.snapshot
 import borgmatic.execute
 
 logger = logging.getLogger(__name__)
@@ -21,6 +23,9 @@ BORGMATIC_SNAPSHOT_PREFIX = 'borgmatic-'
 BORGMATIC_USER_PROPERTY = 'org.torsion.borgmatic:backup'
 
 
+Dataset = collections.namedtuple('Dataset', ('name', 'mount_point', 'user_property_value', 'contained_source_directories'))
+
+
 def get_datasets_to_backup(zfs_command, source_directories):
     '''
     Given a ZFS command to run and a sequence of configured source directories, find the
@@ -29,7 +34,7 @@ def get_datasets_to_backup(zfs_command, source_directories):
     datasets tagged with a borgmatic-specific user property, whether or not they appear in source
     directories.
 
-    Return the result as a sequence of (dataset name, mount point) pairs.
+    Return the result as a sequence of Dataset instances, sorted by mount point.
     '''
     list_output = borgmatic.execute.execute_command_and_capture_output(
         (
@@ -42,23 +47,44 @@ def get_datasets_to_backup(zfs_command, source_directories):
             f'name,mountpoint,{BORGMATIC_USER_PROPERTY}',
         )
     )
-    source_directories_set = set(source_directories)
 
     try:
-        return tuple(
-            (dataset_name, mount_point)
-            for line in list_output.splitlines()
-            for (dataset_name, mount_point, user_property_value) in (line.rstrip().split('\t'),)
-            if mount_point in source_directories_set or user_property_value == 'auto'
+        # Sort from longest to shortest mount points, so longer mount points get a whack at the
+        # candidate source directory piñata before their parents do. (Source directories are
+        # consumed during the second loop below, so no two datasets get the same contained source
+        # directories.)
+        datasets = sorted(
+            (
+                Dataset(dataset_name, mount_point, user_property_value, ())
+                for line in list_output.splitlines()
+                for (dataset_name, mount_point, user_property_value) in (line.rstrip().split('\t'),)
+            ),
+            key=lambda dataset: dataset.mount_point,
+            reverse=True,
         )
     except ValueError:
         raise ValueError('Invalid {zfs_command} list output')
 
+    candidate_source_directories = set(source_directories)
+
+    return sorted(
+        tuple(
+            Dataset(dataset.name, dataset.mount_point, dataset.user_property_value, contained_source_directories)
+            for dataset in datasets
+            for contained_source_directories in (
+                borgmatic.hooks.data_source.snapshot.get_contained_directories(
+                    dataset.mount_point, candidate_source_directories
+                ),
+            )
+            if contained_source_directories or dataset.user_property_value == 'auto'
+        ),
+        key=lambda dataset: dataset.mount_point,
+    )
+
 
-def get_all_datasets(zfs_command):
+def get_all_dataset_mount_points(zfs_command):
     '''
-    Given a ZFS command to run, return all ZFS datasets as a sequence of (dataset name, mount point)
-    pairs.
+    Given a ZFS command to run, return all ZFS datasets as a sequence of sorted mount points.
     '''
     list_output = borgmatic.execute.execute_command_and_capture_output(
         (
@@ -68,15 +94,13 @@ def get_all_datasets(zfs_command):
             '-t',
             'filesystem',
             '-o',
-            'name,mountpoint',
+            'mountpoint',
         )
     )
 
     try:
         return tuple(
-            (dataset_name, mount_point)
-            for line in list_output.splitlines()
-            for (dataset_name, mount_point) in (line.rstrip().split('\t'),)
+            sorted(line.rstrip() for line in list_output.splitlines())
         )
     except ValueError:
         raise ValueError('Invalid {zfs_command} list output')
@@ -147,39 +171,51 @@ def dump_data_sources(
 
     # Snapshot each dataset, rewriting source directories to use the snapshot paths.
     snapshot_name = f'{BORGMATIC_SNAPSHOT_PREFIX}{os.getpid()}'
+    normalized_runtime_directory = os.path.normpath(borgmatic_runtime_directory)
 
     if not requested_datasets:
         logger.warning(f'{log_prefix}: No ZFS datasets found to snapshot{dry_run_label}')
 
-    for dataset_name, mount_point in requested_datasets:
-        full_snapshot_name = f'{dataset_name}@{snapshot_name}'
-        logger.debug(f'{log_prefix}: Creating ZFS snapshot {full_snapshot_name}{dry_run_label}')
+    for dataset in requested_datasets:
+        full_snapshot_name = f'{dataset.name}@{snapshot_name}'
+        logger.debug(f'{log_prefix}: Creating ZFS snapshot {full_snapshot_name} of {dataset.mount_point}{dry_run_label}')
 
         if not dry_run:
             snapshot_dataset(zfs_command, full_snapshot_name)
 
         # Mount the snapshot into a particular named temporary directory so that the snapshot ends
         # up in the Borg archive at the "original" dataset mount point path.
-        snapshot_mount_path_for_borg = os.path.join(
-            os.path.normpath(borgmatic_runtime_directory),
+        snapshot_mount_path = os.path.join(
+            normalized_runtime_directory,
             'zfs_snapshots',
-            '.',  # Borg 1.4+ "slashdot" hack.
-            mount_point.lstrip(os.path.sep),
+            dataset.mount_point.lstrip(os.path.sep),
         )
-        snapshot_mount_path = os.path.normpath(snapshot_mount_path_for_borg)
+
         logger.debug(
             f'{log_prefix}: Mounting ZFS snapshot {full_snapshot_name} at {snapshot_mount_path}{dry_run_label}'
         )
 
-        if not dry_run:
-            mount_snapshot(
-                hook_config.get('mount_command', 'mount'), full_snapshot_name, snapshot_mount_path
-            )
+        if dry_run:
+            continue
 
-            if mount_point in source_directories:
-                source_directories.remove(mount_point)
+        mount_snapshot(
+            hook_config.get('mount_command', 'mount'), full_snapshot_name, snapshot_mount_path
+        )
 
-            source_directories.append(snapshot_mount_path_for_borg)
+        for source_directory in dataset.contained_source_directories:
+            try:
+                source_directories.remove(source_directory)
+            except ValueError:
+                pass
+
+            source_directories.append(
+                os.path.join(
+                    normalized_runtime_directory,
+                    'zfs_snapshots',
+                    '.',  # Borg 1.4+ "slashdot" hack.
+                    source_directory.lstrip(os.path.sep),
+                )
+            )
 
     return []
 
@@ -245,7 +281,7 @@ def remove_data_source_dumps(hook_config, config, log_prefix, borgmatic_runtime_
     zfs_command = hook_config.get('zfs_command', 'zfs')
 
     try:
-        datasets = get_all_datasets(zfs_command)
+        dataset_mount_points = get_all_dataset_mount_points(zfs_command)
     except FileNotFoundError:
         logger.debug(f'{log_prefix}: Could not find "{zfs_command}" command')
         return
@@ -275,7 +311,9 @@ def remove_data_source_dumps(hook_config, config, log_prefix, borgmatic_runtime_
         if not dry_run:
             shutil.rmtree(snapshots_directory, ignore_errors=True)
 
-        for _, mount_point in datasets:
+        # Reversing the sorted datasets ensures that we unmount the longer mount point paths of
+        # child datasets before the shorter mount point paths of parent datasets.
+        for mount_point in reversed(dataset_mount_points):
             snapshot_mount_path = os.path.join(snapshots_directory, mount_point.lstrip(os.path.sep))
             if not os.path.isdir(snapshot_mount_path):
                 continue

+ 8 - 0
docs/how-to/snapshot-your-filesystems.md

@@ -76,6 +76,14 @@ in an archive at `/var/dataset` as well—even if borgmatic has to mount the
 snapshot somewhere in `/run/user/1000/borgmatic/zfs_snapshots/` to perform the
 backup.
 
+<span class="minilink minilink-addedin">New in version 1.9.4</span> borgmatic
+is smart enough to look at the parent (and grandparent, etc.) directories of
+each of your `source_directories` to discover any datasets. For instance,
+let's say you add `/var/log` and `/var/lib` to your source directories, but
+`/var` is a dataset. borgmatic will discover that and snapshot `/var`
+accordingly. This even works with nested datasets; borgmatic selects
+the dataset that's the "closest" parent to your source directories.
+
 <span class="minilink minilink-addedin">With Borg version 1.2 and
 earlier</span>Snapshotted files are instead stored at a path dependent on the
 [runtime