Browse Source

Merge pull request #575 from hansmi/extract-pattern-support

Extract pattern support
TW 9 years ago
parent
commit
1e1812c261
5 changed files with 229 additions and 91 deletions
  1. 1 0
      AUTHORS
  2. 39 18
      borg/archiver.py
  3. 38 24
      borg/helpers.py
  4. 33 0
      borg/testsuite/archiver.py
  5. 118 49
      borg/testsuite/helpers.py

+ 1 - 0
AUTHORS

@@ -5,6 +5,7 @@ Borg Contributors ("The Borg Collective")
 - Antoine Beaupré <anarcat@debian.org>
 - Radek Podgorny <radek@podgorny.cz>
 - Yuri D'Elia
+- Michael Hanselmann <public@hansmi.ch>
 
 Borg is a fork of Attic.
 

+ 39 - 18
borg/archiver.py

@@ -17,11 +17,11 @@ import traceback
 
 from . import __version__
 from .helpers import Error, location_validator, format_time, format_file_size, \
-    format_file_mode, parse_pattern, PathPrefixPattern, exclude_path, adjust_patterns, to_localtime, timestamp, \
+    format_file_mode, parse_pattern, PathPrefixPattern, to_localtime, timestamp, \
     get_cache_dir, get_keys_dir, prune_within, prune_split, unhexlify, \
     Manifest, remove_surrogates, update_excludes, format_archive, check_extension_modules, Statistics, \
     dir_is_tagged, bigint_to_int, ChunkerParams, CompressionSpec, is_slow_msgpack, yes, sysinfo, \
-    EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR, log_multi
+    EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR, log_multi, PatternMatcher
 from .logger import create_logger, setup_logging
 logger = create_logger()
 from .compress import Compressor, COMPR_BUFFER
@@ -129,6 +129,10 @@ class Archiver:
 
     def do_create(self, args):
         """Create new archive"""
+        matcher = PatternMatcher(fallback=True)
+        if args.excludes:
+            matcher.add(args.excludes, False)
+
         def create_inner(archive, cache):
             # Add cache dir to inode_skip list
             skip_inodes = set()
@@ -166,7 +170,7 @@ class Archiver:
                         continue
                 else:
                     restrict_dev = None
-                self._process(archive, cache, args.excludes, args.exclude_caches, args.exclude_if_present,
+                self._process(archive, cache, matcher, args.exclude_caches, args.exclude_if_present,
                               args.keep_tag_files, skip_inodes, path, restrict_dev,
                               read_special=args.read_special, dry_run=dry_run)
             if not dry_run:
@@ -202,11 +206,12 @@ class Archiver:
             create_inner(None, None)
         return self.exit_code
 
-    def _process(self, archive, cache, excludes, exclude_caches, exclude_if_present,
+    def _process(self, archive, cache, matcher, exclude_caches, exclude_if_present,
                  keep_tag_files, skip_inodes, path, restrict_dev,
                  read_special=False, dry_run=False):
-        if exclude_path(path, excludes):
+        if not matcher.match(path):
             return
+
         try:
             st = os.lstat(path)
         except OSError as e:
@@ -235,7 +240,7 @@ class Archiver:
                 if keep_tag_files and not dry_run:
                     archive.process_dir(path, st)
                     for tag_path in tag_paths:
-                        self._process(archive, cache, excludes, exclude_caches, exclude_if_present,
+                        self._process(archive, cache, matcher, exclude_caches, exclude_if_present,
                                       keep_tag_files, skip_inodes, tag_path, restrict_dev,
                                       read_special=read_special, dry_run=dry_run)
                 return
@@ -249,7 +254,7 @@ class Archiver:
             else:
                 for filename in sorted(entries):
                     entry_path = os.path.normpath(os.path.join(path, filename))
-                    self._process(archive, cache, excludes, exclude_caches, exclude_if_present,
+                    self._process(archive, cache, matcher, exclude_caches, exclude_if_present,
                                   keep_tag_files, skip_inodes, entry_path, restrict_dev,
                                   read_special=read_special, dry_run=dry_run)
         elif stat.S_ISLNK(st.st_mode):
@@ -286,13 +291,25 @@ class Archiver:
         manifest, key = Manifest.load(repository)
         archive = Archive(repository, key, manifest, args.location.archive,
                           numeric_owner=args.numeric_owner)
-        patterns = adjust_patterns(args.paths, args.excludes)
+
+        matcher = PatternMatcher()
+        if args.excludes:
+            matcher.add(args.excludes, False)
+
+        include_patterns = []
+
+        if args.paths:
+            include_patterns.extend(parse_pattern(i, PathPrefixPattern) for i in args.paths)
+            matcher.add(include_patterns, True)
+
+        matcher.fallback = not include_patterns
+
         dry_run = args.dry_run
         stdout = args.stdout
         sparse = args.sparse
         strip_components = args.strip_components
         dirs = []
-        for item in archive.iter_items(lambda item: not exclude_path(item[b'path'], patterns), preload=True):
+        for item in archive.iter_items(lambda item: matcher.match(item[b'path']), preload=True):
             orig_path = item[b'path']
             if strip_components:
                 item[b'path'] = os.sep.join(orig_path.split(os.sep)[strip_components:])
@@ -317,8 +334,8 @@ class Archiver:
         if not args.dry_run:
             while dirs:
                 archive.extract_item(dirs.pop(-1))
-        for pattern in (patterns or []):
-            if isinstance(pattern, PathPrefixPattern) and pattern.match_count == 0:
+        for pattern in include_patterns:
+            if pattern.match_count == 0:
                 self.print_warning("Include pattern '%s' never matched.", pattern)
         return self.exit_code
 
@@ -611,12 +628,12 @@ class Archiver:
 
     helptext = {}
     helptext['patterns'] = textwrap.dedent('''
-        Exclusion patterns support two separate styles, fnmatch and regular
-        expressions. If followed by a colon (':') the first two characters of
-        a pattern are used as a style selector. Explicit style selection is necessary
-        when regular expressions are desired or when the desired fnmatch pattern
-        starts with two alphanumeric characters followed by a colon (i.e.
-        `aa:something/*`).
+        Exclusion patterns support three separate styles, fnmatch, regular
+        expressions and path prefixes. If followed by a colon (':') the first two
+        characters of a pattern are used as a style selector. Explicit style
+        selection is necessary when a non-default style is desired or when the
+        desired pattern starts with two alphanumeric characters followed by a colon
+        (i.e. `aa:something/*`).
 
         `Fnmatch <https://docs.python.org/3/library/fnmatch.html>`_ patterns use
         a variant of shell pattern syntax, with '*' matching any number of
@@ -640,6 +657,10 @@ class Archiver:
         documentation for the re module
         <https://docs.python.org/3/library/re.html>`_.
 
+        Prefix path patterns can be selected with the prefix `pp:`. This pattern
+        style is useful to match whole sub-directories. The pattern `pp:/data/bar`
+        matches `/data/bar` and everything therein.
+
         Exclusions can be passed via the command line option `--exclude`. When used
         from within a shell the patterns should be quoted to protect them from
         expansion.
@@ -961,7 +982,7 @@ class Archiver:
                                type=location_validator(archive=True),
                                help='archive to extract')
         subparser.add_argument('paths', metavar='PATH', nargs='*', type=str,
-                               help='paths to extract')
+                               help='paths to extract; patterns are supported')
 
         rename_epilog = textwrap.dedent("""
         This command renames an archive in the repository.

+ 38 - 24
borg/helpers.py

@@ -257,21 +257,25 @@ def update_excludes(args):
             file.close()
 
 
-def adjust_patterns(paths, excludes):
-    if paths:
-        return (excludes or []) + [PathPrefixPattern(path) for path in paths] + [FnmatchPattern('*')]
-    else:
-        return excludes
+class PatternMatcher:
+    def __init__(self, fallback=None):
+        self._items = []
 
+        # Value to return from match function when none of the patterns match.
+        self.fallback = fallback
 
-def exclude_path(path, patterns):
-    """Used by create and extract sub-commands to determine
-    whether or not an item should be processed.
-    """
-    for pattern in (patterns or []):
-        if pattern.match(path):
-            return isinstance(pattern, (FnmatchPattern, RegexPattern))
-    return False
+    def add(self, patterns, value):
+        """Add list of patterns to internal list. The given value is returned from the match function when one of the
+        given patterns matches.
+        """
+        self._items.extend((i, value) for i in patterns)
+
+    def match(self, path):
+        for (pattern, value) in self._items:
+            if pattern.match(path):
+                return value
+
+        return self.fallback
 
 
 def normalized(func):
@@ -295,6 +299,8 @@ def normalized(func):
 class PatternBase:
     """Shared logic for inclusion/exclusion patterns.
     """
+    PREFIX = NotImplemented
+
     def __init__(self, pattern):
         self.pattern_orig = pattern
         self.match_count = 0
@@ -339,6 +345,8 @@ class PathPrefixPattern(PatternBase):
     If a directory is specified, all paths that start with that
     path match as well.  A trailing slash makes no difference.
     """
+    PREFIX = "pp"
+
     def _prepare(self, pattern):
         self.pattern = os.path.normpath(pattern).rstrip(os.path.sep) + os.path.sep
 
@@ -350,6 +358,8 @@ class FnmatchPattern(PatternBase):
     """Shell glob patterns to exclude.  A trailing slash means to
     exclude the contents of a directory, but not the directory itself.
     """
+    PREFIX = "fm"
+
     def _prepare(self, pattern):
         if pattern.endswith(os.path.sep):
             pattern = os.path.normpath(pattern).rstrip(os.path.sep) + os.path.sep + '*' + os.path.sep
@@ -369,6 +379,8 @@ class FnmatchPattern(PatternBase):
 class RegexPattern(PatternBase):
     """Regular expression to exclude.
     """
+    PREFIX = "re"
+
     def _prepare(self, pattern):
         self.pattern = pattern
         self.regex = re.compile(pattern)
@@ -381,25 +393,27 @@ class RegexPattern(PatternBase):
         return (self.regex.search(path) is not None)
 
 
-_DEFAULT_PATTERN_STYLE = "fm"
-_PATTERN_STYLES = {
-        "fm": FnmatchPattern,
-        "re": RegexPattern,
-        }
+_PATTERN_STYLES = {  # pattern classes selectable through a style prefix
+    FnmatchPattern,
+    PathPrefixPattern,
+    RegexPattern,
+}
 
+_PATTERN_STYLE_BY_PREFIX = {i.PREFIX: i for i in _PATTERN_STYLES}  # e.g. "re" -> RegexPattern
 
-def parse_pattern(pattern):
+
+def parse_pattern(pattern, fallback=FnmatchPattern):
     """Read pattern from string and return an instance of the appropriate implementation class.
     """
     if len(pattern) > 2 and pattern[2] == ":" and pattern[:2].isalnum():
         (style, pattern) = (pattern[:2], pattern[3:])
-    else:
-        style = _DEFAULT_PATTERN_STYLE
 
-    cls = _PATTERN_STYLES.get(style, None)
+        cls = _PATTERN_STYLE_BY_PREFIX.get(style, None)
 
-    if cls is None:
-        raise ValueError("Unknown pattern style: {}".format(style))
+        if cls is None:
+            raise ValueError("Unknown pattern style: {}".format(style))
+    else:
+        cls = fallback
 
     return cls(pattern)
 

+ 33 - 0
borg/testsuite/archiver.py

@@ -562,6 +562,39 @@ class ArchiverTestCase(ArchiverTestCaseBase):
             self.cmd('extract', '--exclude-from=' + self.exclude_file_path, self.repository_location + '::test')
         self.assert_equal(sorted(os.listdir('output/input')), ['file3'])
 
+    def test_extract_with_pattern(self):
+        self.cmd("init", self.repository_location)
+        self.create_regular_file("file1", size=1024 * 80)
+        self.create_regular_file("file2", size=1024 * 80)
+        self.create_regular_file("file3", size=1024 * 80)
+        self.create_regular_file("file4", size=1024 * 80)
+        self.create_regular_file("file333", size=1024 * 80)
+
+        self.cmd("create", self.repository_location + "::test", "input")
+
+        # Extract everything with regular expression
+        with changedir("output"):
+            self.cmd("extract", self.repository_location + "::test", "re:.*")
+        self.assert_equal(sorted(os.listdir("output/input")), ["file1", "file2", "file3", "file333", "file4"])
+        shutil.rmtree("output/input")
+
+        # Extract with pattern while also excluding files
+        with changedir("output"):
+            self.cmd("extract", "--exclude=re:file[34]$", self.repository_location + "::test", r"re:file\d$")
+        self.assert_equal(sorted(os.listdir("output/input")), ["file1", "file2"])
+        shutil.rmtree("output/input")
+
+        # Combine --exclude with pattern for extraction
+        with changedir("output"):
+            self.cmd("extract", "--exclude=input/file1", self.repository_location + "::test", "re:file[12]$")
+        self.assert_equal(sorted(os.listdir("output/input")), ["file2"])
+        shutil.rmtree("output/input")
+
+        # Multiple pattern
+        with changedir("output"):
+            self.cmd("extract", self.repository_location + "::test", "fm:input/file1", "fm:*file33*", "input/file2")
+        self.assert_equal(sorted(os.listdir("output/input")), ["file1", "file2", "file333"])
+
     def test_exclude_caches(self):
         self.cmd('init', self.repository_location)
         self.create_regular_file('file1', size=1024 * 80)

+ 118 - 49
borg/testsuite/helpers.py

@@ -9,10 +9,10 @@ import sys
 import msgpack
 import msgpack.fallback
 
-from ..helpers import adjust_patterns, exclude_path, Location, format_file_size, format_timedelta, PathPrefixPattern, FnmatchPattern, make_path_safe, \
+from ..helpers import Location, format_file_size, format_timedelta, PathPrefixPattern, FnmatchPattern, make_path_safe, \
     prune_within, prune_split, get_cache_dir, Statistics, is_slow_msgpack, yes, RegexPattern, \
     StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams, \
-    ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes, parse_pattern
+    ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes, parse_pattern, PatternMatcher
 from . import BaseTestCase, environment_variable, FakeInputs
 
 
@@ -160,70 +160,105 @@ class FormatTimedeltaTestCase(BaseTestCase):
         )
 
 
-def check_patterns(files, paths, excludes, expected):
-    """Utility for testing exclusion patterns.
+def check_patterns(files, pattern, expected):
+    """Assert that `pattern` matches exactly `expected` among `files`; `expected=None` means all of them.
     """
-    patterns = adjust_patterns(paths, excludes)
-    included = [path for path in files if not exclude_path(path, patterns)]
+    assert all(f == os.path.normpath(f) for f in files), \
+            "Pattern matchers expect normalized input paths"
 
-    assert included == (files if expected is None else expected)
+    matched = [f for f in files if pattern.match(f)]
 
+    assert matched == (files if expected is None else expected)
 
-@pytest.mark.parametrize("paths, excludes, expected", [
-    # "None" means all files, i.e. none excluded
-    ([], [], None),
-    (['/'], [], None),
-    (['/'], ['/h'], None),
-    (['/'], ['/home'], ['/etc/passwd', '/etc/hosts', '/var/log/messages', '/var/log/dmesg']),
-    (['/'], ['/home/'], ['/etc/passwd', '/etc/hosts', '/home', '/var/log/messages', '/var/log/dmesg']),
-    (['/home/u'], [], []),
-    (['/', '/home', '/etc/hosts'], ['/'], []),
-    (['/home/'], ['/home/user2'], ['/home', '/home/user/.profile', '/home/user/.bashrc']),
-    (['/'], ['*.profile', '/var/log'],
-     ['/etc/passwd', '/etc/hosts', '/home', '/home/user/.bashrc', '/home/user2/public_html/index.html']),
-    (['/'], ['/home/*/public_html', '*.profile', '*/log/*'],
-     ['/etc/passwd', '/etc/hosts', '/home', '/home/user/.bashrc']),
-    (['/etc/', '/var'], ['dmesg'], ['/etc/passwd', '/etc/hosts', '/var/log/messages', '/var/log/dmesg']),
+
+@pytest.mark.parametrize("pattern, expected", [
+    # "None" means all files, i.e. all match the given pattern
+    ("/", None),
+    ("/./", None),
+    ("", []),
+    ("/home/u", []),
+    ("/home/user", ["/home/user/.profile", "/home/user/.bashrc"]),
+    ("/etc", ["/etc/server/config", "/etc/server/hosts"]),
+    ("///etc//////", ["/etc/server/config", "/etc/server/hosts"]),
+    ("/./home//..//home/user2", ["/home/user2/.profile", "/home/user2/public_html/index.html"]),
+    ("/srv", ["/srv/messages", "/srv/dmesg"]),
     ])
-def test_patterns(paths, excludes, expected):
+def test_patterns_prefix(pattern, expected):
     files = [
-        '/etc/passwd', '/etc/hosts', '/home',
-        '/home/user/.profile', '/home/user/.bashrc',
-        '/home/user2/.profile', '/home/user2/public_html/index.html',
-        '/var/log/messages', '/var/log/dmesg',
+        "/etc/server/config", "/etc/server/hosts", "/home", "/home/user/.profile", "/home/user/.bashrc",
+        "/home/user2/.profile", "/home/user2/public_html/index.html", "/srv/messages", "/srv/dmesg",
     ]
 
-    check_patterns(files, paths, [FnmatchPattern(p) for p in excludes], expected)
+    check_patterns(files, PathPrefixPattern(pattern), expected)
 
 
-@pytest.mark.parametrize("paths, excludes, expected", [
-    # "None" means all files, i.e. none excluded
-    ([], [], None),
-    (['/'], [], None),
-    (['/'], ['.*'], []),
-    (['/'], ['^/'], []),
-    (['/'], ['^abc$'], None),
-    (['/'], ['^(?!/home/)'],
-     ['/home/user/.profile', '/home/user/.bashrc', '/home/user2/.profile',
-      '/home/user2/public_html/index.html']),
+@pytest.mark.parametrize("pattern, expected", [
+    # "None" means all files, i.e. all match the given pattern
+    ("", []),
+    ("foo", []),
+    ("relative", ["relative/path1", "relative/two"]),
+    ("more", ["more/relative"]),
+    ])
+def test_patterns_prefix_relative(pattern, expected):
+    files = ["relative/path1", "relative/two", "more/relative"]
+
+    check_patterns(files, PathPrefixPattern(pattern), expected)
+
+
+@pytest.mark.parametrize("pattern, expected", [
+    # "None" means all files, i.e. all match the given pattern
+    ("/*", None),
+    ("/./*", None),
+    ("*", None),
+    ("*/*", None),
+    ("*///*", None),
+    ("/home/u", []),
+    ("/home/*",
+     ["/home/user/.profile", "/home/user/.bashrc", "/home/user2/.profile", "/home/user2/public_html/index.html",
+      "/home/foo/.thumbnails", "/home/foo/bar/.thumbnails"]),
+    ("/home/user/*", ["/home/user/.profile", "/home/user/.bashrc"]),
+    ("/etc/*", ["/etc/server/config", "/etc/server/hosts"]),
+    ("*/.pr????e", ["/home/user/.profile", "/home/user2/.profile"]),
+    ("///etc//////*", ["/etc/server/config", "/etc/server/hosts"]),
+    ("/./home//..//home/user2/*", ["/home/user2/.profile", "/home/user2/public_html/index.html"]),
+    ("/srv*", ["/srv/messages", "/srv/dmesg"]),
+    ("/home/*/.thumbnails", ["/home/foo/.thumbnails", "/home/foo/bar/.thumbnails"]),
     ])
-def test_patterns_regex(paths, excludes, expected):
+def test_patterns_fnmatch(pattern, expected):
+    files = [
+        "/etc/server/config", "/etc/server/hosts", "/home", "/home/user/.profile", "/home/user/.bashrc",
+        "/home/user2/.profile", "/home/user2/public_html/index.html", "/srv/messages", "/srv/dmesg",
+        "/home/foo/.thumbnails", "/home/foo/bar/.thumbnails",
+    ]
+
+    check_patterns(files, FnmatchPattern(pattern), expected)
+
+
+@pytest.mark.parametrize("pattern, expected", [
+    # "None" means all files, i.e. all match the given pattern
+    ("", None),
+    (".*", None),
+    ("^/", None),
+    ("^abc$", []),
+    ("^[^/]", []),
+    ("^(?!/srv|/foo|/opt)",
+     ["/home", "/home/user/.profile", "/home/user/.bashrc", "/home/user2/.profile",
+      "/home/user2/public_html/index.html", "/home/foo/.thumbnails", "/home/foo/bar/.thumbnails",]),
+    ])
+def test_patterns_regex(pattern, expected):
     files = [
         '/srv/data', '/foo/bar', '/home',
         '/home/user/.profile', '/home/user/.bashrc',
         '/home/user2/.profile', '/home/user2/public_html/index.html',
         '/opt/log/messages.txt', '/opt/log/dmesg.txt',
+        "/home/foo/.thumbnails", "/home/foo/bar/.thumbnails",
     ]
 
-    patterns = []
-
-    for i in excludes:
-        pat = RegexPattern(i)
-        assert str(pat) == i
-        assert pat.pattern == i
-        patterns.append(pat)
+    obj = RegexPattern(pattern)
+    assert str(obj) == pattern
+    assert obj.pattern == pattern
 
-    check_patterns(files, paths, patterns, expected)
+    check_patterns(files, obj, expected)
 
 
 def test_regex_pattern():
@@ -289,6 +324,10 @@ def test_invalid_unicode_pattern(pattern):
      ["/more/data"]),
     ([r"  re:^\s  "], ["/data/something00.txt", "/more/data", "/home", "/whitespace/end\t"]),
     ([r"  re:\s$  "], ["/data/something00.txt", "/more/data", "/home", " #/wsfoobar", "\tstart/whitespace"]),
+    (["pp:./"], None),
+    (["pp:/"], [" #/wsfoobar", "\tstart/whitespace"]),
+    (["pp:aaabbb"], None),
+    (["pp:/data", "pp: #/", "pp:\tstart", "pp:/whitespace"], ["/more/data", "/home"]),
     ])
 def test_patterns_from_file(tmpdir, lines, expected):
     files = [
@@ -299,8 +338,9 @@ def test_patterns_from_file(tmpdir, lines, expected):
     ]
 
     def evaluate(filename):
-        patterns = load_excludes(open(filename, "rt"))
-        return [path for path in files if not exclude_path(path, patterns)]
+        matcher = PatternMatcher(fallback=True)
+        matcher.add(load_excludes(open(filename, "rt")), False)
+        return [path for path in files if matcher.match(path)]
 
     exclfile = tmpdir.join("exclude.txt")
 
@@ -328,6 +368,12 @@ def test_patterns_from_file(tmpdir, lines, expected):
     ("re:.*", RegexPattern),
     ("re:^/something/", RegexPattern),
     ("re:re:^/something/", RegexPattern),
+
+    # Path prefix
+    ("pp:", PathPrefixPattern),
+    ("pp:/", PathPrefixPattern),
+    ("pp:/data/", PathPrefixPattern),
+    ("pp:pp:/data/", PathPrefixPattern),
     ])
 def test_parse_pattern(pattern, cls):
     assert isinstance(parse_pattern(pattern), cls)
@@ -339,6 +385,29 @@ def test_parse_pattern_error(pattern):
         parse_pattern(pattern)
 
 
+def test_pattern_matcher():
+    pm = PatternMatcher()
+
+    assert pm.fallback is None
+
+    for i in ["", "foo", "bar"]:
+        assert pm.match(i) is None
+
+    pm.add([RegexPattern("^a")], "A")
+    pm.add([RegexPattern("^b"), RegexPattern("^z")], "B")
+    pm.add([RegexPattern("^$")], "Empty")
+    pm.fallback = "FileNotFound"
+
+    assert pm.match("") == "Empty"
+    assert pm.match("aaa") == "A"
+    assert pm.match("bbb") == "B"
+    assert pm.match("ccc") == "FileNotFound"
+    assert pm.match("xyz") == "FileNotFound"
+    assert pm.match("z") == "B"
+
+    assert PatternMatcher(fallback="hey!").fallback == "hey!"
+
+
 def test_compression_specs():
     with pytest.raises(ValueError):
         CompressionSpec('')