浏览代码

Merge pull request #584 from hansmi/shell-pattern

Add shell-style pattern syntax
TW 9 年之前
父节点
当前提交
068c68b24c
共有 6 个文件被更改,包括 292 次插入32 次删除
  1. 39 26
      borg/archiver.py
  2. 26 5
      borg/helpers.py
  3. 62 0
      borg/shellpattern.py
  4. 48 1
      borg/testsuite/helpers.py
  5. 113 0
      borg/testsuite/shellpattern.py
  6. 4 0
      docs/usage.rst

+ 39 - 26
borg/archiver.py

@@ -628,38 +628,50 @@ class Archiver:
 
     helptext = {}
     helptext['patterns'] = textwrap.dedent('''
-        Exclusion patterns support three separate styles, fnmatch, regular
+        Exclusion patterns support four separate styles, fnmatch, shell, regular
         expressions and path prefixes. If followed by a colon (':') the first two
         characters of a pattern are used as a style selector. Explicit style
         selection is necessary when a non-default style is desired or when the
         desired pattern starts with two alphanumeric characters followed by a colon
         (i.e. `aa:something/*`).
 
-        `Fnmatch <https://docs.python.org/3/library/fnmatch.html>`_ patterns use
-        a variant of shell pattern syntax, with '*' matching any number of
-        characters, '?' matching any single character, '[...]' matching any single
-        character specified, including ranges, and '[!...]' matching any character
-        not specified. The style selector is `fm`. For the purpose of these patterns,
-        the path separator ('\\' for Windows and '/' on other systems) is not treated
-        specially. For a path to match a pattern, it must completely match from start
-        to end, or must match from the start to just before a path separator. Except
-        for the root path, paths will never end in the path separator when matching
-        is attempted. Thus, if a given pattern ends in a path separator, a '*' is
-        appended before matching is attempted.
-
-        Regular expressions similar to those found in Perl are supported with the
-        selection prefix `re:`. Unlike shell patterns regular expressions are not
-        required to match the complete path and any substring match is sufficient. It
-        is strongly recommended to anchor patterns to the start ('^'), to the end
-        ('$') or both. Path separators ('\\' for Windows and '/' on other systems) in
-        paths are always normalized to a forward slash ('/') before applying
-        a pattern. The regular expression syntax is described in the `Python
-        documentation for the re module
-        <https://docs.python.org/3/library/re.html>`_.
-
-        Prefix path patterns can be selected with the prefix `pp:`. This pattern
-        style is useful to match whole sub-directories. The pattern `pp:/data/bar`
-        matches `/data/bar` and everything therein.
+        `Fnmatch <https://docs.python.org/3/library/fnmatch.html>`_, selector `fm:`
+
+            These patterns use a variant of shell pattern syntax, with '*' matching
+            any number of characters, '?' matching any single character, '[...]'
+            matching any single character specified, including ranges, and '[!...]'
+            matching any character not specified. For the purpose of these patterns,
+            the path separator ('\\' for Windows and '/' on other systems) is not
+            treated specially. Wrap meta-characters in brackets for a literal match
+            (i.e. `[?]` to match the literal character `?`). For a path to match
+            a pattern, it must completely match from start to end, or must match from
+            the start to just before a path separator. Except for the root path,
+            paths will never end in the path separator when matching is attempted.
+            Thus, if a given pattern ends in a path separator, a '*' is appended
+            before matching is attempted.
+
+        Shell-style patterns, selector `sh:`
+
+            Like fnmatch patterns these are similar to shell patterns. The difference
+            is that the pattern may include `**/` for matching zero or more directory
+            levels, `*` for matching zero or more arbitrary characters with the
+            exception of any path separator.
+
+        Regular expressions, selector `re:`
+
+            Regular expressions similar to those found in Perl are supported. Unlike
+            shell patterns regular expressions are not required to match the complete
+            path and any substring match is sufficient. It is strongly recommended to
+            anchor patterns to the start ('^'), to the end ('$') or both. Path
+            separators ('\\' for Windows and '/' on other systems) in paths are
+            always normalized to a forward slash ('/') before applying a pattern. The
+            regular expression syntax is described in the `Python documentation for
+            the re module <https://docs.python.org/3/library/re.html>`_.
+
+        Prefix path, selector `pp:`
+
+            This pattern style is useful to match whole sub-directories. The pattern
+            `pp:/data/bar` matches `/data/bar` and everything therein.
 
         Exclusions can be passed via the command line option `--exclude`. When used
         from within a shell the patterns should be quoted to protect them from
@@ -698,6 +710,7 @@ class Archiver:
         *.tmp
         fm:aa:something/*
         re:^/home/[^/]\.tmp/
+        sh:/home/*/.thumbnails
         EOF
         $ borg create --exclude-from exclude.txt backup /
         ''')

+ 26 - 5
borg/helpers.py

@@ -30,6 +30,7 @@ from . import __version__ as borg_version
 from . import hashindex
 from . import chunker
 from . import crypto
+from . import shellpattern
 import msgpack
 import msgpack.fallback
 
@@ -332,11 +333,9 @@ class PatternBase:
         raise NotImplementedError
 
 
-# For both PathPrefixPattern and FnmatchPattern, we require that
-# the pattern either match the whole path or an initial segment
-# of the path up to but not including a path separator.  To
-# unify the two cases, we add a path separator to the end of
-# the path before matching.
+# For PathPrefixPattern, FnmatchPattern and ShellPattern, we require that the pattern either match the whole path
+# or an initial segment of the path up to but not including a path separator. To unify the two cases, we add a path
+# separator to the end of the path before matching.
 
 
 class PathPrefixPattern(PatternBase):
@@ -376,6 +375,27 @@ class FnmatchPattern(PatternBase):
         return (self.regex.match(path + os.path.sep) is not None)
 
 
+class ShellPattern(PatternBase):
+    """Exclude paths selected by a shell-style glob pattern (style selector "sh").
+
+    A pattern that ends in the path separator excludes what lies inside the named directory
+    while the directory itself is not excluded.
+    """
+    PREFIX = "sh"
+
+    def _prepare(self, pattern):
+        # A "<sep>**<sep>*" tail is added to the normalized pattern. Because _match() appends a separator to the
+        # path, the resulting expression accepts the named path as well as anything located below it.
+        sep = os.path.sep
+        tail = sep + "**" + sep + "*"
+        normalized = os.path.normpath(pattern)
+
+        if pattern.endswith(sep):
+            # The additional separator after the tail requires at least one more path component, so the directory
+            # named by the pattern does not match on its own.
+            normalized = normalized.rstrip(sep) + tail + sep
+        else:
+            normalized += tail
+
+        self.pattern = normalized
+        self.regex = re.compile(shellpattern.translate(self.pattern))
+
+    def _match(self, path):
+        # The separator is appended so that a whole path and a leading segment of a path are handled alike.
+        return self.regex.match(path + os.path.sep) is not None
+
+
 class RegexPattern(PatternBase):
     """Regular expression to exclude.
     """
@@ -397,6 +417,7 @@ _PATTERN_STYLES = set([
     FnmatchPattern,
     PathPrefixPattern,
     RegexPattern,
+    ShellPattern,
 ])
 
 _PATTERN_STYLE_BY_PREFIX = dict((i.PREFIX, i) for i in _PATTERN_STYLES)

+ 62 - 0
borg/shellpattern.py

@@ -0,0 +1,62 @@
+import re
+import os
+
+
+def translate(pat):
+    """Translate a shell-style pattern to a regular expression.
+
+    The pattern may include "**<sep>" (<sep> stands for the platform-specific path separator; "/" on POSIX systems) for
+    matching zero or more directory levels and "*" for matching zero or more arbitrary characters with the exception of
+    any path separator. "?" matches exactly one character other than the path separator, "[...]" matches one character
+    out of the given set and "[!...]" one character not in the set. Wrap meta-characters in brackets for a literal
+    match (i.e. "[?]" to match the literal character "?"). An opening bracket without a closing bracket is taken
+    literally.
+
+    The returned expression is a string starting with the inline flags "(?ms)" and ending in "\\Z", i.e. it has to
+    match up to the very end of the tested string. It is meant to be compiled with re.compile() and applied with
+    match().
+
+    This function is derived from the "fnmatch" module distributed with the Python standard library.
+
+    Copyright (C) 2001-2016 Python Software Foundation. All rights reserved.
+
+    TODO: support {alt1,alt2} shell-style alternatives
+
+    """
+    sep = os.path.sep
+    n = len(pat)
+    i = 0
+    res = ""
+
+    while i < n:
+        c = pat[i]
+        i += 1
+
+        if c == "*":
+            if i + 1 < n and pat[i] == "*" and pat[i + 1] == sep:
+                # **/ == wildcard for 0+ full (relative) directory names with trailing slashes; the forward slash stands
+                # for the platform-specific path separator
+                res += r"(?:[^\%s]*\%s)*" % (sep, sep)
+                # skip the second asterisk and the separator, both were consumed by the expression above
+                i += 2
+            else:
+                # * == wildcard for name parts (does not cross path separator)
+                res += r"[^\%s]*" % sep
+        elif c == "?":
+            # ? == any single character excluding path separator
+            res += r"[^\%s]" % sep
+        elif c == "[":
+            # Look for the closing bracket. A "!" right after the opening bracket and a "]" in first position belong
+            # to the set and therefore do not terminate it.
+            j = i
+            if j < n and pat[j] == "!":
+                j += 1
+            if j < n and pat[j] == "]":
+                j += 1
+            while j < n and pat[j] != "]":
+                j += 1
+            if j >= n:
+                # no closing bracket: the opening bracket is a literal character
+                res += "\\["
+            else:
+                stuff = pat[i:j].replace("\\", "\\\\")
+                i = j + 1
+                if stuff[0] == "!":
+                    # shell-style negation becomes regex-style negation
+                    stuff = "^" + stuff[1:]
+                elif stuff[0] == "^":
+                    # a leading caret is a plain set member in shell syntax, hence escape it for the regex
+                    stuff = "\\" + stuff
+                res += "[%s]" % stuff
+        else:
+            res += re.escape(c)
+
+    # Global inline flags have to lead the expression: putting them at the end is deprecated since Python 3.6 and
+    # rejected with re.error since Python 3.11.
+    return "(?ms)" + res + r"\Z"

+ 48 - 1
borg/testsuite/helpers.py

@@ -12,7 +12,8 @@ import msgpack.fallback
 from ..helpers import Location, format_file_size, format_timedelta, PathPrefixPattern, FnmatchPattern, make_path_safe, \
     prune_within, prune_split, get_cache_dir, Statistics, is_slow_msgpack, yes, RegexPattern, \
     StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams, \
-    ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes, parse_pattern, PatternMatcher
+    ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes, parse_pattern, PatternMatcher, \
+    ShellPattern
 from . import BaseTestCase, environment_variable, FakeInputs
 
 
@@ -234,6 +235,45 @@ def test_patterns_fnmatch(pattern, expected):
     check_patterns(files, FnmatchPattern(pattern), expected)
 
 
+# Each entry pairs a shell-style pattern with the members of the file list in test_patterns_shell() that are expected
+# to match it.
+@pytest.mark.parametrize("pattern, expected", [
+    # "None" means all files, i.e. all match the given pattern
+    ("*", None),
+    ("**/*", None),
+    ("/**/*", None),
+    ("/./*", None),
+    ("*/*", None),
+    ("*///*", None),
+    ("/home/u", []),
+    ("/home/*",
+     ["/home/user/.profile", "/home/user/.bashrc", "/home/user2/.profile", "/home/user2/public_html/index.html",
+      "/home/foo/.thumbnails", "/home/foo/bar/.thumbnails"]),
+    ("/home/user/*", ["/home/user/.profile", "/home/user/.bashrc"]),
+    ("/etc/*/*", ["/etc/server/config", "/etc/server/hosts"]),
+    ("/etc/**/*", ["/etc/server/config", "/etc/server/hosts"]),
+    ("/etc/**/*/*", ["/etc/server/config", "/etc/server/hosts"]),
+    ("*/.pr????e", []),
+    ("**/.pr????e", ["/home/user/.profile", "/home/user2/.profile"]),
+    ("///etc//////*", ["/etc/server/config", "/etc/server/hosts"]),
+    ("/./home//..//home/user2/", ["/home/user2/.profile", "/home/user2/public_html/index.html"]),
+    ("/./home//..//home/user2/**/*", ["/home/user2/.profile", "/home/user2/public_html/index.html"]),
+    ("/srv*/", ["/srv/messages", "/srv/dmesg", "/srv2/blafasel"]),
+    ("/srv*", ["/srv", "/srv/messages", "/srv/dmesg", "/srv2", "/srv2/blafasel"]),
+    ("/srv/*", ["/srv/messages", "/srv/dmesg"]),
+    ("/srv2/**", ["/srv2", "/srv2/blafasel"]),
+    ("/srv2/**/", ["/srv2/blafasel"]),
+    ("/home/*/.thumbnails", ["/home/foo/.thumbnails"]),
+    ("/home/*/*/.thumbnails", ["/home/foo/bar/.thumbnails"]),
+    ])
+def test_patterns_shell(pattern, expected):
+    """Check a ShellPattern built from `pattern` against a fixed list of paths.
+
+    The comparison with `expected` is delegated to check_patterns(), which is defined elsewhere in this module.
+    """
+    files = [
+        "/etc/server/config", "/etc/server/hosts", "/home", "/home/user/.profile", "/home/user/.bashrc",
+        "/home/user2/.profile", "/home/user2/public_html/index.html", "/srv", "/srv/messages", "/srv/dmesg",
+        "/srv2", "/srv2/blafasel", "/home/foo/.thumbnails", "/home/foo/bar/.thumbnails",
+    ]
+
+    check_patterns(files, ShellPattern(pattern), expected)
+
+
 @pytest.mark.parametrize("pattern, expected", [
     # "None" means all files, i.e. all match the given pattern
     ("", None),
@@ -276,6 +316,7 @@ def _make_test_patterns(pattern):
     return [PathPrefixPattern(pattern),
             FnmatchPattern(pattern),
             RegexPattern("^{}/foo$".format(pattern)),
+            ShellPattern(pattern),
             ]
 
 
@@ -374,6 +415,12 @@ def test_patterns_from_file(tmpdir, lines, expected):
     ("pp:/", PathPrefixPattern),
     ("pp:/data/", PathPrefixPattern),
     ("pp:pp:/data/", PathPrefixPattern),
+
+    # Shell-pattern style
+    ("sh:", ShellPattern),
+    ("sh:*", ShellPattern),
+    ("sh:/data/*", ShellPattern),
+    ("sh:sh:/data/*", ShellPattern),
     ])
 def test_parse_pattern(pattern, cls):
     assert isinstance(parse_pattern(pattern), cls)

+ 113 - 0
borg/testsuite/shellpattern.py

@@ -0,0 +1,113 @@
+import re
+
+import pytest
+
+from .. import shellpattern
+
+
+def check(path, pattern):
+    """Return True if the regular expression translated from `pattern` matches `path`, False otherwise."""
+    regex = shellpattern.translate(pattern)
+
+    return re.compile(regex).match(path) is not None
+
+
+# Each entry pairs a path with a list of patterns that all have to match it.
+@pytest.mark.parametrize("path, patterns", [
+    # Literal string
+    ("foo/bar", ["foo/bar"]),
+    ("foo\\bar", ["foo\\bar"]),
+
+    # Non-ASCII
+    ("foo/c/\u0152/e/bar", ["foo/*/\u0152/*/bar", "*/*/\u0152/*/*", "**/\u0152/*/*"]),
+    ("\u00e4\u00f6\u00dc", ["???", "*", "\u00e4\u00f6\u00dc", "[\u00e4][\u00f6][\u00dc]"]),
+
+    # Question mark
+    ("foo", ["fo?"]),
+    ("foo", ["f?o"]),
+    ("foo", ["f??"]),
+    ("foo", ["?oo"]),
+    ("foo", ["?o?"]),
+    ("foo", ["??o"]),
+    ("foo", ["???"]),
+
+    # Single asterisk
+    ("", ["*"]),
+    ("foo", ["*", "**", "***"]),
+    ("foo", ["foo*"]),
+    ("foobar", ["foo*"]),
+    ("foobar", ["foo*bar"]),
+    ("foobarbaz", ["foo*baz"]),
+    ("bar", ["*bar"]),
+    ("foobar", ["*bar"]),
+    ("foo/bar", ["foo/*bar"]),
+    ("foo/bar", ["foo/*ar"]),
+    ("foo/bar", ["foo/*r"]),
+    ("foo/bar", ["foo/*"]),
+    ("foo/bar", ["foo*/bar"]),
+    ("foo/bar", ["fo*/bar"]),
+    ("foo/bar", ["f*/bar"]),
+    ("foo/bar", ["*/bar"]),
+
+    # Double asterisk (matches 0..n directory layers)
+    ("foo/bar", ["foo/**/bar"]),
+    ("foo/1/bar", ["foo/**/bar"]),
+    ("foo/1/22/333/bar", ["foo/**/bar"]),
+    ("foo/", ["foo/**/"]),
+    ("foo/1/", ["foo/**/"]),
+    ("foo/1/22/333/", ["foo/**/"]),
+    ("bar", ["**/bar"]),
+    ("1/bar", ["**/bar"]),
+    ("1/22/333/bar", ["**/bar"]),
+    ("foo/bar/baz", ["foo/**/*"]),
+
+    # Set
+    ("foo1", ["foo[12]"]),
+    ("foo2", ["foo[12]"]),
+    ("foo2/bar", ["foo[12]/*"]),
+    ("f??f", ["f??f", "f[?][?]f"]),
+    ("foo]", ["foo[]]"]),
+
+    # Inverted set
+    ("foo3", ["foo[!12]"]),
+    ("foo^", ["foo[^!]"]),
+    ("foo!", ["foo[^!]"]),
+    ])
+def test_match(path, patterns):
+    """Assert that every pattern in `patterns` matches `path` once translated."""
+    # One assertion per pattern, so a failure points at the offending pattern.
+    for p in patterns:
+        assert check(path, p)
+
+
+# Each entry pairs a path with a list of patterns none of which may match it.
+@pytest.mark.parametrize("path, patterns", [
+    ("", ["?", "[]"]),
+    ("foo", ["foo?"]),
+    ("foo", ["?foo"]),
+    ("foo", ["f?oo"]),
+
+    # do not match path separator
+    ("foo/ar", ["foo?ar"]),
+
+    # do not match/cross over os.path.sep
+    ("foo/bar", ["*"]),
+    ("foo/bar", ["foo*bar"]),
+    ("foo/bar", ["foo*ar"]),
+    ("foo/bar", ["fo*bar"]),
+    ("foo/bar", ["fo*ar"]),
+
+    # Double asterisk
+    ("foobar", ["foo/**/bar"]),
+
+    # Two asterisks without slash do not match directory separator
+    ("foo/bar", ["**"]),
+
+    # Double asterisk not matching filename
+    ("foo/bar", ["**/"]),
+
+    # Set
+    ("foo3", ["foo[12]"]),
+
+    # Inverted set
+    ("foo1", ["foo[!12]"]),
+    ("foo2", ["foo[!12]"]),
+    ])
+def test_mismatch(path, patterns):
+    """Assert that no pattern in `patterns` matches `path` once translated."""
+    # One assertion per pattern, so a failure points at the offending pattern.
+    for p in patterns:
+        assert not check(path, p)

+ 4 - 0
docs/usage.rst

@@ -240,6 +240,10 @@ Examples
     $ borg create /mnt/backup::my-files /home \
         --exclude 're:^/home/[^/]+/\.thumbnails/'
 
+    # Do the same using a shell-style pattern
+    $ borg create /mnt/backup::my-files /home \
+        --exclude 'sh:/home/*/.thumbnails'
+
     # Backup the root filesystem into an archive named "root-YYYY-MM-DD"
     # use zlib compression (good, but slow) - default is no compression
     NAME="root-`date +%Y-%m-%d`"