Selaa lähdekoodia

Merge pull request #193 from edgewood/osxPathNormalization

Normalize paths before pattern matching on OS X
TW 9 vuotta sitten
vanhempi
sitoutus
638204fd0e
2 muutettua tiedostoa jossa 97 lisäystä ja 1 poistoa
  1. 29 0
      borg/helpers.py
  2. 68 1
      borg/testsuite/helpers.py

+ 29 - 0
borg/helpers.py

@@ -1,12 +1,15 @@
 import argparse
 import argparse
 import binascii
 import binascii
 from collections import namedtuple
 from collections import namedtuple
+from functools import wraps
 import grp
 import grp
 import os
 import os
 import pwd
 import pwd
 import re
 import re
 import sys
 import sys
 import time
 import time
+import unicodedata
+
 from datetime import datetime, timezone, timedelta
 from datetime import datetime, timezone, timedelta
 from fnmatch import translate
 from fnmatch import translate
 from operator import attrgetter
 from operator import attrgetter
@@ -220,6 +223,23 @@ def exclude_path(path, patterns):
 # unify the two cases, we add a path separator to the end of
 # unify the two cases, we add a path separator to the end of
 # the path before matching.
 # the path before matching.
 
 
+def normalized(func):
+    """Decorator for the Pattern match methods: on OS X it returns a wrapper
+    that NFD-normalizes the path so it matches the NFD-normalized pattern;
+    on all other platforms it returns the original method unchanged."""
+    @wraps(func)
+    def normalize_wrapper(self, path):
+        return func(self, unicodedata.normalize("NFD", path))
+
+    if sys.platform in ('darwin',):
+        # HFS+ converts paths to a canonical form, so users shouldn't be
+        # required to enter an exact match
+        return normalize_wrapper
+    else:
+        # Windows and Unix filesystems allow different forms, so users
+        # always have to enter an exact match
+        return func
+
 class IncludePattern:
 class IncludePattern:
     """Literal files or directories listed on the command line
     """Literal files or directories listed on the command line
     for some operations (e.g. extract, but not create).
     for some operations (e.g. extract, but not create).
@@ -227,8 +247,12 @@ class IncludePattern:
     path match as well.  A trailing slash makes no difference.
     path match as well.  A trailing slash makes no difference.
     """
     """
     def __init__(self, pattern):
     def __init__(self, pattern):
+        if sys.platform in ('darwin',):
+            pattern = unicodedata.normalize("NFD", pattern)
+
         self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep
         self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep
 
 
+    @normalized
     def match(self, path):
     def match(self, path):
         return (path+os.path.sep).startswith(self.pattern)
         return (path+os.path.sep).startswith(self.pattern)
 
 
@@ -245,10 +269,15 @@ class ExcludePattern(IncludePattern):
             self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep+'*'+os.path.sep
             self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep+'*'+os.path.sep
         else:
         else:
             self.pattern = os.path.normpath(pattern)+os.path.sep+'*'
             self.pattern = os.path.normpath(pattern)+os.path.sep+'*'
+
+        if sys.platform in ('darwin',):
+            self.pattern = unicodedata.normalize("NFD", self.pattern)
+
         # fnmatch and re.match both cache compiled regular expressions.
         # fnmatch and re.match both cache compiled regular expressions.
         # Nevertheless, this is about 10 times faster.
         # Nevertheless, this is about 10 times faster.
         self.regex = re.compile(translate(self.pattern))
         self.regex = re.compile(translate(self.pattern))
 
 
+    @normalized
     def match(self, path):
     def match(self, path):
         return self.regex.match(path+os.path.sep) is not None
         return self.regex.match(path+os.path.sep) is not None
 
 

+ 68 - 1
borg/testsuite/helpers.py

@@ -3,9 +3,10 @@ from time import mktime, strptime
 from datetime import datetime, timezone, timedelta
 from datetime import datetime, timezone, timedelta
 
 
 import pytest
 import pytest
+import sys
 import msgpack
 import msgpack
 
 
-from ..helpers import adjust_patterns, exclude_path, Location, format_timedelta, ExcludePattern, make_path_safe, \
+from ..helpers import adjust_patterns, exclude_path, Location, format_timedelta, IncludePattern, ExcludePattern, make_path_safe, \
     prune_within, prune_split, \
     prune_within, prune_split, \
     StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams
     StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams
 from . import BaseTestCase
 from . import BaseTestCase
@@ -178,6 +179,72 @@ class PatternTestCase(BaseTestCase):
                           ['/etc/passwd', '/etc/hosts', '/var/log/messages', '/var/log/dmesg'])
                           ['/etc/passwd', '/etc/hosts', '/var/log/messages', '/var/log/dmesg'])
 
 
 
 
+@pytest.mark.skipif(sys.platform in ('darwin',), reason='all but OS X test')
+class PatternNonAsciiTestCase(BaseTestCase):
+    def testComposedUnicode(self):
+        pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
+        i = IncludePattern(pattern)
+        e = ExcludePattern(pattern)
+
+        assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert not i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+        assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert not e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+
+    def testDecomposedUnicode(self):
+        pattern = 'ba\N{COMBINING ACUTE ACCENT}'
+        i = IncludePattern(pattern)
+        e = ExcludePattern(pattern)
+
+        assert not i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+        assert not e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+    
+    def testInvalidUnicode(self):
+        pattern = str(b'ba\x80', 'latin1')
+        i = IncludePattern(pattern)
+        e = ExcludePattern(pattern)
+
+        assert not i.match("ba/foo")
+        assert i.match(str(b"ba\x80/foo", 'latin1'))
+        assert not e.match("ba/foo")
+        assert e.match(str(b"ba\x80/foo", 'latin1'))
+
+
+@pytest.mark.skipif(sys.platform not in ('darwin',), reason='OS X test')
+class OSXPatternNormalizationTestCase(BaseTestCase):
+    def testComposedUnicode(self):
+        pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
+        i = IncludePattern(pattern)
+        e = ExcludePattern(pattern)
+
+        assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+        assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+    
+    def testDecomposedUnicode(self):
+        pattern = 'ba\N{COMBINING ACUTE ACCENT}'
+        i = IncludePattern(pattern)
+        e = ExcludePattern(pattern)
+
+        assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+        assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+    
+    def testInvalidUnicode(self):
+        pattern = str(b'ba\x80', 'latin1')
+        i = IncludePattern(pattern)
+        e = ExcludePattern(pattern)
+
+        assert not i.match("ba/foo")
+        assert i.match(str(b"ba\x80/foo", 'latin1'))
+        assert not e.match("ba/foo")
+        assert e.match(str(b"ba\x80/foo", 'latin1'))
+
+
 def test_compression_specs():
 def test_compression_specs():
     with pytest.raises(ValueError):
     with pytest.raises(ValueError):
         CompressionSpec('')
         CompressionSpec('')