Przeglądaj źródła

Merge pull request #932 from enkore/feature/inorder

create: Visit files in inode order
TW 9 lat temu
rodzic
commit
14bd4c756d
3 zmienionych plików z 132 dodań i 35 usunięć
  1. 58 32
      borg/archiver.py
  2. 66 0
      borg/helpers.py
  3. 8 3
      borg/testsuite/archiver.py

+ 58 - 32
borg/archiver.py

@@ -25,6 +25,7 @@ from .helpers import Error, location_validator, archivename_validator, format_ti
     log_multi, PatternMatcher, ItemFormatter
     log_multi, PatternMatcher, ItemFormatter
 from .logger import create_logger, setup_logging
 from .logger import create_logger, setup_logging
 logger = create_logger()
 logger = create_logger()
+from . import helpers
 from .compress import Compressor, COMPR_BUFFER
 from .compress import Compressor, COMPR_BUFFER
 from .upgrader import AtticRepositoryUpgrader, BorgRepositoryUpgrader
 from .upgrader import AtticRepositoryUpgrader, BorgRepositoryUpgrader
 from .repository import Repository
 from .repository import Repository
@@ -247,17 +248,18 @@ class Archiver:
                     self.print_file_status(status, path)
                     self.print_file_status(status, path)
                     continue
                     continue
                 path = os.path.normpath(path)
                 path = os.path.normpath(path)
+                try:
+                    st = os.lstat(path)
+                except OSError as e:
+                    self.print_warning('%s: %s', path, e)
+                    continue
                 if args.one_file_system:
                 if args.one_file_system:
-                    try:
-                        restrict_dev = os.lstat(path).st_dev
-                    except OSError as e:
-                        self.print_warning('%s: %s', path, e)
-                        continue
+                    restrict_dev = st.st_dev
                 else:
                 else:
                     restrict_dev = None
                     restrict_dev = None
                 self._process(archive, cache, matcher, args.exclude_caches, args.exclude_if_present,
                 self._process(archive, cache, matcher, args.exclude_caches, args.exclude_if_present,
                               args.keep_tag_files, skip_inodes, path, restrict_dev,
                               args.keep_tag_files, skip_inodes, path, restrict_dev,
-                              read_special=args.read_special, dry_run=dry_run)
+                              read_special=args.read_special, dry_run=dry_run, st=st)
             if not dry_run:
             if not dry_run:
                 archive.save(comment=args.comment, timestamp=args.timestamp)
                 archive.save(comment=args.comment, timestamp=args.timestamp)
                 if args.progress:
                 if args.progress:
@@ -292,16 +294,16 @@ class Archiver:
 
 
     def _process(self, archive, cache, matcher, exclude_caches, exclude_if_present,
     def _process(self, archive, cache, matcher, exclude_caches, exclude_if_present,
                  keep_tag_files, skip_inodes, path, restrict_dev,
                  keep_tag_files, skip_inodes, path, restrict_dev,
-                 read_special=False, dry_run=False):
+                 read_special=False, dry_run=False, st=None):
         if not matcher.match(path):
         if not matcher.match(path):
             self.print_file_status('x', path)
             self.print_file_status('x', path)
             return
             return
-
-        try:
-            st = os.lstat(path)
-        except OSError as e:
-            self.print_warning('%s: %s', path, e)
-            return
+        if st is None:
+            try:
+                st = os.lstat(path)
+            except OSError as e:
+                self.print_warning('%s: %s', path, e)
+                return
         if (st.st_ino, st.st_dev) in skip_inodes:
         if (st.st_ino, st.st_dev) in skip_inodes:
             return
             return
         # Entering a new filesystem?
         # Entering a new filesystem?
@@ -331,15 +333,15 @@ class Archiver:
             if not dry_run:
             if not dry_run:
                 status = archive.process_dir(path, st)
                 status = archive.process_dir(path, st)
             try:
             try:
-                entries = os.listdir(path)
+                entries = helpers.scandir_inorder(path)
             except OSError as e:
             except OSError as e:
                 status = 'E'
                 status = 'E'
                 self.print_warning('%s: %s', path, e)
                 self.print_warning('%s: %s', path, e)
             else:
             else:
-                for filename in sorted(entries):
-                    entry_path = os.path.normpath(os.path.join(path, filename))
+                for dirent in entries:
+                    normpath = os.path.normpath(dirent.path)
                     self._process(archive, cache, matcher, exclude_caches, exclude_if_present,
                     self._process(archive, cache, matcher, exclude_caches, exclude_if_present,
-                                  keep_tag_files, skip_inodes, entry_path, restrict_dev,
+                                  keep_tag_files, skip_inodes, normpath, restrict_dev,
                                   read_special=read_special, dry_run=dry_run)
                                   read_special=read_special, dry_run=dry_run)
         elif stat.S_ISLNK(st.st_mode):
         elif stat.S_ISLNK(st.st_mode):
             if not dry_run:
             if not dry_run:
@@ -461,7 +463,7 @@ class Archiver:
                 return [None]
                 return [None]
 
 
         def has_hardlink_master(item, hardlink_masters):
         def has_hardlink_master(item, hardlink_masters):
-            return item.get(b'source') in hardlink_masters and get_mode(item)[0] != 'l'
+            return stat.S_ISREG(item[b'mode']) and item.get(b'source') in hardlink_masters
 
 
         def compare_link(item1, item2):
         def compare_link(item1, item2):
             # These are the simple link cases. For special cases, e.g. if a
             # These are the simple link cases. For special cases, e.g. if a
@@ -524,9 +526,6 @@ class Archiver:
             """
             """
             changes = []
             changes = []
 
 
-            if item1.get(b'hardlink_master') or item2.get(b'hardlink_master'):
-                hardlink_masters[path] = (item1, item2)
-
             if has_hardlink_master(item1, hardlink_masters):
             if has_hardlink_master(item1, hardlink_masters):
                 item1 = hardlink_masters[item1[b'source']][0]
                 item1 = hardlink_masters[item1[b'source']][0]
 
 
@@ -559,8 +558,26 @@ class Archiver:
             print("{:<19} {}".format(line[1], line[0]))
             print("{:<19} {}".format(line[1], line[0]))
 
 
         def compare_archives(archive1, archive2, matcher):
         def compare_archives(archive1, archive2, matcher):
+            def hardlink_master_seen(item):
+                return b'source' not in item or not stat.S_ISREG(item[b'mode']) or item[b'source'] in hardlink_masters
+
+            def is_hardlink_master(item):
+                return item.get(b'hardlink_master', True) and b'source' not in item
+
+            def update_hardlink_masters(item1, item2):
+                if is_hardlink_master(item1) or is_hardlink_master(item2):
+                    hardlink_masters[item1[b'path']] = (item1, item2)
+
+            def compare_or_defer(item1, item2):
+                update_hardlink_masters(item1, item2)
+                if not hardlink_master_seen(item1) or not hardlink_master_seen(item2):
+                    deferred.append((item1, item2))
+                else:
+                    compare_items(output, item1[b'path'], item1, item2, hardlink_masters)
+
             orphans_archive1 = collections.OrderedDict()
             orphans_archive1 = collections.OrderedDict()
             orphans_archive2 = collections.OrderedDict()
             orphans_archive2 = collections.OrderedDict()
+            deferred = []
             hardlink_masters = {}
             hardlink_masters = {}
             output = []
             output = []
 
 
@@ -569,31 +586,40 @@ class Archiver:
                     archive2.iter_items(lambda item: matcher.match(item[b'path'])),
                     archive2.iter_items(lambda item: matcher.match(item[b'path'])),
             ):
             ):
                 if item1 and item2 and item1[b'path'] == item2[b'path']:
                 if item1 and item2 and item1[b'path'] == item2[b'path']:
-                    compare_items(output, item1[b'path'], item1, item2, hardlink_masters)
+                    compare_or_defer(item1, item2)
                     continue
                     continue
                 if item1:
                 if item1:
                     matching_orphan = orphans_archive2.pop(item1[b'path'], None)
                     matching_orphan = orphans_archive2.pop(item1[b'path'], None)
                     if matching_orphan:
                     if matching_orphan:
-                        compare_items(output, item1[b'path'], item1, matching_orphan, hardlink_masters)
+                        compare_or_defer(item1, matching_orphan)
                     else:
                     else:
                         orphans_archive1[item1[b'path']] = item1
                         orphans_archive1[item1[b'path']] = item1
                 if item2:
                 if item2:
                     matching_orphan = orphans_archive1.pop(item2[b'path'], None)
                     matching_orphan = orphans_archive1.pop(item2[b'path'], None)
                     if matching_orphan:
                     if matching_orphan:
-                        compare_items(output, item2[b'path'], matching_orphan, item2, hardlink_masters)
+                        compare_or_defer(matching_orphan, item2)
                     else:
                     else:
                         orphans_archive2[item2[b'path']] = item2
                         orphans_archive2[item2[b'path']] = item2
             # At this point orphans_* contain items that had no matching partner in the other archive
             # At this point orphans_* contain items that had no matching partner in the other archive
+            deleted_item = {
+                b'deleted': True,
+                b'chunks': [],
+                b'mode': 0,
+            }
             for added in orphans_archive2.values():
             for added in orphans_archive2.values():
-                compare_items(output, added[b'path'], {
-                    b'deleted': True,
-                    b'chunks': [],
-                }, added, hardlink_masters, deleted=True)
+                path = added[b'path']
+                deleted_item[b'path'] = path
+                update_hardlink_masters(deleted_item, added)
+                compare_items(output, path, deleted_item, added, hardlink_masters, deleted=True)
             for deleted in orphans_archive1.values():
             for deleted in orphans_archive1.values():
-                compare_items(output, deleted[b'path'], deleted, {
-                    b'deleted': True,
-                    b'chunks': [],
-                }, hardlink_masters, deleted=True)
+                path = deleted[b'path']
+                deleted_item[b'path'] = path
+                update_hardlink_masters(deleted, deleted_item)
+                compare_items(output, path, deleted, deleted_item, hardlink_masters, deleted=True)
+            for item1, item2 in deferred:
+                assert hardlink_master_seen(item1)
+                assert hardlink_master_seen(item2)
+                compare_items(output, item1[b'path'], item1, item2, hardlink_masters)
 
 
             for line in sorted(output):
             for line in sorted(output):
                 print_output(line)
                 print_output(line)

+ 66 - 0
borg/helpers.py

@@ -6,6 +6,7 @@ import grp
 import hashlib
 import hashlib
 from itertools import islice
 from itertools import islice
 import os
 import os
+import os.path
 import stat
 import stat
 import textwrap
 import textwrap
 import pwd
 import pwd
@@ -1349,3 +1350,68 @@ def consume(iterator, n=None):
     else:
     else:
         # advance to the empty slice starting at position n
         # advance to the empty slice starting at position n
         next(islice(iterator, n, n), None)
         next(islice(iterator, n, n), None)
+
+# GenericDirEntry, scandir_generic (c) 2012 Ben Hoyt
+# from the python-scandir package (3-clause BSD license, just like us, so no troubles here)
+# note: simplified version
+
+
+class GenericDirEntry:
+    __slots__ = ('name', '_scandir_path', '_path')
+
+    def __init__(self, scandir_path, name):
+        self._scandir_path = scandir_path
+        self.name = name
+        self._path = None
+
+    @property
+    def path(self):
+        if self._path is None:
+            self._path = os.path.join(self._scandir_path, self.name)
+        return self._path
+
+    def stat(self, follow_symlinks=True):
+        assert not follow_symlinks
+        return os.lstat(self.path)
+
+    def _check_type(self, type):
+        st = self.stat(False)
+        return stat.S_IFMT(st.st_mode) == type
+
+    def is_dir(self, follow_symlinks=True):
+        assert not follow_symlinks
+        return self._check_type(stat.S_IFDIR)
+
+    def is_file(self, follow_symlinks=True):
+        assert not follow_symlinks
+        return self._check_type(stat.S_IFREG)
+
+    def is_symlink(self):
+        return self._check_type(stat.S_IFLNK)
+
+    def inode(self):
+        st = self.stat(False)
+        return st.st_ino
+
+    def __repr__(self):
+        return '<{0}: {1!r}>'.format(self.__class__.__name__, self.path)
+
+
+def scandir_generic(path='.'):
+    """Like os.listdir(), but yield DirEntry objects instead of returning a list of names."""
+    for name in sorted(os.listdir(path)):
+        yield GenericDirEntry(path, name)
+
+try:
+    from os import scandir
+except ImportError:
+    try:
+        # Try python-scandir on Python 3.4
+        from scandir import scandir
+    except ImportError:
+        # If python-scandir is not installed, then use a version that is just as slow as listdir.
+        scandir = scandir_generic
+
+
+def scandir_inorder(path='.'):
+    return sorted(scandir(path), key=lambda dirent: dirent.inode())

+ 8 - 3
borg/testsuite/archiver.py

@@ -17,7 +17,7 @@ from hashlib import sha256
 
 
 import pytest
 import pytest
 
 
-from .. import xattr
+from .. import xattr, helpers
 from ..archive import Archive, ChunkBuffer, ArchiveRecreater
 from ..archive import Archive, ChunkBuffer, ArchiveRecreater
 from ..archiver import Archiver
 from ..archiver import Archiver
 from ..cache import Cache
 from ..cache import Cache
@@ -1314,11 +1314,16 @@ class ArchiverTestCase(ArchiverTestCaseBase):
         assert 'dir2/abcdef' in files
         assert 'dir2/abcdef' in files
         assert 'file1' not in files
         assert 'file1' not in files
 
 
+    # The _test_recreate_interrupt requires a deterministic (alphabetic) order of the files to easily check if
+    # resumption works correctly. Patch scandir_inorder to work in alphabetic order.
+
     def test_recreate_interrupt(self):
     def test_recreate_interrupt(self):
-        self._test_recreate_interrupt(False, True)
+        with patch.object(helpers, 'scandir_inorder', helpers.scandir_generic):
+            self._test_recreate_interrupt(False, True)
 
 
     def test_recreate_interrupt2(self):
     def test_recreate_interrupt2(self):
-        self._test_recreate_interrupt(True, False)
+        with patch.object(helpers, 'scandir_inorder', helpers.scandir_generic):
+            self._test_recreate_interrupt(True, False)
 
 
     def _test_recreate_chunker_interrupt_patch(self):
     def _test_recreate_chunker_interrupt_patch(self):
         real_add_chunk = Cache.add_chunk
         real_add_chunk = Cache.add_chunk