Explorar o código

Add tests for stale lock killing and platform.process_alive

Marian Beermann %!s(int64=8) %!d(string=hai) anos
pai
achega
cc14975f2d
Modificáronse 4 ficheiros con 116 adicións e 27 borrados
  1. 11 12
      src/borg/locking.py
  2. 10 5
      src/borg/platform/posix.pyx
  3. 73 10
      src/borg/testsuite/locking.py
  4. 22 0
      src/borg/testsuite/platform.py

+ 11 - 12
src/borg/locking.py

@@ -130,8 +130,7 @@ class ExclusiveLock:
             except FileExistsError:  # already locked
             except FileExistsError:  # already locked
                 if self.by_me():
                 if self.by_me():
                     return self
                     return self
-                if self.kill_stale_lock():
-                    pass
+                self.kill_stale_lock()
                 if timer.timed_out_or_sleep():
                 if timer.timed_out_or_sleep():
                     raise LockTimeout(self.path)
                     raise LockTimeout(self.path)
             except OSError as err:
             except OSError as err:
@@ -167,7 +166,7 @@ class ExclusiveLock:
                 # It's safer to just exit
                 # It's safer to just exit
                 return False
                 return False
 
 
-            if not platform.process_alive(host, pid, thread):
+            if platform.process_alive(host, pid, thread):
                 return False
                 return False
 
 
             if not self.ok_to_kill_stale_locks:
             if not self.ok_to_kill_stale_locks:
@@ -224,17 +223,17 @@ class LockRoster:
             # Just nuke the stale locks early on load
             # Just nuke the stale locks early on load
             if self.ok_to_kill_zombie_locks:
             if self.ok_to_kill_zombie_locks:
                 for key in (SHARED, EXCLUSIVE):
                 for key in (SHARED, EXCLUSIVE):
-                    elements = set()
                     try:
                     try:
-                        for e in data[key]:
-                            (host, pid, thread) = e
-                            if not platform.process_alive(host, pid, thread):
-                                elements.add(tuple(e))
-                            else:
-                                logger.warning('Removed stale %s roster lock for pid %d.', key, pid)
-                        data[key] = list(list(e) for e in elements)
+                        entries = data[key]
                     except KeyError:
                     except KeyError:
-                        pass
+                        continue
+                    elements = set()
+                    for host, pid, thread in entries:
+                        if platform.process_alive(host, pid, thread):
+                            elements.add((host, pid, thread))
+                        else:
+                            logger.warning('Removed stale %s roster lock for pid %d.', key, pid)
+                    data[key] = list(elements)
         except (FileNotFoundError, ValueError):
         except (FileNotFoundError, ValueError):
             # no or corrupt/empty roster file?
             # no or corrupt/empty roster file?
             data = {}
             data = {}

+ 10 - 5
src/borg/platform/posix.pyx

@@ -35,18 +35,23 @@ def get_process_id():
 
 
 
 
 def process_alive(host, pid, thread):
 def process_alive(host, pid, thread):
-    """Check if the (host, pid, thread_id) combination corresponds to a dead process on our local node or not."""
+    """
+    Check if the (host, pid, thread_id) combination corresponds to a potentially alive process.
+
+    If the process is local, then this will be accurate. If the process is not local, then this
+    returns always True, since there is no real way to check.
+    """
     from . import local_pid_alive
     from . import local_pid_alive
 
 
     if host != _hostname:
     if host != _hostname:
-        return False
+        return True
 
 
     if thread != 0:
     if thread != 0:
         # Currently thread is always 0, if we ever decide to set this to a non-zero value,
         # Currently thread is always 0, if we ever decide to set this to a non-zero value,
         # this code needs to be revisited, too, to do a sensible thing
         # this code needs to be revisited, too, to do a sensible thing
-        return False
+        return True
 
 
-    return local_pid_alive
+    return local_pid_alive(pid)
 
 
 def local_pid_alive(pid):
 def local_pid_alive(pid):
     """Return whether *pid* is alive."""
     """Return whether *pid* is alive."""
@@ -62,4 +67,4 @@ def local_pid_alive(pid):
             # ESRCH = no such process
             # ESRCH = no such process
             return False
             return False
         # Any other error (eg. permissions) mean that the process ID refers to a live process
         # Any other error (eg. permissions) mean that the process ID refers to a live process
-        return True
+        return True

+ 73 - 10
src/borg/testsuite/locking.py

@@ -1,22 +1,25 @@
+import random
 import time
 import time
 
 
 import pytest
 import pytest
 
 
-from ..locking import get_id, TimeoutTimer, ExclusiveLock, Lock, LockRoster, \
-                      ADD, REMOVE, SHARED, EXCLUSIVE, LockTimeout
-
+from ..platform import get_process_id, process_alive
+from ..locking import TimeoutTimer, ExclusiveLock, Lock, LockRoster, \
+                      ADD, REMOVE, SHARED, EXCLUSIVE, LockTimeout, NotLocked, NotMyLock
 
 
 ID1 = "foo", 1, 1
 ID1 = "foo", 1, 1
 ID2 = "bar", 2, 2
 ID2 = "bar", 2, 2
 
 
 
 
-def test_id():
-    hostname, pid, tid = get_id()
-    assert isinstance(hostname, str)
-    assert isinstance(pid, int)
-    assert isinstance(tid, int)
-    assert len(hostname) > 0
-    assert pid > 0
+@pytest.fixture()
+def free_pid():
+    """Return a free PID not used by any process (naturally this is racy)"""
+    host, pid, tid = get_process_id()
+    while True:
+        # PIDs are often restricted to a small range. On Linux the range >32k is by default not used.
+        pid = random.randint(33000, 65000)
+        if not process_alive(host, pid, tid):
+            return pid
 
 
 
 
 class TestTimeoutTimer:
 class TestTimeoutTimer:
@@ -57,6 +60,22 @@ class TestExclusiveLock:
             with pytest.raises(LockTimeout):
             with pytest.raises(LockTimeout):
                 ExclusiveLock(lockpath, id=ID2, timeout=0.1).acquire()
                 ExclusiveLock(lockpath, id=ID2, timeout=0.1).acquire()
 
 
+    def test_kill_stale(self, lockpath, free_pid):
+        host, pid, tid = our_id = get_process_id()
+        dead_id = host, free_pid, tid
+        cant_know_if_dead_id = 'foo.bar.example.net', 1, 2
+
+        dead_lock = ExclusiveLock(lockpath, id=dead_id).acquire()
+        with ExclusiveLock(lockpath, id=our_id, kill_stale_locks=True):
+            with pytest.raises(NotMyLock):
+                dead_lock.release()
+        with pytest.raises(NotLocked):
+            dead_lock.release()
+
+        with ExclusiveLock(lockpath, id=cant_know_if_dead_id):
+            with pytest.raises(LockTimeout):
+                ExclusiveLock(lockpath, id=our_id, kill_stale_locks=True, timeout=0.1).acquire()
+
 
 
 class TestLock:
 class TestLock:
     def test_shared(self, lockpath):
     def test_shared(self, lockpath):
@@ -117,6 +136,25 @@ class TestLock:
             with pytest.raises(LockTimeout):
             with pytest.raises(LockTimeout):
                 Lock(lockpath, exclusive=True, id=ID2, timeout=0.1).acquire()
                 Lock(lockpath, exclusive=True, id=ID2, timeout=0.1).acquire()
 
 
+    def test_kill_stale(self, lockpath, free_pid):
+        host, pid, tid = our_id = get_process_id()
+        dead_id = host, free_pid, tid
+        cant_know_if_dead_id = 'foo.bar.example.net', 1, 2
+
+        dead_lock = Lock(lockpath, id=dead_id, exclusive=True).acquire()
+        roster = dead_lock._roster
+        with Lock(lockpath, id=our_id, kill_stale_locks=True):
+            assert roster.get(EXCLUSIVE) == set()
+            assert roster.get(SHARED) == {our_id}
+        assert roster.get(EXCLUSIVE) == set()
+        assert roster.get(SHARED) == set()
+        with pytest.raises(KeyError):
+            dead_lock.release()
+
+        with Lock(lockpath, id=cant_know_if_dead_id, exclusive=True):
+            with pytest.raises(LockTimeout):
+                Lock(lockpath, id=our_id, kill_stale_locks=True, timeout=0.1).acquire()
+
 
 
 @pytest.fixture()
 @pytest.fixture()
 def rosterpath(tmpdir):
 def rosterpath(tmpdir):
@@ -144,3 +182,28 @@ class TestLockRoster:
         roster2 = LockRoster(rosterpath, id=ID2)
         roster2 = LockRoster(rosterpath, id=ID2)
         roster2.modify(SHARED, REMOVE)
         roster2.modify(SHARED, REMOVE)
         assert roster2.get(SHARED) == set()
         assert roster2.get(SHARED) == set()
+
+    def test_kill_stale(self, rosterpath, free_pid):
+        host, pid, tid = our_id = get_process_id()
+        dead_id = host, free_pid, tid
+
+        roster1 = LockRoster(rosterpath, id=dead_id)
+        assert roster1.get(SHARED) == set()
+        roster1.modify(SHARED, ADD)
+        assert roster1.get(SHARED) == {dead_id}
+
+        cant_know_if_dead_id = 'foo.bar.example.net', 1, 2
+        roster1 = LockRoster(rosterpath, id=cant_know_if_dead_id)
+        assert roster1.get(SHARED) == {dead_id}
+        roster1.modify(SHARED, ADD)
+        assert roster1.get(SHARED) == {dead_id, cant_know_if_dead_id}
+
+        killer_roster = LockRoster(rosterpath, kill_stale_locks=True)
+        # Did kill the dead processes lock (which was alive ... I guess?!)
+        assert killer_roster.get(SHARED) == {cant_know_if_dead_id}
+        killer_roster.modify(SHARED, ADD)
+        assert killer_roster.get(SHARED) == {our_id, cant_know_if_dead_id}
+
+        other_killer_roster = LockRoster(rosterpath, kill_stale_locks=True)
+        # Did not kill us, since we're alive
+        assert other_killer_roster.get(SHARED) == {our_id, cant_know_if_dead_id}

+ 22 - 0
src/borg/testsuite/platform.py

@@ -1,5 +1,6 @@
 import functools
 import functools
 import os
 import os
+import random
 import shutil
 import shutil
 import sys
 import sys
 import tempfile
 import tempfile
@@ -7,7 +8,9 @@ import pwd
 import unittest
 import unittest
 
 
 from ..platform import acl_get, acl_set, swidth
 from ..platform import acl_get, acl_set, swidth
+from ..platform import get_process_id, process_alive
 from . import BaseTestCase, unopened_tempfile
 from . import BaseTestCase, unopened_tempfile
+from .locking import free_pid
 
 
 
 
 ACCESS_ACL = """
 ACCESS_ACL = """
@@ -186,3 +189,22 @@ class PlatformPosixTestCase(BaseTestCase):
 
 
     def test_swidth_mixed(self):
     def test_swidth_mixed(self):
         self.assert_equal(swidth("borgバックアップ"), 4 + 6 * 2)
         self.assert_equal(swidth("borgバックアップ"), 4 + 6 * 2)
+
+
+def test_process_alive(free_pid):
+    id = get_process_id()
+    assert process_alive(*id)
+    host, pid, tid = id
+    assert process_alive(host + 'abc', pid, tid)
+    assert process_alive(host, pid, tid + 1)
+    assert not process_alive(host, free_pid, tid)
+
+
+def test_process_id():
+    hostname, pid, tid = get_process_id()
+    assert isinstance(hostname, str)
+    assert isinstance(pid, int)
+    assert isinstance(tid, int)
+    assert len(hostname) > 0
+    assert pid > 0
+    assert get_process_id() == (hostname, pid, tid)