summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLars Wirzenius <liw@liw.fi>2012-11-19 19:14:10 +0000
committerLars Wirzenius <liw@liw.fi>2012-11-19 19:14:10 +0000
commit6f3cf8fc4adbc1e801e0f35d9dc3df4c539b66c5 (patch)
tree387bcae60aa16dd39bd84716f17df9b6d50a2c73
parentc730d554a92cdf9e313006cff3fd227023fb848f (diff)
parente38345111f7effb648e49fdc0b2bc434b5e7c4ec (diff)
downloadobnam-6f3cf8fc4adbc1e801e0f35d9dc3df4c539b66c5.tar.gz
Handle shared B-tree lookup errors during backups
-rw-r--r--NEWS4
-rwxr-xr-xcheck-lock-usage-from-log316
-rwxr-xr-xlock-and-increment2
-rw-r--r--obnamlib/plugins/backup_plugin.py13
-rw-r--r--obnamlib/vfs_local.py19
-rwxr-xr-xtest-many-generations2
6 files changed, 349 insertions, 7 deletions
diff --git a/NEWS b/NEWS
index 16750715..c970eba3 100644
--- a/NEWS
+++ b/NEWS
@@ -35,6 +35,10 @@ Bug fixes:
* Obnam now runs GnuPG explicitly with the "no text mode" setting, to override
a "text mode" setting in the user's configuration. The files Obnam encrypts
need to be treated as binary, not text files. Reported by Robin Sheat.
+* A shared B-tree concurrency bug has been fixed: If another instance of
+ Obnam was modifying a shared B-tree, Obnam would crash and abort a backup,
+ possibly leaving lock files lying around. Now a failure to look up a chunk
+ via its checksum is ignored, and the backup continues.
Version 1.2, released 2012-10-06
--------------------------------
diff --git a/check-lock-usage-from-log b/check-lock-usage-from-log
new file mode 100755
index 00000000..6acc2402
--- /dev/null
+++ b/check-lock-usage-from-log
@@ -0,0 +1,316 @@
+#!/usr/bin/python
+# Copyright 2012 Lars Wirzenius
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+'''Check lock file usage from log files.
+
+This program reads a number of Obnam log files, produced with tracing
+for obnamlib, and analyses them for bugs when using lock files. Each
+log file is assumed to be produced by a separate Obnam instance.
+
+* Have any instances held the same lock during overlapping periods?
+
+'''
+
+
+import cliapp
+import logging
+import os
+import re
+import time
+import ttystatus
+
+timestamp_pat = \
+ r'^\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d (?P<timestamp>\d+\.\d+) .*'
+
+lock_pat = re.compile(
+ timestamp_pat +
+ r'vfs_local.py:[0-9]*:lock: got lockname=(?P<lock>.*)')
+unlock_pat = re.compile(
+ timestamp_pat +
+ r'vfs_local.py:[0-9]*:unlock: lockname=(?P<lock>.*)')
+
+writefile_pat = re.compile(
+ timestamp_pat +
+ r'vfs_local.py:[0-9]*:write_file: write_file (?P<filename>.*)$')
+overwritefile_pat = re.compile(
+ timestamp_pat +
+ r'vfs_local.py:[0-9]*:overwrite_file: overwrite_file (?P<filename>.*)$')
+
+node_open_pat = re.compile(
+ timestamp_pat +
+ r'nodestore_disk.py:\d+:get_node: reading node \d+ from file '
+ r'(?P<nodeid>.*)$')
+node_remove_pat = re.compile(
+ timestamp_pat +
+ r'vfs_local.py:\d+:remove: remove (?P<nodeid>.*/nodes/.*)$')
+
+rename_pat = re.compile(
+ timestamp_pat +
+ r'vfs_local.py:\d+:rename: rename (?P<old>\S+) (?P<new>\S+)$')
+
+
+class LogEvent(object):
+
+ def __init__(self, logfile, lineno, timestamp):
+ self.logfile = logfile
+ self.lineno = lineno
+ self.timestamp = timestamp
+
+ def sortkey(self):
+ return self.timestamp
+
+
+class LockEvent(LogEvent):
+
+ def __init__(self, logfile, lineno, timestamp, lockname):
+ LogEvent.__init__(self, logfile, lineno, timestamp)
+ self.lockname = lockname
+
+ def __str__(self):
+ return 'Lock(%s)' % self.lockname
+
+
+class UnlockEvent(LockEvent):
+
+ def __str__(self):
+ return 'Unlock(%s)' % self.lockname
+
+
+class WriteFileEvent(LogEvent):
+
+ def __init__(self, logfile, lineno, timestamp, filename):
+ LogEvent.__init__(self, logfile, lineno, timestamp)
+ self.filename = filename
+
+ def __str__(self):
+ return 'WriteFile(%s)' % self.filename
+
+
+class OverwriteFileEvent(WriteFileEvent):
+
+ def __str__(self):
+ return 'OverwriteFile(%s)' % self.filename
+
+
+class NodeCreateEvent(LogEvent):
+
+ def __init__(self, logfile, lineno, timestamp, node_id):
+ LogEvent.__init__(self, logfile, lineno, timestamp)
+ self.node_id = node_id
+
+ def __str__(self):
+ return 'NodeCreate(%s)' % self.node_id
+
+
+class NodeDestroyEvent(NodeCreateEvent):
+
+ def __str__(self):
+ return 'NodeDestroy(%s)' % self.node_id
+
+
+class NodeReadEvent(NodeCreateEvent):
+
+ def __str__(self):
+ return 'NodeOpen(%s)' % self.node_id
+
+
+class RenameEvent(LogEvent):
+
+ def __init__(self, logfile, lineno, timestamp, old, new):
+ LogEvent.__init__(self, logfile, lineno, timestamp)
+ self.old = old
+ self.new = new
+
+ def __str__(self):
+ return 'Rename(%s -> %s)' % (self.old, self.new)
+
+
+class CheckLocks(cliapp.Application):
+
+ def setup(self):
+ self.events = []
+ self.errors = 0
+ self.latest_opened_node = None
+
+ self.patterns = [
+ (lock_pat, self.lock_event),
+ (unlock_pat, self.unlock_event),
+ (writefile_pat, self.writefile_event),
+ (overwritefile_pat, self.overwritefile_event),
+ (node_open_pat, self.read_node_event),
+ (node_remove_pat, self.node_remove_event),
+ (rename_pat, self.rename_event),
+ ]
+
+ self.ts = ttystatus.TerminalStatus()
+ self.ts.format(
+ 'Reading %ElapsedTime() %Integer(lines): %Pathname(filename)')
+ self.ts['lines'] = 0
+
+ def cleanup(self):
+ self.ts.clear()
+
+ self.analyse_phase_1()
+
+ self.ts.finish()
+ if self.errors:
+ raise cliapp.AppException('There were %d errors' % self.errors)
+
+ def error(self, msg):
+ logging.error(msg)
+ self.ts.error(msg)
+ self.errors += 1
+
+ def analyse_phase_1(self):
+ self.events.sort(key=lambda e: e.sortkey())
+ self.events = self.create_node_events(self.events)
+ self.ts.format('Phase 1: %Index(event,events)')
+ self.ts['events'] = self.events
+ self.ts.flush()
+
+ current_locks = set()
+ current_nodes = set()
+
+ for e in self.events:
+ self.ts['event'] = e
+ logging.debug(
+ 'analysing: %s:%s: %s: %s' %
+ (e.logfile, e.lineno, repr(e.sortkey()), str(e)))
+ if type(e) is LockEvent:
+ if e.lockname in current_locks:
+ self.error(
+ 'Re-locking %s: %s:%s:%s' %
+ (e.lockname, e.logfile, e.lineno,
+ e.timestamp))
+ else:
+ current_locks.add(e.lockname)
+ elif type(e) is UnlockEvent:
+ if e.lockname not in current_locks:
+ self.error(
+ 'Unlocking %s which was not locked: %s:%s:%s' %
+ (e.lockname, e.logfile, e.lineno,
+ e.timestamp))
+ else:
+ current_locks.remove(e.lockname)
+ elif type(e) in (WriteFileEvent, OverwriteFileEvent):
+ lockname = self.determine_lockfile(e.filename)
+ if lockname and lockname not in current_locks:
+ self.error(
+ '%s:%s: '
+ 'Write to file %s despite lock %s not existing' %
+ (e.logfile, e.lineno, e.filename, lockname))
+ elif type(e) is NodeCreateEvent:
+ if e.node_id in current_nodes:
+ self.error(
+ '%s:%s: Node %s already exists' %
+ (e.logfile, e.lineno, e.node_id))
+ else:
+ current_nodes.add(e.node_id)
+ elif type(e) is NodeDestroyEvent:
+ if e.node_id not in current_nodes:
+ self.error(
+ '%s:%s: Node %s does not exist' %
+ (e.logfile, e.lineno, e.node_id))
+ else:
+ current_nodes.remove(e.node_id)
+ elif type(e) is NodeReadEvent:
+ if e.node_id not in current_nodes:
+ self.error(
+ '%s:%s: Node %s does not exist' %
+ (e.logfile, e.lineno, e.node_id))
+ elif type(e) is RenameEvent:
+ if e.old in current_nodes:
+ current_nodes.remove(e.old)
+ current_nodes.add(e.new)
+ else:
+ raise NotImplementedError()
+
+ def create_node_events(self, events):
+ new = []
+ for e in events:
+ new.append(e)
+ if type(e) in (WriteFileEvent, OverwriteFileEvent):
+ if '/nodes/' in e.filename:
+ new_e = NodeCreateEvent(
+ e.logfile, e.lineno, e.timestamp, e.filename)
+ new_e.timestamp = e.timestamp
+ new.append(new_e)
+ return new
+
+ def determine_lockfile(self, filename):
+ if filename.endswith('/lock'):
+ return None
+ toplevel = filename.split('/')[0]
+ if toplevel == 'chunks':
+ return None
+ if toplevel in ('metadata', 'clientlist'):
+ return './lock'
+ return toplevel + '/lock'
+
+ def process_input(self, name):
+ self.ts['filename'] = name
+ return cliapp.Application.process_input(self, name)
+
+ def process_input_line(self, filename, line):
+ self.ts['lines'] = self.global_lineno
+ for pat, func in self.patterns:
+ m = pat.search(line)
+ if m:
+ event = func(filename, line, m)
+ if event is not None:
+ self.events.append(event)
+
+ def lock_event(self, filename, line, match):
+ return LockEvent(
+ filename, self.lineno, float(match.group('timestamp')),
+ match.group('lock'))
+
+ def unlock_event(self, filename, line, match):
+ return UnlockEvent(
+ filename, self.lineno, float(match.group('timestamp')),
+ match.group('lock'))
+
+ def writefile_event(self, filename, line, match):
+ return WriteFileEvent(
+ filename, self.lineno, float(match.group('timestamp')),
+ match.group('filename'))
+
+ def overwritefile_event(self, filename, line, match):
+ return OverwriteFileEvent(
+ filename, self.lineno, float(match.group('timestamp')),
+ match.group('filename'))
+
+ def read_node_event(self, filename, line, match):
+ node_id = match.group('nodeid')
+ if not os.path.basename(node_id).startswith('tmp'):
+ return NodeReadEvent(
+ filename, self.lineno, float(match.group('timestamp')),
+ node_id)
+
+ def node_remove_event(self, filename, line, match):
+ return NodeDestroyEvent(
+ filename, self.lineno, float(match.group('timestamp')),
+ match.group('nodeid'))
+
+ def rename_event(self, filename, line, match):
+ return RenameEvent(
+ filename, self.lineno, float(match.group('timestamp')),
+ match.group('old'), match.group('new'))
+
+
+CheckLocks().run()
+
diff --git a/lock-and-increment b/lock-and-increment
index d7b945c1..18935762 100755
--- a/lock-and-increment
+++ b/lock-and-increment
@@ -26,7 +26,7 @@ count_to = int(sys.argv[2])
filename = os.path.join(dirname, 'counter')
fs = obnamlib.LocalFS('/')
-lm = obnamlib.LockManager(fs, 60)
+lm = obnamlib.LockManager(fs, 60, 'lock-and-increment')
for i in range(count_to):
lm.lock([dirname])
diff --git a/obnamlib/plugins/backup_plugin.py b/obnamlib/plugins/backup_plugin.py
index 9907cc3d..43973fa0 100644
--- a/obnamlib/plugins/backup_plugin.py
+++ b/obnamlib/plugins/backup_plugin.py
@@ -27,6 +27,7 @@ import tracing
import ttystatus
import obnamlib
+import larch
class ChunkidPool(object):
@@ -610,8 +611,16 @@ class BackupPlugin(obnamlib.ObnamPlugin):
'''Back up a chunk of data by putting it into the repository.'''
def find():
- return (self.repo.find_chunks(checksum) +
- self.chunkid_pool.get(checksum))
+ # We ignore lookup errors here intentionally. We're reading
+ # the checksum trees without a lock, so another Obnam may be
+ # modifying them, which can lead to spurious NodeMissing
+ # exceptions, and other errors. We don't care: we'll just
+ # pretend no chunk with the checksum exists yet.
+ try:
+ in_tree = self.repo.find_chunks(checksum)
+ except larch.Error:
+ in_tree = []
+ return in_tree + self.chunkid_pool.get(checksum)
def get(chunkid):
return self.repo.get_chunk(chunkid)
diff --git a/obnamlib/vfs_local.py b/obnamlib/vfs_local.py
index 9b456f9f..bf3250a5 100644
--- a/obnamlib/vfs_local.py
+++ b/obnamlib/vfs_local.py
@@ -23,6 +23,7 @@ import math
import os
import pwd
import tempfile
+import time
import tracing
import obnamlib
@@ -56,6 +57,10 @@ class LocalFS(obnamlib.VirtualFileSystem):
tracing.trace('create=%s', create)
obnamlib.VirtualFileSystem.__init__(self, baseurl)
self.reinit(baseurl, create=create)
+
+ # For checking that we do not unlock something we didn't lock
+ # ourselves.
+ self.our_locks = set()
# For testing purposes, allow setting a limit on write operations
# after which an exception gets raised. If set to None, no crash.
@@ -101,7 +106,7 @@ class LocalFS(obnamlib.VirtualFileSystem):
self.cwd = newcwd
def lock(self, lockname, data):
- tracing.trace('lockname=%s', lockname)
+ tracing.trace('attempting lockname=%s', lockname)
try:
self.write_file(lockname, data)
except OSError, e:
@@ -109,11 +114,16 @@ class LocalFS(obnamlib.VirtualFileSystem):
raise obnamlib.LockFail("Lock %s already exists" % lockname)
else:
raise # pragma: no cover
+ tracing.trace('got lockname=%s', lockname)
+ tracing.trace('time=%f' % time.time())
+ self.our_locks.add(lockname)
def unlock(self, lockname):
tracing.trace('lockname=%s', lockname)
- if self.exists(lockname):
- self.remove(lockname)
+ assert lockname in self.our_locks
+ self.remove(lockname)
+ self.our_locks.remove(lockname)
+ tracing.trace('time=%f' % time.time())
def join(self, pathname):
return os.path.join(self.cwd, pathname)
@@ -253,6 +263,7 @@ class LocalFS(obnamlib.VirtualFileSystem):
self.maybe_crash()
def cat(self, pathname):
+ tracing.trace('pathname=%s' % pathname)
pathname = self.join(pathname)
f = self.open(pathname, 'rb')
chunks = []
@@ -291,6 +302,7 @@ class LocalFS(obnamlib.VirtualFileSystem):
pass
else:
os.remove(tempname)
+ tracing.trace('link+remove worked')
return
# Nope, didn't work. Now try with O_EXCL instead.
@@ -302,6 +314,7 @@ class LocalFS(obnamlib.VirtualFileSystem):
# Give up.
os.remove(tempname)
raise
+ tracing.trace('O_EXCL+rename worked')
self.maybe_crash()
diff --git a/test-many-generations b/test-many-generations
index d1acefc4..7a6476c0 100755
--- a/test-many-generations
+++ b/test-many-generations
@@ -41,7 +41,7 @@ cat <<EOF > "$conf"
client-name = $client
quiet = yes
log = $client.log
-trace = obnamlib
+trace = obnamlib, larch
repository = $repourl
root = $root
EOF