diff options
author | Lars Wirzenius <liw@liw.fi> | 2012-11-19 19:14:10 +0000 |
---|---|---|
committer | Lars Wirzenius <liw@liw.fi> | 2012-11-19 19:14:10 +0000 |
commit | 6f3cf8fc4adbc1e801e0f35d9dc3df4c539b66c5 (patch) | |
tree | 387bcae60aa16dd39bd84716f17df9b6d50a2c73 | |
parent | c730d554a92cdf9e313006cff3fd227023fb848f (diff) | |
parent | e38345111f7effb648e49fdc0b2bc434b5e7c4ec (diff) | |
download | obnam-6f3cf8fc4adbc1e801e0f35d9dc3df4c539b66c5.tar.gz |
Handle shared B-tree lookup errors during backups
-rw-r--r-- | NEWS | 4 | ||||
-rwxr-xr-x | check-lock-usage-from-log | 316 | ||||
-rwxr-xr-x | lock-and-increment | 2 | ||||
-rw-r--r-- | obnamlib/plugins/backup_plugin.py | 13 | ||||
-rw-r--r-- | obnamlib/vfs_local.py | 19 | ||||
-rwxr-xr-x | test-many-generations | 2 |
6 files changed, 349 insertions, 7 deletions
@@ -35,6 +35,10 @@ Bug fixes: * Obnam now runs GnuPG explicitly with the "no text mode" setting, to override a "text mode" setting in the user's configuration. The files Obnam encrypts need to be treated as binary, not text files. Reported by Robin Sheat. +* A shared B-tree concurrency bug has been fixed: If another instance of + Obnam was modifying a shared B-tree, Obnam would crash and abort a backup, + possibly leaving lock files lying around. Now a failure to look up a chunk + via its checksum is ignored, and the backup continues. Version 1.2, released 2012-10-06 -------------------------------- diff --git a/check-lock-usage-from-log b/check-lock-usage-from-log new file mode 100755 index 00000000..6acc2402 --- /dev/null +++ b/check-lock-usage-from-log @@ -0,0 +1,316 @@ +#!/usr/bin/python +# Copyright 2012 Lars Wirzenius +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + + +'''Check lock file usage from log files. + +This program reads a number of Obnam log files, produced with tracing +for obnamlib, and analyses them for bugs when using lock files. Each +log file is assumed to be produced by a separate Obnam instance. + +* Have any instances held the same lock during overlapping periods? + +''' + + +import cliapp +import logging +import os +import re +import time +import ttystatus + +timestamp_pat = \ + r'^\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d (?P<timestamp>\d+\.\d+) .*' + +lock_pat = re.compile( + timestamp_pat + + r'vfs_local.py:[0-9]*:lock: got lockname=(?P<lock>.*)') +unlock_pat = re.compile( + timestamp_pat + + r'vfs_local.py:[0-9]*:unlock: lockname=(?P<lock>.*)') + +writefile_pat = re.compile( + timestamp_pat + + r'vfs_local.py:[0-9]*:write_file: write_file (?P<filename>.*)$') +overwritefile_pat = re.compile( + timestamp_pat + + r'vfs_local.py:[0-9]*:overwrite_file: overwrite_file (?P<filename>.*)$') + +node_open_pat = re.compile( + timestamp_pat + + r'nodestore_disk.py:\d+:get_node: reading node \d+ from file ' + r'(?P<nodeid>.*)$') +node_remove_pat = re.compile( + timestamp_pat + + r'vfs_local.py:\d+:remove: remove (?P<nodeid>.*/nodes/.*)$') + +rename_pat = re.compile( + timestamp_pat + + r'vfs_local.py:\d+:rename: rename (?P<old>\S+) (?P<new>\S+)$') + + +class LogEvent(object): + + def __init__(self, logfile, lineno, timestamp): + self.logfile = logfile + self.lineno = lineno + self.timestamp = timestamp + + def sortkey(self): + return self.timestamp + + +class LockEvent(LogEvent): + + def __init__(self, logfile, lineno, timestamp, lockname): + LogEvent.__init__(self, logfile, lineno, timestamp) + self.lockname = lockname + + def __str__(self): + return 'Lock(%s)' % self.lockname + + +class UnlockEvent(LockEvent): + + def __str__(self): + return 'Unlock(%s)' % self.lockname + + +class WriteFileEvent(LogEvent): + + def __init__(self, logfile, lineno, timestamp, filename): + LogEvent.__init__(self, logfile, lineno, timestamp) + self.filename = filename + + def __str__(self): + return 'WriteFile(%s)' % self.filename + + +class OverwriteFileEvent(WriteFileEvent): + + def __str__(self): + return 'OverwriteFile(%s)' % self.filename + + +class NodeCreateEvent(LogEvent): + + def __init__(self, logfile, lineno, timestamp, node_id): + LogEvent.__init__(self, logfile, lineno, timestamp) + self.node_id = node_id + + def __str__(self): + return 'NodeCreate(%s)' % self.node_id + + +class NodeDestroyEvent(NodeCreateEvent): + + def __str__(self): + return 'NodeDestroy(%s)' % self.node_id + + +class NodeReadEvent(NodeCreateEvent): + + def __str__(self): + return 'NodeOpen(%s)' % self.node_id + + +class RenameEvent(LogEvent): + + def __init__(self, logfile, lineno, timestamp, old, new): + LogEvent.__init__(self, logfile, lineno, timestamp) + self.old = old + self.new = new + + def __str__(self): + return 'Rename(%s -> %s)' % (self.old, self.new) + + +class CheckLocks(cliapp.Application): + + def setup(self): + self.events = [] + self.errors = 0 + self.latest_opened_node = None + + self.patterns = [ + (lock_pat, self.lock_event), + (unlock_pat, self.unlock_event), + (writefile_pat, self.writefile_event), + (overwritefile_pat, self.overwritefile_event), + (node_open_pat, self.read_node_event), + (node_remove_pat, self.node_remove_event), + (rename_pat, self.rename_event), + ] + + self.ts = ttystatus.TerminalStatus() + self.ts.format( + 'Reading %ElapsedTime() %Integer(lines): %Pathname(filename)') + self.ts['lines'] = 0 + + def cleanup(self): + self.ts.clear() + + self.analyse_phase_1() + + self.ts.finish() + if self.errors: + raise cliapp.AppException('There were %d errors' % self.errors) + + def error(self, msg): + logging.error(msg) + self.ts.error(msg) + self.errors += 1 + + def analyse_phase_1(self): + self.events.sort(key=lambda e: e.sortkey()) + self.events = self.create_node_events(self.events) + self.ts.format('Phase 1: %Index(event,events)') + self.ts['events'] = self.events + self.ts.flush() + + current_locks = set() + current_nodes = set() + + for e in self.events: + self.ts['event'] = e + logging.debug( + 'analysing: %s:%s: %s: %s' % + (e.logfile, e.lineno, repr(e.sortkey()), str(e))) + if type(e) is LockEvent: + if e.lockname in current_locks: + self.error( + 'Re-locking %s: %s:%s:%s' % + (e.lockname, e.logfile, e.lineno, + e.timestamp)) + else: + current_locks.add(e.lockname) + elif type(e) is UnlockEvent: + if e.lockname not in current_locks: + self.error( + 'Unlocking %s which was not locked: %s:%s:%s' % + (e.lockname, e.logfile, e.lineno, + e.timestamp)) + else: + current_locks.remove(e.lockname) + elif type(e) in (WriteFileEvent, OverwriteFileEvent): + lockname = self.determine_lockfile(e.filename) + if lockname and lockname not in current_locks: + self.error( + '%s:%s: ' + 'Write to file %s despite lock %s not existing' % + (e.logfile, e.lineno, e.filename, lockname)) + elif type(e) is NodeCreateEvent: + if e.node_id in current_nodes: + self.error( + '%s:%s: Node %s already exists' % + (e.logfile, e.lineno, e.node_id)) + else: + current_nodes.add(e.node_id) + elif type(e) is NodeDestroyEvent: + if e.node_id not in current_nodes: + self.error( + '%s:%s: Node %s does not exist' % + (e.logfile, e.lineno, e.node_id)) + else: + current_nodes.remove(e.node_id) + elif type(e) is NodeReadEvent: + if e.node_id not in current_nodes: + self.error( + '%s:%s: Node %s does not exist' % + (e.logfile, e.lineno, e.node_id)) + elif type(e) is RenameEvent: + if e.old in current_nodes: + current_nodes.remove(e.old) + current_nodes.add(e.new) + else: + raise NotImplementedError() + + def create_node_events(self, events): + new = [] + for e in events: + new.append(e) + if type(e) in (WriteFileEvent, OverwriteFileEvent): + if '/nodes/' in e.filename: + new_e = NodeCreateEvent( + e.logfile, e.lineno, e.timestamp, e.filename) + new_e.timestamp = e.timestamp + new.append(new_e) + return new + + def determine_lockfile(self, filename): + if filename.endswith('/lock'): + return None + toplevel = filename.split('/')[0] + if toplevel == 'chunks': + return None + if toplevel in ('metadata', 'clientlist'): + return './lock' + return toplevel + '/lock' + + def process_input(self, name): + self.ts['filename'] = name + return cliapp.Application.process_input(self, name) + + def process_input_line(self, filename, line): + self.ts['lines'] = self.global_lineno + for pat, func in self.patterns: + m = pat.search(line) + if m: + event = func(filename, line, m) + if event is not None: + self.events.append(event) + + def lock_event(self, filename, line, match): + return LockEvent( + filename, self.lineno, float(match.group('timestamp')), + match.group('lock')) + + def unlock_event(self, filename, line, match): + return UnlockEvent( + filename, self.lineno, float(match.group('timestamp')), + match.group('lock')) + + def writefile_event(self, filename, line, match): + return WriteFileEvent( + filename, self.lineno, float(match.group('timestamp')), + match.group('filename')) + + def overwritefile_event(self, filename, line, match): + return OverwriteFileEvent( + filename, self.lineno, float(match.group('timestamp')), + match.group('filename')) + + def read_node_event(self, filename, line, match): + node_id = match.group('nodeid') + if not os.path.basename(node_id).startswith('tmp'): + return NodeReadEvent( + filename, self.lineno, float(match.group('timestamp')), + node_id) + + def node_remove_event(self, filename, line, match): + return NodeDestroyEvent( + filename, self.lineno, float(match.group('timestamp')), + match.group('nodeid')) + + def rename_event(self, filename, line, match): + return RenameEvent( + filename, self.lineno, float(match.group('timestamp')), + match.group('old'), match.group('new')) + + +CheckLocks().run() + diff --git a/lock-and-increment b/lock-and-increment index d7b945c1..18935762 100755 --- a/lock-and-increment +++ b/lock-and-increment @@ -26,7 +26,7 @@ count_to = int(sys.argv[2]) filename = os.path.join(dirname, 'counter') fs = obnamlib.LocalFS('/') -lm = obnamlib.LockManager(fs, 60) +lm = obnamlib.LockManager(fs, 60, 'lock-and-increment') for i in range(count_to): lm.lock([dirname]) diff --git a/obnamlib/plugins/backup_plugin.py b/obnamlib/plugins/backup_plugin.py index 9907cc3d..43973fa0 100644 --- a/obnamlib/plugins/backup_plugin.py +++ b/obnamlib/plugins/backup_plugin.py @@ -27,6 +27,7 @@ import tracing import ttystatus import obnamlib +import larch class ChunkidPool(object): @@ -610,8 +611,16 @@ class BackupPlugin(obnamlib.ObnamPlugin): '''Back up a chunk of data by putting it into the repository.''' def find(): - return (self.repo.find_chunks(checksum) + - self.chunkid_pool.get(checksum)) + # We ignore lookup errors here intentionally. We're reading + # the checksum trees without a lock, so another Obnam may be + # modifying them, which can lead to spurious NodeMissing + # exceptions, and other errors. We don't care: we'll just + # pretend no chunk with the checksum exists yet. + try: + in_tree = self.repo.find_chunks(checksum) + except larch.Error: + in_tree = [] + return in_tree + self.chunkid_pool.get(checksum) def get(chunkid): return self.repo.get_chunk(chunkid) diff --git a/obnamlib/vfs_local.py b/obnamlib/vfs_local.py index 9b456f9f..bf3250a5 100644 --- a/obnamlib/vfs_local.py +++ b/obnamlib/vfs_local.py @@ -23,6 +23,7 @@ import math import os import pwd import tempfile +import time import tracing import obnamlib @@ -56,6 +57,10 @@ class LocalFS(obnamlib.VirtualFileSystem): tracing.trace('create=%s', create) obnamlib.VirtualFileSystem.__init__(self, baseurl) self.reinit(baseurl, create=create) + + # For checking that we do not unlock something we didn't lock + # ourselves. + self.our_locks = set() # For testing purposes, allow setting a limit on write operations # after which an exception gets raised. If set to None, no crash. @@ -101,7 +106,7 @@ class LocalFS(obnamlib.VirtualFileSystem): self.cwd = newcwd def lock(self, lockname, data): - tracing.trace('lockname=%s', lockname) + tracing.trace('attempting lockname=%s', lockname) try: self.write_file(lockname, data) except OSError, e: @@ -109,11 +114,16 @@ class LocalFS(obnamlib.VirtualFileSystem): raise obnamlib.LockFail("Lock %s already exists" % lockname) else: raise # pragma: no cover + tracing.trace('got lockname=%s', lockname) + tracing.trace('time=%f' % time.time()) + self.our_locks.add(lockname) def unlock(self, lockname): tracing.trace('lockname=%s', lockname) - if self.exists(lockname): - self.remove(lockname) + assert lockname in self.our_locks + self.remove(lockname) + self.our_locks.remove(lockname) + tracing.trace('time=%f' % time.time()) def join(self, pathname): return os.path.join(self.cwd, pathname) @@ -253,6 +263,7 @@ class LocalFS(obnamlib.VirtualFileSystem): self.maybe_crash() def cat(self, pathname): + tracing.trace('pathname=%s' % pathname) pathname = self.join(pathname) f = self.open(pathname, 'rb') chunks = [] @@ -291,6 +302,7 @@ class LocalFS(obnamlib.VirtualFileSystem): pass else: os.remove(tempname) + tracing.trace('link+remove worked') return # Nope, didn't work. Now try with O_EXCL instead. @@ -302,6 +314,7 @@ class LocalFS(obnamlib.VirtualFileSystem): # Give up. os.remove(tempname) raise + tracing.trace('O_EXCL+rename worked') self.maybe_crash() diff --git a/test-many-generations b/test-many-generations index d1acefc4..7a6476c0 100755 --- a/test-many-generations +++ b/test-many-generations @@ -41,7 +41,7 @@ cat <<EOF > "$conf" client-name = $client quiet = yes log = $client.log -trace = obnamlib +trace = obnamlib, larch repository = $repourl root = $root EOF |