From 0fc9df23884c5cacb6be2f9866bd5c8a70f8f5e1 Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Thu, 31 Oct 2013 20:03:39 +0000 Subject: Fix KeyError for missing refcount We would sometimes, rarely, have a crash that looked like this: File "..refcountstore.py", line 76, in get_refcount return self.refcounts[node_id] KeyError: 32970 In other words, the object keeping track of reference counts for B-tree nodes would think it doesn't have the reference count for a node, when it should have it. After much debugging, by myself and Itamar Turner-Trauring, and greatly helped by a repeatable test case provided by Rob Kendrick, this was tracked to an optimisation in the set_refcount method, which would remove (to save memory) the refcount for a node (remove the entry for a node's id in the refcount dict), when the refcount was set to zero. Unfortunately this conflicted with an assumption in get_refcount that a refcount for a node that was in the dirty set was actually present in the refcount dict. The bug fix is easy: don't remove the node from the refcount dict, and eat the (very minor) memory consumption hit, in the name of correctness. --- larch/refcountstore.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/larch/refcountstore.py b/larch/refcountstore.py index 0aea53e..18fb9e7 100644 --- a/larch/refcountstore.py +++ b/larch/refcountstore.py @@ -78,11 +78,7 @@ class RefcountStore(object): def set_refcount(self, node_id, refcount): '''Set the reference count for a given node.''' tracing.trace('setting refcoutn for %s to %s' % (node_id, refcount)) - if refcount == 0: - if node_id in self.refcounts: - del self.refcounts[node_id] - else: - self.refcounts[node_id] = refcount + self.refcounts[node_id] = refcount self.dirty.add(node_id) def save_refcounts(self): -- cgit v1.2.1 From cf107f03af2038eb16dc4d30f9a8273f6560fd10 Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Wed, 6 Nov 2013 20:05:35 +0000 Subject: Delete wrong tests Since we no longer treat a refcount of 0 specially, these tests no longer make any sense. --- larch/refcountstore_tests.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/larch/refcountstore_tests.py b/larch/refcountstore_tests.py index f4f4ed1..f18e29b 100644 --- a/larch/refcountstore_tests.py +++ b/larch/refcountstore_tests.py @@ -68,17 +68,6 @@ class RefcountStoreTests(unittest.TestCase): self.rs.set_refcount(123, 1) self.assertEqual(self.rs.get_refcount(123), 1) - def test_does_not_set_refcount_if_zero(self): - self.rs.set_refcount(123, 0) - self.assertFalse(123 in self.rs.refcounts) - self.assertEqual(self.rs.get_refcount(123), 0) - - def test_removes_refcount_that_drops_to_zero(self): - self.rs.set_refcount(123, 1) - self.rs.set_refcount(123, 0) - self.assertFalse(123 in self.rs.refcounts) - self.assertEqual(self.rs.get_refcount(123), 0) - def test_updates_refcount(self): self.rs.set_refcount(123, 1) self.rs.set_refcount(123, 2) -- cgit v1.2.1 From fc327727c2d635fd7f15b81e8caccd6df6fe1ca0 Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sun, 3 Nov 2013 16:54:14 +0000 Subject: Don't save refcount groups that haven't changed This bug was exposed by a test data set provided by Rob Kendrick. Saving reference count groups would sometimes overwrite a correct saved group with one that had all counts set to zero. This would happen when the "dirty set" (node ids whose refcounts had been changed but not yet saved) contained, for example, node ids for the Nth and N+2th group, but not the N+1th group between them. The old save_refcounts logic would blithely save that group anyway, and if the in-memory reference counts happened to not be in the refcount dict, it would save zeroes instead. To fix this, save_refcounts now only saves groups that have any dirty refcounts, and skips saving a refcount group that is all clean. To do this efficiently, we need to change the encode_refcounts function signature, to get the set of keys it is to actually put into the group. This set is now computed by save_refcounts. This meant that all call sites for encode_refcounts need to be fixed as well. Luckily the fix is easy: there's only one production use of it, the rest is tests or benchmarks. --- larch/refcountstore.py | 21 ++++++++++++--------- larch/refcountstore_tests.py | 4 ++-- refcount-speed | 14 ++++++++------ 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/larch/refcountstore.py b/larch/refcountstore.py index 18fb9e7..6a9411f 100644 --- a/larch/refcountstore.py +++ b/larch/refcountstore.py @@ -24,12 +24,10 @@ import tracing import larch -def encode_refcounts(refcounts, start_id, how_many): +def encode_refcounts(refcounts, start_id, how_many, keys): fmt = '!QH' + 'H' * how_many args = [start_id, how_many] + ([0] * how_many) - keys = set(refcounts.keys()) - wanted = set(range(start_id, start_id + how_many)) - for key in wanted.intersection(keys): + for key in keys: args[2 + key - start_id] = refcounts[key] return struct.pack(fmt, *args) @@ -91,13 +89,19 @@ class RefcountStore(object): if not self.node_store.journal.exists(dirname): self.node_store.journal.makedirs(dirname) ids = sorted(self.dirty) + all_ids_in_memory = set(self.refcounts.keys()) for start_id in range(self._start_id(ids[0]), self._start_id(ids[-1]) + 1, self.per_group): - encoded = encode_refcounts(self.refcounts, start_id, - self.per_group) - filename = self._group_filename(start_id) - self.node_store.journal.overwrite_file(filename, encoded) + + all_ids_in_group = set( + range(start_id, start_id + self.per_group)) + keys = all_ids_in_group.intersection(all_ids_in_memory) + if keys: + encoded = encode_refcounts( + self.refcounts, start_id, self.per_group, keys) + filename = self._group_filename(start_id) + self.node_store.journal.overwrite_file(filename, encoded) # We re-initialize these so that they don't grow indefinitely. self.refcounts = dict() @@ -115,4 +119,3 @@ class RefcountStore(object): def _start_id(self, node_id): return (node_id / self.per_group) * self.per_group - diff --git a/larch/refcountstore_tests.py b/larch/refcountstore_tests.py index f18e29b..c38f120 100644 --- a/larch/refcountstore_tests.py +++ b/larch/refcountstore_tests.py @@ -91,8 +91,8 @@ class RefcountStoreTests(unittest.TestCase): refs = range(2048) for ref in refs: self.rs.set_refcount(ref, ref) - encoded = larch.refcountstore.encode_refcounts(self.rs.refcounts, - 0, 1024) + encoded = larch.refcountstore.encode_refcounts( + self.rs.refcounts, 0, 1024, range(1024)) decoded = larch.refcountstore.decode_refcounts(encoded) self.assertEqual(decoded, [(x, x) for x in refs[:1024]]) diff --git a/refcount-speed b/refcount-speed index eb489ed..5cbe797 100755 --- a/refcount-speed +++ b/refcount-speed @@ -72,12 +72,14 @@ class RefcountSpeedTest(cliapp.Application): # Calibrate. looptime = self.measure(nop, 'calibrate') - encode = self.measure(lambda: - larch.refcountstore.encode_refcounts(refcounts, - 0, len(refcounts)), - 'encode') - encoded = larch.refcountstore.encode_refcounts(refcounts, 0, - len(refcounts)) + num_refcounts = len(refcounts) + keys = refcounts.keys() + encode = self.measure( + lambda: larch.refcountstore.encode_refcounts( + refcounts, 0, num_refcounts, keys), + 'encode') + encoded = larch.refcountstore.encode_refcounts( + refcounts, 0, num_refcounts, keys) decode = self.measure(lambda: larch.refcountstore.decode_refcounts(encoded), 'decode') -- cgit v1.2.1 From 1d31b862b1f0fc6732d0c2811b7775025039db89 Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Wed, 6 Nov 2013 20:34:19 +0000 Subject: Update NEWS about bug fixes --- NEWS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/NEWS b/NEWS index a1ffb28..f2e8276 100644 --- a/NEWS +++ b/NEWS @@ -7,6 +7,15 @@ copy-on-write B-tree, designed by Ohad Rodeh. Version 1.UNRELEASED ------------------ +* Serious bug fixed: the "KeyError" crash for reference counts. This + was false memory use optimisation, which triggered a rare bug in + related code. Repeatable test case by Rob Kendrick, and helpful + analysis by Itamar Turing-Trauring. + +* Serious bug fixed: another "node missing" bug. This crash was + caused by a bug that overwrote on-disk reference count groups + with zeroes. Repeatable test case by Rob Kendrick. + * Fixes to fsck from Antoine Brenner. Version 1.20130808 -- cgit v1.2.1