diff options
author | Lars Wirzenius <liw@liw.fi> | 2013-11-03 16:54:14 +0000 |
---|---|---|
committer | Lars Wirzenius <liw@liw.fi> | 2013-11-06 20:28:54 +0000 |
commit | fc327727c2d635fd7f15b81e8caccd6df6fe1ca0 (patch) | |
tree | 2a350f94852cb61a6be257d85d6b2665e35381e1 | |
parent | cf107f03af2038eb16dc4d30f9a8273f6560fd10 (diff) | |
download | larch-fc327727c2d635fd7f15b81e8caccd6df6fe1ca0.tar.gz |
Don't save refcount groups that haven't changed
This bug was exposed by a test data set provided by Rob Kendrick.
Saving reference count groups would sometimes overwrite a correct
saved group with one that had all counts set to zero. This would
happen when the "dirty set" (node ids whose refcounts had been changed
but not yet saved) contained, for example, node ids for the Nth and
N+2th group, but not the N+1th group between them. The old
save_refcounts logic would blithely save that group anyway, and if the
in-memory reference counts happened to not be in the refcount dict, it
would save zeroes instead.
To fix this, save_refcounts now only saves groups that have any dirty
refcounts, and skips saving a refcount group that is all clean.
To do this efficiently, we need to change the encode_refcounts
function signature, to get the set of keys it is to actually put into
the group. This set is now computed by save_refcounts.
This meant that all call sites for encode_refcounts need to be fixed
as well. Luckily the fix is easy: there's only one production use of
it, the rest is tests or benchmarks.
-rw-r--r-- | larch/refcountstore.py | 21 | ||||
-rw-r--r-- | larch/refcountstore_tests.py | 4 | ||||
-rwxr-xr-x | refcount-speed | 14 |
3 files changed, 22 insertions, 17 deletions
diff --git a/larch/refcountstore.py b/larch/refcountstore.py index 18fb9e7..6a9411f 100644 --- a/larch/refcountstore.py +++ b/larch/refcountstore.py @@ -24,12 +24,10 @@ import tracing import larch -def encode_refcounts(refcounts, start_id, how_many): +def encode_refcounts(refcounts, start_id, how_many, keys): fmt = '!QH' + 'H' * how_many args = [start_id, how_many] + ([0] * how_many) - keys = set(refcounts.keys()) - wanted = set(range(start_id, start_id + how_many)) - for key in wanted.intersection(keys): + for key in keys: args[2 + key - start_id] = refcounts[key] return struct.pack(fmt, *args) @@ -91,13 +89,19 @@ class RefcountStore(object): if not self.node_store.journal.exists(dirname): self.node_store.journal.makedirs(dirname) ids = sorted(self.dirty) + all_ids_in_memory = set(self.refcounts.keys()) for start_id in range(self._start_id(ids[0]), self._start_id(ids[-1]) + 1, self.per_group): - encoded = encode_refcounts(self.refcounts, start_id, - self.per_group) - filename = self._group_filename(start_id) - self.node_store.journal.overwrite_file(filename, encoded) + + all_ids_in_group = set( + range(start_id, start_id + self.per_group)) + keys = all_ids_in_group.intersection(all_ids_in_memory) + if keys: + encoded = encode_refcounts( + self.refcounts, start_id, self.per_group, keys) + filename = self._group_filename(start_id) + self.node_store.journal.overwrite_file(filename, encoded) # We re-initialize these so that they don't grow indefinitely. self.refcounts = dict() @@ -115,4 +119,3 @@ class RefcountStore(object): def _start_id(self, node_id): return (node_id / self.per_group) * self.per_group - diff --git a/larch/refcountstore_tests.py b/larch/refcountstore_tests.py index f18e29b..c38f120 100644 --- a/larch/refcountstore_tests.py +++ b/larch/refcountstore_tests.py @@ -91,8 +91,8 @@ class RefcountStoreTests(unittest.TestCase): refs = range(2048) for ref in refs: self.rs.set_refcount(ref, ref) - encoded = larch.refcountstore.encode_refcounts(self.rs.refcounts, - 0, 1024) + encoded = larch.refcountstore.encode_refcounts( + self.rs.refcounts, 0, 1024, range(1024)) decoded = larch.refcountstore.decode_refcounts(encoded) self.assertEqual(decoded, [(x, x) for x in refs[:1024]]) diff --git a/refcount-speed b/refcount-speed index eb489ed..5cbe797 100755 --- a/refcount-speed +++ b/refcount-speed @@ -72,12 +72,14 @@ class RefcountSpeedTest(cliapp.Application): # Calibrate. looptime = self.measure(nop, 'calibrate') - encode = self.measure(lambda: - larch.refcountstore.encode_refcounts(refcounts, - 0, len(refcounts)), - 'encode') - encoded = larch.refcountstore.encode_refcounts(refcounts, 0, - len(refcounts)) + num_refcounts = len(refcounts) + keys = refcounts.keys() + encode = self.measure( + lambda: larch.refcountstore.encode_refcounts( + refcounts, 0, num_refcounts, keys), + 'encode') + encoded = larch.refcountstore.encode_refcounts( + refcounts, 0, num_refcounts, keys) decode = self.measure(lambda: larch.refcountstore.decode_refcounts(encoded), 'decode') |