Don't save refcount groups that haven't changed

This bug was exposed by a test data set provided by Rob Kendrick. Saving reference count groups would sometimes overwrite a correct saved group with one that had all counts set to zero. This would happen when the "dirty set" (node ids whose refcounts had been changed but not yet saved) contained, for example, node ids for the Nth and N+2th group, but not the N+1th group between them. The old save_refcounts logic would blithely save that group anyway, and if the in-memory reference counts happened to not be in the refcount dict, it would save zeroes instead. To fix this, save_refcounts now only saves groups that have any dirty refcounts, and skips saving a refcount group that is all clean. To do this efficiently, we need to change the encode_refcounts function signature, to get the set of keys it is to actually put into the group. This set is now computed by save_refcounts. This meant that all call sites for encode_refcounts need to be fixed as well. Luckily the fix is easy: there's only one production use of it, the rest is tests or benchmarks.
author: Lars Wirzenius <liw@liw.fi> 2013-11-03 16:54:14 +0000
committer: Lars Wirzenius <liw@liw.fi> 2013-11-06 20:28:54 +0000
commit: fc327727c2d635fd7f15b81e8caccd6df6fe1ca0 (patch)
tree: 2a350f94852cb61a6be257d85d6b2665e35381e1
parent: cf107f03af2038eb16dc4d30f9a8273f6560fd10 (diff)
download: larch-fc327727c2d635fd7f15b81e8caccd6df6fe1ca0.tar.gz
3 files changed, 22 insertions, 17 deletions
diff --git a/larch/refcountstore.py b/larch/refcountstore.py
index 18fb9e7..6a9411f 100644
--- a/larch/refcountstore.py
+++ b/larch/refcountstore.py
@@ -24,12 +24,10 @@ import tracing
 import larch
 
 
-def encode_refcounts(refcounts, start_id, how_many):
+def encode_refcounts(refcounts, start_id, how_many, keys):
     fmt = '!QH' + 'H' * how_many
     args = [start_id, how_many] + ([0] * how_many)
-    keys = set(refcounts.keys())
-    wanted = set(range(start_id, start_id + how_many))
-    for key in wanted.intersection(keys):
+    for key in keys:
         args[2 + key - start_id] = refcounts[key]
     return struct.pack(fmt, *args)
     
@@ -91,13 +89,19 @@ class RefcountStore(object):
             if not self.node_store.journal.exists(dirname):
                 self.node_store.journal.makedirs(dirname)
             ids = sorted(self.dirty)
+            all_ids_in_memory = set(self.refcounts.keys())
             for start_id in range(self._start_id(ids[0]), 
                                   self._start_id(ids[-1]) + 1, 
                                   self.per_group):
-                encoded = encode_refcounts(self.refcounts, start_id, 
-                                           self.per_group)
-                filename = self._group_filename(start_id)
-                self.node_store.journal.overwrite_file(filename, encoded)
+
+                all_ids_in_group = set(
+                    range(start_id, start_id + self.per_group))
+                keys = all_ids_in_group.intersection(all_ids_in_memory)
+                if keys:
+                    encoded = encode_refcounts(
+                        self.refcounts, start_id, self.per_group, keys)
+                    filename = self._group_filename(start_id)
+                    self.node_store.journal.overwrite_file(filename, encoded)
 
         # We re-initialize these so that they don't grow indefinitely.
         self.refcounts = dict()
@@ -115,4 +119,3 @@ class RefcountStore(object):
 
     def _start_id(self, node_id):
         return (node_id / self.per_group) * self.per_group
-
diff --git a/larch/refcountstore_tests.py b/larch/refcountstore_tests.py
index f18e29b..c38f120 100644
--- a/larch/refcountstore_tests.py
+++ b/larch/refcountstore_tests.py
@@ -91,8 +91,8 @@ class RefcountStoreTests(unittest.TestCase):
         refs = range(2048)
         for ref in refs:
             self.rs.set_refcount(ref, ref)
-        encoded = larch.refcountstore.encode_refcounts(self.rs.refcounts, 
-                                                       0, 1024)
+        encoded = larch.refcountstore.encode_refcounts(
+            self.rs.refcounts, 0, 1024, range(1024))
         decoded = larch.refcountstore.decode_refcounts(encoded)
         self.assertEqual(decoded, [(x, x) for x in refs[:1024]])
 
diff --git a/refcount-speed b/refcount-speed
index eb489ed..5cbe797 100755
--- a/refcount-speed
+++ b/refcount-speed
@@ -72,12 +72,14 @@ class RefcountSpeedTest(cliapp.Application):
         # Calibrate.
         looptime = self.measure(nop, 'calibrate')
         
-        encode = self.measure(lambda:
-                                larch.refcountstore.encode_refcounts(refcounts,
-                                                           0, len(refcounts)),
-                              'encode')
-        encoded = larch.refcountstore.encode_refcounts(refcounts, 0, 
-                                                       len(refcounts))
+        num_refcounts = len(refcounts)
+        keys = refcounts.keys()
+        encode = self.measure(
+            lambda: larch.refcountstore.encode_refcounts(
+                refcounts, 0, num_refcounts, keys),
+            'encode')
+        encoded = larch.refcountstore.encode_refcounts(
+            refcounts, 0, num_refcounts, keys)
         decode = self.measure(lambda: 
                                 larch.refcountstore.decode_refcounts(encoded),
                               'decode')
author	Lars Wirzenius <liw@liw.fi>	2013-11-03 16:54:14 +0000
committer	Lars Wirzenius <liw@liw.fi>	2013-11-06 20:28:54 +0000
commit	fc327727c2d635fd7f15b81e8caccd6df6fe1ca0 (patch)
tree	2a350f94852cb61a6be257d85d6b2665e35381e1
parent	cf107f03af2038eb16dc4d30f9a8273f6560fd10 (diff)
download	larch-fc327727c2d635fd7f15b81e8caccd6df6fe1ca0.tar.gz