summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLars Wirzenius <liw@liw.fi>2010-04-17 08:48:14 +1200
committerLars Wirzenius <liw@liw.fi>2010-04-17 08:48:14 +1200
commit99c264ba3f1aeccd039211cf6d9c8fd0d633a45a (patch)
tree0ed19fb8bffd90191c793535b3f9267876e87fa0
parent0bc1e2655caeca0b479f4a32bf3ebd56d1119e60 (diff)
downloadlarch-99c264ba3f1aeccd039211cf6d9c8fd0d633a45a.tar.gz
Split things into separate modules in the package.
-rw-r--r--Makefile4
-rw-r--r--btree/__init__.py642
-rw-r--r--btree/codec.py107
-rw-r--r--btree/codec_tests.py22
-rw-r--r--btree/nodes.py68
-rw-r--r--btree/nodes_tests.py49
-rw-r--r--btree/nodestore.py168
-rw-r--r--btree/tree.py297
-rw-r--r--btree/tree_tests.py (renamed from btree/__init___tests.py)63
-rw-r--r--without-tests2
10 files changed, 722 insertions, 700 deletions
diff --git a/Makefile b/Makefile
index 1f5e024..6aa8e08 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
all:
check: all
- python -m CoverageTestRunner
+ python -m CoverageTestRunner --ignore-missing-from=without-tests
rm .coverage
clean:
- rm -f .coverage *.pyc *.pyo
+ rm -f .coverage *.py[co] btree/*.py[co]
diff --git a/btree/__init__.py b/btree/__init__.py
index 4ebf088..5697f9e 100644
--- a/btree/__init__.py
+++ b/btree/__init__.py
@@ -1,636 +1,8 @@
-import struct
+from nodes import LeafNode, IndexNode
+from codec import NodeCodec
+from tree import BTree, KeySizeMismatch
+from nodestore import (NodeStore, NodeStoreTests, NodeMissing, NodeTooBig,
+ NodeExists)
+from nodestore_disk import NodeStoreDisk
+from nodestore_memory import NodeStoreMemory
-
-'''A simple B-tree implementation.
-
-Some notes:
-
-* No nodes are modified, everything is done copy-on-write. This is because
- eventually this code will be used to handle on-disk data structures where
- copy-on-write is essential.
-* The fullness of leaf and index nodes is determined by number of keys.
- This is appropriate for now, but eventually we will want to inspect the
- size in bytes of the nodes instead. This is also for on-disk data
- structures, where fixed-sized disk sectors or such are used to store
- the nodes.
-
-'''
-
-
-class Node(dict):
-
- '''Abstract base class for index and leaf nodes.
-
- A node may be initialized with a list of (key, value) pairs. For
- leaf nodes, the values are the actual values. For index nodes, they
- are references to other nodes.
-
- '''
-
- def __init__(self, node_id, pairs=None):
- dict.__init__(self, pairs or [])
- self.id = node_id
-
- def keys(self):
- '''Return keys in the node, sorted.'''
- return sorted(dict.keys(self))
-
- def first_key(self):
- '''Return smallest key in the node.'''
- return self.keys()[0]
-
- def pairs(self, exclude=None):
- '''Return (key, value) pairs in the node.
-
- ``exclude`` can be set to a list of keys that should be excluded
- from the list.
-
- '''
-
- if exclude is None:
- exclude = []
- return sorted((key, self[key]) for key in self if key not in exclude)
-
-
-class LeafNode(Node):
-
- '''Leaf node in the tree.
-
- A leaf node contains key/value pairs, and has no children.
-
- '''
-
- pass
-
-
-class IndexNode(Node):
-
- '''Index node in the tree.
-
- An index node contains pairs of keys and references to other nodes.
- The other nodes may be either index nodes or leaf nodes.
-
- '''
-
- def __init__(self, node_id, pairs):
- for key, child in pairs:
- assert type(key) == str
- assert type(child) == int
- Node.__init__(self, node_id, pairs)
-
- def find_key_for_child_containing(self, key):
- '''Return key for the child that contains ``key``.'''
- for k in reversed(self.keys()):
- if key >= k:
- return k
- return None
-
-
-class NodeCodec(object):
-
- '''Encode and decode nodes from their binary format.
-
- Node identifiers are assumed to fit into 64 bits.
-
- Leaf node values are assumed to fit into 65535 bytes.
-
- '''
-
- def __init__(self, key_bytes):
- self.key_bytes = key_bytes
- self.index_header_format = '!cQ'
- self.index_format = '!%dsQ' % key_bytes
- self.leaf_header_format = '!cQ'
- self.leaf_format = '!%dsH%%ds' % key_bytes
-
- self.id_size = struct.calcsize('!Q')
- self.index_header_size = struct.calcsize(self.index_header_format)
- self.index_pair_size = struct.calcsize(self.index_format)
- self.leaf_header_size = struct.calcsize(self.leaf_header_format)
-
- def leaf_size(self, pairs):
- '''Return size of a leaf node with the given pairs.'''
- return (self.leaf_header_size +
- sum(struct.calcsize(self.leaf_format % len(value))
- for key, value in pairs))
-
- def encode_leaf(self, node):
- '''Encode a leaf node as a byte string.'''
-
- parts = [struct.pack(self.leaf_header_format, 'L', node.id)]
- for key, value in node.iteritems():
- parts.append(self.format_leaf_pair(key, value))
- return ''.join(parts)
-
- def encode_index(self, node):
- '''Encode an index node as a byte string.'''
-
- parts = [struct.pack(self.index_header_format, 'I', node.id)]
- for key, child_id in node.iteritems():
- parts.append(self.format_index_pair(key, child_id))
- return ''.join(parts)
-
- def encode(self, node):
- if isinstance(node, LeafNode):
- return self.encode_leaf(node)
- else:
- return self.encode_index(node)
-
- def format_index_pair(self, key, child_id):
- return struct.pack(self.index_format, key, child_id)
-
- def format_leaf_pair(self, key, value):
- return struct.pack(self.leaf_format % len(value),
- key, len(value), value)
-
- def decode_id(self, encoded):
- '''Return leading node identifier.'''
- assert len(encoded) >= self.id_size
- (node_id,) = struct.unpack('!Q', encoded[:self.id_size])
- return node_id, encoded[self.id_size:]
-
- def decode_leaf(self, encoded):
- '''Decode a leaf node from its encoded byte string.'''
-
- assert encoded.startswith('L')
- node_id, rest = self.decode_id(encoded[1:])
-
- pairs = []
- while rest:
- n = struct.calcsize(self.leaf_format % 0)
- (key, value_size, value) = struct.unpack(self.leaf_format % 0 ,
- rest[:n])
- n += value_size
- s, rest = rest[:n], rest[n:]
- (key, value_size, value) = \
- struct.unpack(self.leaf_format % value_size, s)
- pairs.append((key, value))
-
- return LeafNode(node_id, pairs)
-
- def decode_index(self, encoded):
- '''Decode an index node from its encoded byte string.'''
-
- assert encoded.startswith('I')
- node_id, rest = self.decode_id(encoded[1:])
-
- pairs = []
- pair_size = struct.calcsize(self.index_format)
- while rest:
- s, rest = rest[:pair_size], rest[pair_size:]
- pairs.append(struct.unpack(self.index_format, s))
-
- return IndexNode(node_id, pairs)
-
- def decode(self, encoded):
- if encoded.startswith('L'):
- return self.decode_leaf(encoded)
- else:
- return self.decode_index(encoded)
-
-
-class KeySizeMismatch(Exception):
-
- def __init__(self, key, wanted_size):
- self.key = key
- self.wanted_size = wanted_size
-
- def __str__(self):
- return 'Key %s is of wrong length (%d, should be %d)' % \
- (repr(self.key), len(self.key), self.wanted_size)
-
-
-class BTree(object):
-
- '''B-tree.
-
- The nodes are stored in an external node store; see the NodeStore
- class. Key sizes are fixed, and given in bytes.
-
- The tree is balanced.
-
- Three basic operations are available to the tree: lookup, insert, and
- remove.
-
- '''
-
- def __init__(self, node_store, key_bytes):
- self.node_store = node_store
- self.codec = NodeCodec(key_bytes)
-
- max_pairs = ((self.node_store.node_size - self.codec.index_header_size)
- / self.codec.index_pair_size)
- self.min_index_length = max_pairs / 2
- self.max_index_length = max_pairs
-
- self.last_id = 0
- if not self.read_metadata():
- self.new_root([])
-
- def read_metadata(self):
- blob = self.node_store.get_metadata()
- if blob:
- (self.last_id,) = struct.unpack('!Q', blob)
- return True
- else:
- return False
-
- def store_metadata(self):
- blob = struct.pack('!Q', self.last_id)
- self.node_store.set_metadata(blob)
-
- def check_key_size(self, key):
- if len(key) != self.codec.key_bytes:
- raise KeySizeMismatch(key, self.codec.key_bytes)
-
- def new_id(self):
- '''Generate a new node identifier.'''
- self.last_id += 1
- return self.last_id
-
- def new_leaf(self, pairs):
- '''Create a new leaf node and keep track of it.'''
- leaf = LeafNode(self.new_id(), pairs)
- self.node_store.put_node(leaf.id, self.codec.encode(leaf))
- self.store_metadata()
- return leaf
-
- def new_index(self, pairs):
- '''Create a new index node and keep track of it.'''
- index = IndexNode(self.new_id(), pairs)
- self.node_store.put_node(index.id, self.codec.encode(index))
- self.store_metadata()
- return index
-
- def new_root(self, pairs):
- '''Create a new root node and keep track of it.'''
- root = IndexNode(0, pairs)
- self.node_store.put_node(root.id, self.codec.encode(root))
- self.store_metadata()
-
- def get_node(self, node_id):
- '''Return node corresponding to a node id.'''
- encoded = self.node_store.get_node(node_id)
- return self.codec.decode(encoded)
-
- @property
- def root(self):
- '''Return the root node.'''
- return self.get_node(0)
-
- def lookup(self, key):
- '''Return value corresponding to ``key``.
-
- If the key is not in the tree, raise ``KeyError``.
-
- '''
-
- self.check_key_size(key)
- return self._lookup(self.root.id, key)
-
- def _lookup(self, node_id, key):
- node = self.get_node(node_id)
- if isinstance(node, LeafNode):
- return node[key]
- else:
- k = node.find_key_for_child_containing(key)
- if k is None:
- raise KeyError(key)
- else:
- return self._lookup(node[k], key)
-
- def insert(self, key, value):
- '''Insert a new key/value pair into the tree.
-
- If the key already existed in the tree, the old value is silently
- forgotten.
-
- '''
-
- self.check_key_size(key)
- a, b = self._insert(self.root.id, key, value)
- if b is None:
- self.new_root(a.pairs())
- else:
- self.new_root([(a.first_key(), a.id), (b.first_key(), b.id)])
-
- def _insert(self, node_id, key, value):
- node = self.get_node(node_id)
- if isinstance(node, LeafNode):
- return self._insert_into_leaf(node_id, key, value)
- elif len(node) == 0:
- return self._insert_into_empty_root(key, value)
- elif len(node) == self.max_index_length:
- return self._insert_into_full_index(node_id, key, value)
- else:
- return self._insert_into_nonfull_index(node_id, key, value)
-
- def _insert_into_leaf(self, leaf_id, key, value):
- leaf = self.get_node(leaf_id)
- pairs = sorted(leaf.pairs(exclude=[key]) + [(key, value)])
- if self.codec.leaf_size(pairs) <= self.node_store.node_size:
- return self.new_leaf(pairs), None
- else:
- n = len(pairs) / 2
- leaf1 = self.new_leaf(pairs[:n])
- leaf2 = self.new_leaf(pairs[n:])
- return leaf1, leaf2
-
- def _insert_into_empty_root(self, key, value):
- leaf = self.new_leaf([(key, value)])
- return self.new_index([(leaf.first_key(), leaf.id)]), None
-
- def _insert_into_full_index(self, node_id, key, value):
- # A full index node needs to be split, then key/value inserted into
- # one of the halves.
- node = self.get_node(node_id)
- pairs = node.pairs()
- n = len(pairs) / 2
- node1 = self.new_index(pairs[:n])
- node2 = self.new_index(pairs[n:])
- if key < node2.first_key():
- a, b = self._insert(node1.id, key, value)
- assert b is None
- return a, node2
- else:
- a, b = self._insert(node2.id, key, value)
- assert b is None
- return node1, a
-
- def _insert_into_nonfull_index(self, node_id, key, value):
- # Insert into correct child, get up to two replacements for
- # that child.
-
- node = self.get_node(node_id)
- k = node.find_key_for_child_containing(key)
- if k is None:
- k = node.first_key()
-
- child = self.get_node(node[k])
- a, b = self._insert(child.id, key, value)
- assert a is not None
- pairs = node.pairs(exclude=[k])
- pairs += [(a.first_key(), a.id)]
- if b is not None:
- pairs += [(b.first_key(), b.id)]
- pairs.sort()
- assert len(pairs) <= self.max_index_length
- return self.new_index(pairs), None
-
- def remove(self, key):
- '''Remove ``key`` and its associated value from tree.
-
- If key is not in the tree, ``KeyValue`` is raised.
-
- '''
-
- self.check_key_size(key)
- a = self._remove(self.root.id, key)
- if a is None:
- self.new_root([])
- else:
- self.new_root(a.pairs())
-
- def _remove(self, node_id, key):
- node = self.get_node(node_id)
- if isinstance(node, LeafNode):
- return self._remove_from_leaf(node_id, key)
- else:
- k = node.find_key_for_child_containing(key)
- if k is None:
- raise KeyError(key)
- elif len(self.get_node(node[k])) <= self.min_index_length:
- return self._remove_from_minimal_index(node_id, key, k)
- else:
- return self._remove_from_nonminimal_index(node_id, key, k)
-
- def _remove_from_leaf(self, node_id, key):
- node = self.get_node(node_id)
- if key in node:
- pairs = node.pairs(exclude=[key])
- if pairs:
- return self.new_leaf(pairs)
- else:
- return None
- else:
- raise KeyError(key)
-
- def _merge(self, id1, id2):
- n1 = self.get_node(id1)
- n2 = self.get_node(id2)
- if isinstance(n1, IndexNode):
- assert isinstance(n2, IndexNode)
- return self.new_index(n1.pairs() + n2.pairs())
- else:
- assert isinstance(n1, LeafNode)
- assert isinstance(n2, LeafNode)
- return self.new_leaf(n1.pairs() + n2.pairs())
-
- def _remove_from_minimal_index(self, node_id, key, child_key):
- node = self.get_node(node_id)
- exclude = [child_key]
- new_ones = []
- child = self._remove(node[child_key], key)
-
- if child is not None:
- keys = node.keys()
- i = keys.index(child_key)
-
- # If possible, merge with left or right sibling.
- if (i > 0 and
- len(self.get_node(node[keys[i-1]])) < self.max_index_length):
- new_ones.append(self._merge(node[keys[i-1]], child.id))
- exclude.append(keys[i-1])
- elif (i+1 < len(keys) and
- len(self.get_node(node[keys[i+1]])) < self.max_index_length):
- new_ones.append(self._merge(node[keys[i+1]], child.id))
- exclude.append(keys[i+1])
- else:
- new_ones.append(child)
-
- others = node.pairs(exclude=exclude)
- if others + new_ones:
- return self.new_index(others +
- [(n.first_key(), n.id) for n in new_ones])
- else:
- return None
-
- def _remove_from_nonminimal_index(self, node_id, key, child_key):
- node = self.get_node(node_id)
- child = self._remove(node[child_key], key)
- pairs = node.pairs(exclude=[child_key])
- if child is not None:
- pairs += [(child.first_key(), child.id)]
- pairs.sort()
- assert pairs
- return self.new_index(pairs)
-
-
-class NodeMissing(Exception): # pragma: no cover
-
- '''A node cannot be found from a NodeStore.'''
-
- def __init__(self, node_id):
- self.node_id = node_id
-
- def __str__(self):
- return 'Node %d cannot be found in the node store' % self.node_id
-
-
-class NodeTooBig(Exception): # pragma: no cover
-
- '''User tried to put a node that was too big into the store.'''
-
- def __init__(self, node_id, node_size):
- self.node_id = node_id
- self.node_size = node_size
-
- def __str__(self):
- return 'Node %d is too big (%d bytes)' % (self.node_id, self.node_size)
-
-
-class NodeExists(Exception): # pragma: no cover
-
- '''User tried to put a node that already exists in the store.'''
-
- def __init__(self, node_id):
- self.node_id = node_id
-
- def __str__(self):
- return 'Node %d is already in the store' % self.node_id
-
-
-class NodeStore(object): # pragma: no cover
-
- '''Abstract base class for storing nodes externally.
-
- The BTree class itself does not handle external storage of nodes.
- Instead, it is given an object that implements the API in this
- class. An actual implementation might keep nodes in memory, or
- store them on disk using a filesystem, or a database.
-
- Node stores deal with nodes as byte strings: the BTree class
- encodes them before handing them to the store, and decodes them
- when it gets them from the store.
-
- Each node has an identifier that is unique within the tree.
- The identifier is an integer, and the BTree makes the following
- guarantees about it:
-
- * it is a non-negative integer
- * new nodes are assigned the next consecutive one
- * it is never re-used
-
- Further, the BTree makes the following guarantees about the encoded
- nodes:
-
- * they have a strict upper size limit
- * the tree attempts to fill nodes as close to the limit as possible
-
- The size limit is given to the node store at initialization time.
- It is accessible via the node_size property. Implementations of
- this API must handle that in some suitable way, preferably by
- inheriting from this class and calling its initializer.
-
- '''
-
- def __init__(self, node_size):
- self.node_size = node_size
-
- def set_metadata(self, blob):
- '''Set metadata as a blob.
-
- The blob must fit into a node.
-
- '''
-
- def get_metadata(self):
- '''Return metadata as a blob.'''
-
- def put_node(self, node_id, encoded_node):
- '''Put a new node into the store.'''
-
- def get_node(self, node_id):
- '''Return a node from the store.
-
- Raise the NodeMissing exception if the node is not in the
- store (has never been, or has been removed). Raise other
- errors as suitable.
-
- '''
-
- def remove_node(self, node_id):
- '''Remove a node from the store.'''
-
- def list_nodes(self):
- '''Return list of ids of all nodes in store.'''
-
-
-class NodeStoreTests(object): # pragma: no cover
-
- '''Re-useable tests for NodeStore implementations.
-
- The NodeStore base class can't be usefully instantiated itself.
- Instead you are supposed to sub-class it and implement the API in
- a suitable way for yourself.
-
- This class implements a number of tests that the API implementation
- must pass. The implementation's own test class should inherit from
- this class, and unittest.TestCase.
-
- The test sub-class should define a setUp method that sets the following:
-
- * self.ns to an instance of the API implementation sub-class
- * self.node_size to the node size
-
- '''
-
- def test_sets_node_size(self):
- self.assertEqual(self.ns.node_size, self.node_size)
-
- def test_has_no_metadata_initially(self):
- self.assertEqual(self.ns.get_metadata(), '')
-
- def test_sets_metadata(self):
- self.ns.set_metadata('foo')
- self.assertEqual(self.ns.get_metadata(), 'foo')
-
- def test_has_no_node_zero_initially(self):
- self.assertRaises(NodeMissing, self.ns.get_node, 0)
-
- def test_lists_no_nodes_initially(self):
- self.assertEqual(self.ns.list_nodes(), [])
-
- def test_puts_and_gets_same(self):
- encoded = 'asdfadsfafd'
- self.ns.put_node(0, encoded)
- self.assertEqual(self.ns.get_node(0), encoded)
-
- def test_removes_node(self):
- encoded = 'asdfafdafd'
- self.ns.put_node(0, encoded)
- self.ns.remove_node(0)
- self.assertRaises(NodeMissing, self.ns.get_node, 0)
- self.assertEqual(self.ns.list_nodes(), [])
-
- def test_lists_node_zero(self):
- encoded = 'adsfafdafd'
- self.ns.put_node(0, encoded)
- self.assertEqual(self.ns.list_nodes(), [0])
-
- def test_put_refuses_too_large_a_node(self):
- self.assertRaises(NodeTooBig, self.ns.put_node, 0,
- 'x' * (self.node_size + 1))
-
- def test_put_refuses_to_overwrite_a_node(self):
- encoded = 'x'
- self.ns.put_node(1, encoded)
- self.assertRaises(NodeExists, self.ns.put_node, 1, encoded)
-
- def test_put_allows_overwrite_of_node_zero(self):
- self.ns.put_node(0, 'foo')
- self.ns.put_node(0, 'bar')
- self.assertEqual(self.ns.get_node(0), 'bar')
-
- def test_remove_raises_nodemissing_if_node_does_not_exist(self):
- self.assertRaises(NodeMissing, self.ns.remove_node, 0)
diff --git a/btree/codec.py b/btree/codec.py
new file mode 100644
index 0000000..01ecaa5
--- /dev/null
+++ b/btree/codec.py
@@ -0,0 +1,107 @@
+import struct
+
+import btree
+
+
+class NodeCodec(object):
+
+ '''Encode and decode nodes from their binary format.
+
+ Node identifiers are assumed to fit into 64 bits.
+
+ Leaf node values are assumed to fit into 65535 bytes.
+
+ '''
+
+ def __init__(self, key_bytes):
+ self.key_bytes = key_bytes
+ self.index_header_format = '!cQ'
+ self.index_format = '!%dsQ' % key_bytes
+ self.leaf_header_format = '!cQ'
+ self.leaf_format = '!%dsH%%ds' % key_bytes
+
+ self.id_size = struct.calcsize('!Q')
+ self.index_header_size = struct.calcsize(self.index_header_format)
+ self.index_pair_size = struct.calcsize(self.index_format)
+ self.leaf_header_size = struct.calcsize(self.leaf_header_format)
+
+ def leaf_size(self, pairs):
+ '''Return size of a leaf node with the given pairs.'''
+ return (self.leaf_header_size +
+ sum(struct.calcsize(self.leaf_format % len(value))
+ for key, value in pairs))
+
+ def encode_leaf(self, node):
+ '''Encode a leaf node as a byte string.'''
+
+ parts = [struct.pack(self.leaf_header_format, 'L', node.id)]
+ for key, value in node.iteritems():
+ parts.append(self.format_leaf_pair(key, value))
+ return ''.join(parts)
+
+ def encode_index(self, node):
+ '''Encode an index node as a byte string.'''
+
+ parts = [struct.pack(self.index_header_format, 'I', node.id)]
+ for key, child_id in node.iteritems():
+ parts.append(self.format_index_pair(key, child_id))
+ return ''.join(parts)
+
+ def encode(self, node):
+ if isinstance(node, btree.LeafNode):
+ return self.encode_leaf(node)
+ else:
+ return self.encode_index(node)
+
+ def format_index_pair(self, key, child_id):
+ return struct.pack(self.index_format, key, child_id)
+
+ def format_leaf_pair(self, key, value):
+ return struct.pack(self.leaf_format % len(value),
+ key, len(value), value)
+
+ def decode_id(self, encoded):
+ '''Return leading node identifier.'''
+ assert len(encoded) >= self.id_size
+ (node_id,) = struct.unpack('!Q', encoded[:self.id_size])
+ return node_id, encoded[self.id_size:]
+
+ def decode_leaf(self, encoded):
+ '''Decode a leaf node from its encoded byte string.'''
+
+ assert encoded.startswith('L')
+ node_id, rest = self.decode_id(encoded[1:])
+
+ pairs = []
+ while rest:
+ n = struct.calcsize(self.leaf_format % 0)
+ (key, value_size, value) = struct.unpack(self.leaf_format % 0 ,
+ rest[:n])
+ n += value_size
+ s, rest = rest[:n], rest[n:]
+ (key, value_size, value) = \
+ struct.unpack(self.leaf_format % value_size, s)
+ pairs.append((key, value))
+
+ return btree.LeafNode(node_id, pairs)
+
+ def decode_index(self, encoded):
+ '''Decode an index node from its encoded byte string.'''
+
+ assert encoded.startswith('I')
+ node_id, rest = self.decode_id(encoded[1:])
+
+ pairs = []
+ pair_size = struct.calcsize(self.index_format)
+ while rest:
+ s, rest = rest[:pair_size], rest[pair_size:]
+ pairs.append(struct.unpack(self.index_format, s))
+
+ return btree.IndexNode(node_id, pairs)
+
+ def decode(self, encoded):
+ if encoded.startswith('L'):
+ return self.decode_leaf(encoded)
+ else:
+ return self.decode_index(encoded)
+
diff --git a/btree/codec_tests.py b/btree/codec_tests.py
new file mode 100644
index 0000000..92b5e15
--- /dev/null
+++ b/btree/codec_tests.py
@@ -0,0 +1,22 @@
+import unittest
+
+import btree
+
+
+class NodeCodecTests(unittest.TestCase):
+
+ def setUp(self):
+ self.leaf = btree.LeafNode(1234, [('foo', 'bar')])
+ self.index = btree.IndexNode(5678,
+ [('bar', 1234),
+ ('foo', 7890)])
+ self.codec = btree.NodeCodec(3)
+
+ def test_leaf_round_trip_ok(self):
+ encoded = self.codec.encode_leaf(self.leaf)
+ self.assertEqual(self.codec.decode_leaf(encoded), self.leaf)
+
+ def test_index_round_trip_ok(self):
+ encoded = self.codec.encode_index(self.index)
+ self.assertEqual(self.codec.decode_index(encoded), self.index)
+
diff --git a/btree/nodes.py b/btree/nodes.py
new file mode 100644
index 0000000..2234e5e
--- /dev/null
+++ b/btree/nodes.py
@@ -0,0 +1,68 @@
+class Node(dict):
+
+ '''Abstract base class for index and leaf nodes.
+
+ A node may be initialized with a list of (key, value) pairs. For
+ leaf nodes, the values are the actual values. For index nodes, they
+ are references to other nodes.
+
+ '''
+
+ def __init__(self, node_id, pairs=None):
+ dict.__init__(self, pairs or [])
+ self.id = node_id
+
+ def keys(self):
+ '''Return keys in the node, sorted.'''
+ return sorted(dict.keys(self))
+
+ def first_key(self):
+ '''Return smallest key in the node.'''
+ return self.keys()[0]
+
+ def pairs(self, exclude=None):
+ '''Return (key, value) pairs in the node.
+
+ ``exclude`` can be set to a list of keys that should be excluded
+ from the list.
+
+ '''
+
+ if exclude is None:
+ exclude = []
+ return sorted((key, self[key]) for key in self if key not in exclude)
+
+
+class LeafNode(Node):
+
+ '''Leaf node in the tree.
+
+ A leaf node contains key/value pairs, and has no children.
+
+ '''
+
+ pass
+
+
+class IndexNode(Node):
+
+ '''Index node in the tree.
+
+ An index node contains pairs of keys and references to other nodes.
+ The other nodes may be either index nodes or leaf nodes.
+
+ '''
+
+ def __init__(self, node_id, pairs):
+ for key, child in pairs:
+ assert type(key) == str
+ assert type(child) == int
+ Node.__init__(self, node_id, pairs)
+
+ def find_key_for_child_containing(self, key):
+ '''Return key for the child that contains ``key``.'''
+ for k in reversed(self.keys()):
+ if key >= k:
+ return k
+ return None
+
diff --git a/btree/nodes_tests.py b/btree/nodes_tests.py
new file mode 100644
index 0000000..6d4d147
--- /dev/null
+++ b/btree/nodes_tests.py
@@ -0,0 +1,49 @@
+import unittest
+
+import btree
+
+
+class LeafNodeTests(unittest.TestCase):
+
+ def setUp(self):
+ self.node_id = 12765
+ self.leaf = btree.LeafNode(self.node_id, [('foo', 'bar')])
+
+ def test_has_id(self):
+ self.assertEqual(self.leaf.id, self.node_id)
+
+ def test_has_keys(self):
+ self.assertEqual(self.leaf.keys(), ['foo'])
+
+ def test_has_value(self):
+ self.assertEqual(self.leaf['foo'], 'bar')
+
+ def test_sorts_keys(self):
+ leaf = btree.LeafNode(0, [('foo', 'foo'), ('bar', 'bar')])
+ self.assertEqual(leaf.keys(), sorted(['foo', 'bar']))
+
+
+class IndexNodeTests(unittest.TestCase):
+
+ def setUp(self):
+ self.leaf1 = btree.LeafNode(0, [('bar', 'bar')])
+ self.leaf2 = btree.LeafNode(1, [('foo', 'foo')])
+ self.index_id = 1234
+ self.index = btree.IndexNode(self.index_id,
+ [('bar', self.leaf1.id),
+ ('foo', self.leaf2.id)])
+
+ def test_has_id(self):
+ self.assertEqual(self.index.id, self.index_id)
+
+ def test_has_keys(self):
+ self.assertEqual(self.index.keys(), ['bar', 'foo'])
+
+ def test_has_children(self):
+ self.assertEqual(sorted(self.index.values()),
+ sorted([self.leaf1.id, self.leaf2.id]))
+
+ def test_has_indexed_children(self):
+ self.assertEqual(self.index['bar'], self.leaf1.id)
+ self.assertEqual(self.index['foo'], self.leaf2.id)
+
diff --git a/btree/nodestore.py b/btree/nodestore.py
new file mode 100644
index 0000000..3f1dac5
--- /dev/null
+++ b/btree/nodestore.py
@@ -0,0 +1,168 @@
+class NodeMissing(Exception): # pragma: no cover
+
+ '''A node cannot be found from a NodeStore.'''
+
+ def __init__(self, node_id):
+ self.node_id = node_id
+
+ def __str__(self):
+ return 'Node %d cannot be found in the node store' % self.node_id
+
+
+class NodeTooBig(Exception): # pragma: no cover
+
+ '''User tried to put a node that was too big into the store.'''
+
+ def __init__(self, node_id, node_size):
+ self.node_id = node_id
+ self.node_size = node_size
+
+ def __str__(self):
+ return 'Node %d is too big (%d bytes)' % (self.node_id, self.node_size)
+
+
+class NodeExists(Exception): # pragma: no cover
+
+ '''User tried to put a node that already exists in the store.'''
+
+ def __init__(self, node_id):
+ self.node_id = node_id
+
+ def __str__(self):
+ return 'Node %d is already in the store' % self.node_id
+
+
+class NodeStore(object): # pragma: no cover
+
+ '''Abstract base class for storing nodes externally.
+
+ The BTree class itself does not handle external storage of nodes.
+ Instead, it is given an object that implements the API in this
+ class. An actual implementation might keep nodes in memory, or
+ store them on disk using a filesystem, or a database.
+
+ Node stores deal with nodes as byte strings: the BTree class
+ encodes them before handing them to the store, and decodes them
+ when it gets them from the store.
+
+ Each node has an identifier that is unique within the tree.
+ The identifier is an integer, and the BTree makes the following
+ guarantees about it:
+
+ * it is a non-negative integer
+ * new nodes are assigned the next consecutive one
+ * it is never re-used
+
+ Further, the BTree makes the following guarantees about the encoded
+ nodes:
+
+ * they have a strict upper size limit
+ * the tree attempts to fill nodes as close to the limit as possible
+
+ The size limit is given to the node store at initialization time.
+ It is accessible via the node_size property. Implementations of
+ this API must handle that in some suitable way, preferably by
+ inheriting from this class and calling its initializer.
+
+ '''
+
+ def __init__(self, node_size):
+ self.node_size = node_size
+
+ def set_metadata(self, blob):
+ '''Set metadata as a blob.
+
+ The blob must fit into a node.
+
+ '''
+
+ def get_metadata(self):
+ '''Return metadata as a blob.'''
+
+ def put_node(self, node_id, encoded_node):
+ '''Put a new node into the store.'''
+
+ def get_node(self, node_id):
+ '''Return a node from the store.
+
+ Raise the NodeMissing exception if the node is not in the
+ store (has never been, or has been removed). Raise other
+ errors as suitable.
+
+ '''
+
+ def remove_node(self, node_id):
+ '''Remove a node from the store.'''
+
+ def list_nodes(self):
+ '''Return list of ids of all nodes in store.'''
+
+
+class NodeStoreTests(object): # pragma: no cover
+
+ '''Re-useable tests for NodeStore implementations.
+
+ The NodeStore base class can't be usefully instantiated itself.
+ Instead you are supposed to sub-class it and implement the API in
+ a suitable way for yourself.
+
+ This class implements a number of tests that the API implementation
+ must pass. The implementation's own test class should inherit from
+ this class, and unittest.TestCase.
+
+ The test sub-class should define a setUp method that sets the following:
+
+ * self.ns to an instance of the API implementation sub-class
+ * self.node_size to the node size
+
+ '''
+
+ def test_sets_node_size(self):
+ self.assertEqual(self.ns.node_size, self.node_size)
+
+ def test_has_no_metadata_initially(self):
+ self.assertEqual(self.ns.get_metadata(), '')
+
+ def test_sets_metadata(self):
+ self.ns.set_metadata('foo')
+ self.assertEqual(self.ns.get_metadata(), 'foo')
+
+ def test_has_no_node_zero_initially(self):
+ self.assertRaises(NodeMissing, self.ns.get_node, 0)
+
+ def test_lists_no_nodes_initially(self):
+ self.assertEqual(self.ns.list_nodes(), [])
+
+ def test_puts_and_gets_same(self):
+ encoded = 'asdfadsfafd'
+ self.ns.put_node(0, encoded)
+ self.assertEqual(self.ns.get_node(0), encoded)
+
+ def test_removes_node(self):
+ encoded = 'asdfafdafd'
+ self.ns.put_node(0, encoded)
+ self.ns.remove_node(0)
+ self.assertRaises(NodeMissing, self.ns.get_node, 0)
+ self.assertEqual(self.ns.list_nodes(), [])
+
+ def test_lists_node_zero(self):
+ encoded = 'adsfafdafd'
+ self.ns.put_node(0, encoded)
+ self.assertEqual(self.ns.list_nodes(), [0])
+
+ def test_put_refuses_too_large_a_node(self):
+ self.assertRaises(NodeTooBig, self.ns.put_node, 0,
+ 'x' * (self.node_size + 1))
+
+ def test_put_refuses_to_overwrite_a_node(self):
+ encoded = 'x'
+ self.ns.put_node(1, encoded)
+ self.assertRaises(NodeExists, self.ns.put_node, 1, encoded)
+
+ def test_put_allows_overwrite_of_node_zero(self):
+ self.ns.put_node(0, 'foo')
+ self.ns.put_node(0, 'bar')
+ self.assertEqual(self.ns.get_node(0), 'bar')
+
+ def test_remove_raises_nodemissing_if_node_does_not_exist(self):
+ self.assertRaises(NodeMissing, self.ns.remove_node, 0)
diff --git a/btree/tree.py b/btree/tree.py
new file mode 100644
index 0000000..75924d6
--- /dev/null
+++ b/btree/tree.py
@@ -0,0 +1,297 @@
+import struct
+
+import btree
+
+
+'''A simple B-tree implementation.
+
+Some notes:
+
+* No nodes are modified, everything is done copy-on-write. This is because
+ eventually this code will be used to handle on-disk data structures where
+ copy-on-write is essential.
+* The fullness of leaf and index nodes is determined by number of keys.
+ This is appropriate for now, but eventually we will want to inspect the
+ size in bytes of the nodes instead. This is also for on-disk data
+ structures, where fixed-sized disk sectors or such are used to store
+ the nodes.
+
+'''
+
+
+class KeySizeMismatch(Exception):
+
+ def __init__(self, key, wanted_size):
+ self.key = key
+ self.wanted_size = wanted_size
+
+ def __str__(self):
+ return 'Key %s is of wrong length (%d, should be %d)' % \
+ (repr(self.key), len(self.key), self.wanted_size)
+
+
+class BTree(object):
+
+ '''B-tree.
+
+ The nodes are stored in an external node store; see the NodeStore
+ class. Key sizes are fixed, and given in bytes.
+
+ The tree is balanced.
+
+ Three basic operations are available to the tree: lookup, insert, and
+ remove.
+
+ '''
+
+ def __init__(self, node_store, key_bytes):
+ self.node_store = node_store
+ self.codec = btree.NodeCodec(key_bytes)
+
+ max_pairs = ((self.node_store.node_size - self.codec.index_header_size)
+ / self.codec.index_pair_size)
+ self.min_index_length = max_pairs / 2
+ self.max_index_length = max_pairs
+
+ self.last_id = 0
+ if not self.read_metadata():
+ self.new_root([])
+
+ def read_metadata(self):
+ blob = self.node_store.get_metadata()
+ if blob:
+ (self.last_id,) = struct.unpack('!Q', blob)
+ return True
+ else:
+ return False
+
+ def store_metadata(self):
+ blob = struct.pack('!Q', self.last_id)
+ self.node_store.set_metadata(blob)
+
+ def check_key_size(self, key):
+ if len(key) != self.codec.key_bytes:
+ raise KeySizeMismatch(key, self.codec.key_bytes)
+
+ def new_id(self):
+ '''Generate a new node identifier.'''
+ self.last_id += 1
+ return self.last_id
+
+ def new_leaf(self, pairs):
+ '''Create a new leaf node and keep track of it.'''
+ leaf = btree.LeafNode(self.new_id(), pairs)
+ self.node_store.put_node(leaf.id, self.codec.encode(leaf))
+ self.store_metadata()
+ return leaf
+
+ def new_index(self, pairs):
+ '''Create a new index node and keep track of it.'''
+ index = btree.IndexNode(self.new_id(), pairs)
+ self.node_store.put_node(index.id, self.codec.encode(index))
+ self.store_metadata()
+ return index
+
+ def new_root(self, pairs):
+ '''Create a new root node and keep track of it.'''
+ root = btree.IndexNode(0, pairs)
+ self.node_store.put_node(root.id, self.codec.encode(root))
+ self.store_metadata()
+
+ def get_node(self, node_id):
+ '''Return node corresponding to a node id.'''
+ encoded = self.node_store.get_node(node_id)
+ return self.codec.decode(encoded)
+
+ @property
+ def root(self):
+ '''Return the root node.'''
+ return self.get_node(0)
+
+ def lookup(self, key):
+ '''Return value corresponding to ``key``.
+
+ If the key is not in the tree, raise ``KeyError``.
+
+ '''
+
+ self.check_key_size(key)
+ return self._lookup(self.root.id, key)
+
+ def _lookup(self, node_id, key):
+ node = self.get_node(node_id)
+ if isinstance(node, btree.LeafNode):
+ return node[key]
+ else:
+ k = node.find_key_for_child_containing(key)
+ if k is None:
+ raise KeyError(key)
+ else:
+ return self._lookup(node[k], key)
+
+ def insert(self, key, value):
+ '''Insert a new key/value pair into the tree.
+
+ If the key already existed in the tree, the old value is silently
+ forgotten.
+
+ '''
+
+ self.check_key_size(key)
+ a, b = self._insert(self.root.id, key, value)
+ if b is None:
+ self.new_root(a.pairs())
+ else:
+ self.new_root([(a.first_key(), a.id), (b.first_key(), b.id)])
+
+ def _insert(self, node_id, key, value):
+ node = self.get_node(node_id)
+ if isinstance(node, btree.LeafNode):
+ return self._insert_into_leaf(node_id, key, value)
+ elif len(node) == 0:
+ return self._insert_into_empty_root(key, value)
+ elif len(node) == self.max_index_length:
+ return self._insert_into_full_index(node_id, key, value)
+ else:
+ return self._insert_into_nonfull_index(node_id, key, value)
+
+ def _insert_into_leaf(self, leaf_id, key, value):
+ leaf = self.get_node(leaf_id)
+ pairs = sorted(leaf.pairs(exclude=[key]) + [(key, value)])
+ if self.codec.leaf_size(pairs) <= self.node_store.node_size:
+ return self.new_leaf(pairs), None
+ else:
+ n = len(pairs) / 2
+ leaf1 = self.new_leaf(pairs[:n])
+ leaf2 = self.new_leaf(pairs[n:])
+ return leaf1, leaf2
+
+ def _insert_into_empty_root(self, key, value):
+ leaf = self.new_leaf([(key, value)])
+ return self.new_index([(leaf.first_key(), leaf.id)]), None
+
+ def _insert_into_full_index(self, node_id, key, value):
+ # A full index node needs to be split, then key/value inserted into
+ # one of the halves.
+ node = self.get_node(node_id)
+ pairs = node.pairs()
+ n = len(pairs) / 2
+ node1 = self.new_index(pairs[:n])
+ node2 = self.new_index(pairs[n:])
+ if key < node2.first_key():
+ a, b = self._insert(node1.id, key, value)
+ assert b is None
+ return a, node2
+ else:
+ a, b = self._insert(node2.id, key, value)
+ assert b is None
+ return node1, a
+
+ def _insert_into_nonfull_index(self, node_id, key, value):
+ # Insert into correct child, get up to two replacements for
+ # that child.
+
+ node = self.get_node(node_id)
+ k = node.find_key_for_child_containing(key)
+ if k is None:
+ k = node.first_key()
+
+ child = self.get_node(node[k])
+ a, b = self._insert(child.id, key, value)
+ assert a is not None
+ pairs = node.pairs(exclude=[k])
+ pairs += [(a.first_key(), a.id)]
+ if b is not None:
+ pairs += [(b.first_key(), b.id)]
+ pairs.sort()
+ assert len(pairs) <= self.max_index_length
+ return self.new_index(pairs), None
+
+ def remove(self, key):
+ '''Remove ``key`` and its associated value from tree.
+
+ If key is not in the tree, ``KeyValue`` is raised.
+
+ '''
+
+ self.check_key_size(key)
+ a = self._remove(self.root.id, key)
+ if a is None:
+ self.new_root([])
+ else:
+ self.new_root(a.pairs())
+
+ def _remove(self, node_id, key):
+ node = self.get_node(node_id)
+ if isinstance(node, btree.LeafNode):
+ return self._remove_from_leaf(node_id, key)
+ else:
+ k = node.find_key_for_child_containing(key)
+ if k is None:
+ raise KeyError(key)
+ elif len(self.get_node(node[k])) <= self.min_index_length:
+ return self._remove_from_minimal_index(node_id, key, k)
+ else:
+ return self._remove_from_nonminimal_index(node_id, key, k)
+
+ def _remove_from_leaf(self, node_id, key):
+ node = self.get_node(node_id)
+ if key in node:
+ pairs = node.pairs(exclude=[key])
+ if pairs:
+ return self.new_leaf(pairs)
+ else:
+ return None
+ else:
+ raise KeyError(key)
+
+ def _merge(self, id1, id2):
+ n1 = self.get_node(id1)
+ n2 = self.get_node(id2)
+ if isinstance(n1, btree.IndexNode):
+ assert isinstance(n2, btree.IndexNode)
+ return self.new_index(n1.pairs() + n2.pairs())
+ else:
+ assert isinstance(n1, btree.LeafNode)
+ assert isinstance(n2, btree.LeafNode)
+ return self.new_leaf(n1.pairs() + n2.pairs())
+
+ def _remove_from_minimal_index(self, node_id, key, child_key):
+ node = self.get_node(node_id)
+ exclude = [child_key]
+ new_ones = []
+ child = self._remove(node[child_key], key)
+
+ if child is not None:
+ keys = node.keys()
+ i = keys.index(child_key)
+
+ # If possible, merge with left or right sibling.
+ if (i > 0 and
+ len(self.get_node(node[keys[i-1]])) < self.max_index_length):
+ new_ones.append(self._merge(node[keys[i-1]], child.id))
+ exclude.append(keys[i-1])
+ elif (i+1 < len(keys) and
+ len(self.get_node(node[keys[i+1]])) < self.max_index_length):
+ new_ones.append(self._merge(node[keys[i+1]], child.id))
+ exclude.append(keys[i+1])
+ else:
+ new_ones.append(child)
+
+ others = node.pairs(exclude=exclude)
+ if others + new_ones:
+ return self.new_index(others +
+ [(n.first_key(), n.id) for n in new_ones])
+ else:
+ return None
+
+ def _remove_from_nonminimal_index(self, node_id, key, child_key):
+ node = self.get_node(node_id)
+ child = self._remove(node[child_key], key)
+ pairs = node.pairs(exclude=[child_key])
+ if child is not None:
+ pairs += [(child.first_key(), child.id)]
+ pairs.sort()
+ assert pairs
+ return self.new_index(pairs)
+
diff --git a/btree/__init___tests.py b/btree/tree_tests.py
index f6f036c..16efa8c 100644
--- a/btree/__init___tests.py
+++ b/btree/tree_tests.py
@@ -28,69 +28,6 @@ class DummyNodeStore(object):
return self.nodes.keys()
-class LeafNodeTests(unittest.TestCase):
-
- def setUp(self):
- self.node_id = 12765
- self.leaf = btree.LeafNode(self.node_id, [('foo', 'bar')])
-
- def test_has_id(self):
- self.assertEqual(self.leaf.id, self.node_id)
-
- def test_has_keys(self):
- self.assertEqual(self.leaf.keys(), ['foo'])
-
- def test_has_value(self):
- self.assertEqual(self.leaf['foo'], 'bar')
-
- def test_sorts_keys(self):
- leaf = btree.LeafNode(0, [('foo', 'foo'), ('bar', 'bar')])
- self.assertEqual(leaf.keys(), sorted(['foo', 'bar']))
-
-
-class IndexNodeTests(unittest.TestCase):
-
- def setUp(self):
- self.leaf1 = btree.LeafNode(0, [('bar', 'bar')])
- self.leaf2 = btree.LeafNode(1, [('foo', 'foo')])
- self.index_id = 1234
- self.index = btree.IndexNode(self.index_id,
- [('bar', self.leaf1.id),
- ('foo', self.leaf2.id)])
-
- def test_has_id(self):
- self.assertEqual(self.index.id, self.index_id)
-
- def test_has_keys(self):
- self.assertEqual(self.index.keys(), ['bar', 'foo'])
-
- def test_has_children(self):
- self.assertEqual(sorted(self.index.values()),
- sorted([self.leaf1.id, self.leaf2.id]))
-
- def test_has_indexed_children(self):
- self.assertEqual(self.index['bar'], self.leaf1.id)
- self.assertEqual(self.index['foo'], self.leaf2.id)
-
-
-class NodeCodecTests(unittest.TestCase):
-
- def setUp(self):
- self.leaf = btree.LeafNode(1234, [('foo', 'bar')])
- self.index = btree.IndexNode(5678,
- [('bar', 1234),
- ('foo', 7890)])
- self.codec = btree.NodeCodec(3)
-
- def test_leaf_round_trip_ok(self):
- encoded = self.codec.encode_leaf(self.leaf)
- self.assertEqual(self.codec.decode_leaf(encoded), self.leaf)
-
- def test_index_round_trip_ok(self):
- encoded = self.codec.encode_index(self.index)
- self.assertEqual(self.codec.decode_index(encoded), self.index)
-
-
class KeySizeMismatchTests(unittest.TestCase):
def setUp(self):
diff --git a/without-tests b/without-tests
new file mode 100644
index 0000000..7e20ec0
--- /dev/null
+++ b/without-tests
@@ -0,0 +1,2 @@
+./btree/__init__.py
+./btree/nodestore.py