summaryrefslogtreecommitdiff
path: root/larch/codec.py
blob: aa086abf4bf7fc94eeef47ece408c5ab95d5db3f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# Copyright 2010  Lars Wirzenius
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import struct

import larch


class CodecError(larch.Error):

    def __init__(self, msg):
        self.msg = msg


class NodeCodec(object):

    '''Encode and decode nodes from their binary format.
    
    Node identifiers are assumed to fit into 64 bits.
    
    Leaf node values are assumed to fit into 4 gibibytes.
    
    '''
    
    format = 1

    # We use the struct module for encoding and decoding. For speed,
    # we construct the format string all at once, so that there is only
    # one call to struct.pack or struct.unpack for one node. This brought
    # a thousand time speedup over doing it one field at a time. However,
    # the code is not quite as clear as it might be, what with no symbolic
    # names for anything is used anymore. Patches welcome.
    
    def __init__(self, key_bytes):
        self.key_bytes = key_bytes
        self.leaf_header = struct.Struct('!4sQI')
        self.index_header = struct.Struct('!4sQI')
        # space for key and length of value is needed for each pair
        self.leaf_pair_fixed_size = key_bytes + struct.calcsize('!I')
        self.index_pair_size = key_bytes + struct.calcsize('!Q')
        
    def leaf_size(self, keys, values):
        '''Return size of a leaf node with the given pairs.'''
        return (self.leaf_header.size + len(keys) * self.leaf_pair_fixed_size +
                len(''.join([value for value in values])))

    def leaf_size_delta_add(self, old_size, value):
        '''Return size of node that gets a new key/value pair added.
        
        ``old_size`` is the old size of the node. The key must not already
        have existed in the node.
        
        '''
        
        delta = self.leaf_pair_fixed_size + len(value)
        return old_size + delta

    def leaf_size_delta_replace(self, old_size, old_value, new_value):
        '''Return size of node that gets a value replaced.'''
        
        return old_size + len(new_value) - len(old_value)

    def encode_leaf(self, node):
        '''Encode a leaf node as a byte string.'''

        keys = node.keys()
        values = node.values()
        return (self.leaf_header.pack('ORBL', node.id, len(keys)) +
                ''.join(keys) +
                struct.pack('!%dI' % len(values), *map(len, values)) +
                ''.join(values))

    def decode_leaf(self, encoded):
        '''Decode a leaf node from its encoded byte string.'''

        buf = buffer(encoded)
        cookie, node_id, num_pairs = self.leaf_header.unpack_from(buf)
        if cookie != 'ORBL':
            raise CodecError('Leaf node does not begin with magic cookie '
                             '(should be ORBL, is %s)' % repr(cookie))
        fmt = '!' + ('%ds' % self.key_bytes) * num_pairs + 'I' * num_pairs
        items = struct.unpack_from(fmt, buf, self.leaf_header.size)
        keys = items[:num_pairs]
        lengths = items[num_pairs:num_pairs*2]
        values = []
        offset = self.leaf_header.size + self.leaf_pair_fixed_size * num_pairs
        append = values.append
        for length in lengths:
            append(encoded[offset:offset + length])
            offset += length
        return larch.LeafNode(node_id, keys, values)

    def max_index_pairs(self, node_size):
        '''Return number of index pairs that fit in a node of a given size.'''
        return (node_size - self.index_header.size) / self.index_pair_size
        
    def index_size(self, keys, values):
        '''Return size of an index node with the given pairs.'''
        return self.index_header.size + self.index_pair_size * len(keys)

    def encode_index(self, node):
        '''Encode an index node as a byte string.'''

        keys = node.keys()
        child_ids = node.values()
        return (self.index_header.pack('ORBI', node.id, len(keys)) +
                ''.join(keys) +
                struct.pack('!%dQ' % len(child_ids), *child_ids))

    def decode_index(self, encoded):
        '''Decode an index node from its encoded byte string.'''

        buf = buffer(encoded)
        cookie, node_id, num_pairs = self.index_header.unpack_from(buf)
        if cookie != 'ORBI':
            raise CodecError('Index node does not begin with magic cookie '
                             '(should be ORBI, is %s)' % repr(cookie))
        fmt = '!' + ('%ds' % self.key_bytes) * num_pairs + 'Q' * num_pairs
        items = struct.unpack_from(fmt, buf, self.index_header.size)
        keys = items[:num_pairs]
        child_ids = items[num_pairs:]
        assert len(keys) == len(child_ids)
        for x in child_ids:
            assert type(x) == int
        return larch.IndexNode(node_id, keys, child_ids)

    def encode(self, node):
        '''Encode a node of any type.'''
        if isinstance(node, larch.LeafNode):
            return self.encode_leaf(node)
        else:
            return self.encode_index(node)

    def decode(self, encoded):
        '''Decode node of any type.'''
        if encoded.startswith('ORBL'):
            return self.decode_leaf(encoded)
        elif encoded.startswith('ORBI'):
            return self.decode_index(encoded)
        else:
            raise CodecError('Unknown magic cookie in encoded node (%s)' %
                             repr(encoded[:4]))

    def size(self, node):
        '''Return encoded size of a node, regardless of type.'''
        keys = node.keys()
        values = node.values()
        if isinstance(node, larch.LeafNode):
            return self.leaf_size(keys, values)
        else:
            return self.index_size(keys, values)