# summainlib.py (blob 41ff5cee8e7eb5d78b8bed99e966b143695eb021)
# Copyright (C) 2010, 2011  Lars Wirzenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import grp
import hashlib
import math
import os
import pwd
import stat
import time
import urllib


__version__ = '0.8'


class NumberNormalizer(object):

    '''Replace inode and device numbers with small sequential ones.

    Two manifests of the same directory tree may be made on different
    disks, in which case the raw inode and device numbers differ even
    though nothing interesting has changed. What matters is only
    whether two names share a number (they are hardlinks to the same
    file) or not.

    The normalizer therefore hands out 1, 2, 3, ... in the order in
    which it first sees each raw number, and repeats the earlier answer
    when a raw number shows up again. Raw inode numbers [42, 13, 105]
    become [1, 2, 3]. Inode and device numbers are counted separately.

    The caller must feed files in a deterministic order. A file that
    is new in the second manifest shifts every number handed out after
    it, which makes for a noisy diff; that is an accepted limitation.

    '''

    def __init__(self):
        self.reset()

    def get(self, input_number, numbers, next):
        '''Look up input_number in the numbers dict, adding it if new.

        Returns a pair: the normalized number, and the value the
        caller should use as 'next' from now on. A previously unseen
        input_number is assigned 'next' and recorded in numbers.

        '''

        if input_number not in numbers:
            numbers[input_number] = next
            return next, next + 1
        return numbers[input_number], next

    def get_ino(self, ino):
        '''Return the normalized number for inode number ino.'''
        normalized, self.next_ino = self.get(
            ino, self.ino_numbers, self.next_ino)
        return normalized

    def get_dev(self, dev):
        '''Return the normalized number for device number dev.'''
        normalized, self.next_dev = self.get(
            dev, self.dev_numbers, self.next_dev)
        return normalized

    def reset(self):
        '''Forget all mappings and restart numbering from 1.

        This is used by unit tests.

        '''

        self.ino_numbers = {}
        self.dev_numbers = {}
        self.next_ino = 1
        self.next_dev = 1


class PathNormalizer(object):

    '''Normalize a filesystem path.

    For every input path, a new output path is given. The same output
    path is given every time the same input path is given. The output
    path has the same number of elements as the input path, and every
    distinct element name is replaced by its own generated name: if
    /foo/bar results in /a/b, then /foo/yo results in /a/c, i.e., the
    tree structure is the same.

    The path separator, '.' and '..' are passed through unchanged.

    '''

    def __init__(self):
        # Number of generated names handed out so far.
        self._counter = 0
        # Maps an original path element to its replacement. The
        # separator and the special directory names map to themselves.
        self._dict = dict()
        self._dict[os.sep] = os.sep
        self._dict['.'] = '.'
        self._dict['..'] = '..'

    def split(self, path):
        '''Split path into its elements.

        A leading or trailing separator is represented by an element
        that is the separator itself, so that normalize() can put it
        back. The root directory alone gives a single such element.

        '''

        if path == os.sep:
            return [os.sep]
        parts = path.split(os.sep)
        if parts:
            if parts[0] == '':
                parts[0] = os.sep
            if parts[-1] == '':
                parts[-1] = os.sep
        return parts

    def _base26(self, n):
        '''Return the non-negative integer n written with digits a-z.

        0 is 'a', 25 is 'z', 26 is 'ba', and so on.

        '''

        if n == 0:
            digits = [0]
        else:
            digits = []
            while n > 0:
                # divmod keeps everything an integer. A plain '/'
                # is true division on Python 3 and would turn the
                # digits into floats, which cannot index a string.
                n, digit = divmod(n, 26)
                digits.append(digit)

        letters = 'abcdefghijklmnopqrstuvwxyz'
        # Digits were collected least significant first.
        return ''.join(letters[x] for x in reversed(digits))

    def normalize_part(self, part):
        '''Return the replacement for one path element.

        A new replacement is generated the first time an element is
        seen, and remembered for later calls.

        '''

        if part not in self._dict:
            self._dict[part] = self._base26(self._counter)
            self._counter += 1
        return self._dict[part]

    def normalize(self, path):
        '''Return path with every element replaced by its normalized name.'''
        parts = self.split(path)
        normalized = [self.normalize_part(x) for x in parts]
        result = ''
        for x in normalized:
            if not result:
                result = x
            elif x == os.sep:
                # A trailing separator element: append it as it is.
                result += x
            elif result.endswith(os.sep):
                # Right after the leading separator: do not double it.
                result += x
            else:
                result += os.sep + x
        return result


class SamePath(object): # pragma: no cover

    '''A path normalizer that leaves paths alone.

    It offers the same normalize() method as a real path normalizer,
    but hands every path back exactly as it was given.

    '''

    def normalize(self, path):
        '''Return path unchanged.'''
        return path


class FilesystemObject(object):

    '''An object in the file system.

    Responsible for gathering information and formatting it for
    reporting.

    Fields are read with obj[key]. Keys are case-insensitive and '-'
    is treated like '_'. Each field is computed on first use by the
    method called _compute_<key>, and computed at most once. A field
    that is excluded, or that does not apply to this object, has the
    value ''. An unknown key raises KeyError.

    Arguments:

    * filename: path of the object to describe
    * nn: normalizer for numbers; must have get_ino and get_dev
    * pn: normalizer for paths; must have normalize
    * exclude: names of fields whose value is always reported as ''
    * checksums: list of checksum field names that format() appends
      after the standard fields

    The optional arguments are intended for unit tests. open_file is
    called with the filename only, and must return a context manager
    whose read(size) gives the file contents.

    '''

    def __init__(self, filename, nn, pn, exclude, checksums,
                 stat_result=None, sha1=None, sha224=None,
                 sha256=None, sha384=None, sha512=None,
                 md5=None, open_file=None, readlink=None):
        self._filename = filename
        self._exclude = set(self._normalize_key(k) for k in exclude)
        self._checksums = checksums
        self._pn = pn
        self._nn = nn
        self._md5 = md5 or hashlib.md5()
        self._sha1 = sha1 or hashlib.sha1()
        self._sha224 = sha224 or hashlib.sha224()
        self._sha256 = sha256 or hashlib.sha256()
        self._sha384 = sha384 or hashlib.sha384()
        self._sha512 = sha512 or hashlib.sha512()
        self._stat_result = stat_result or os.lstat(filename)
        self.open_file = open_file or self._open_binary
        self.readlink = readlink or os.readlink
        # Values by the exact key spelling the caller used.
        self.values = dict()
        # Results by normalized key, so that a field is never computed
        # twice just because it was asked for under two spellings.
        self._computed = dict()

    def _open_binary(self, filename):
        '''Open filename for reading bytes; the default for open_file.'''
        # Checksums are over the raw bytes, so never use text mode.
        return open(filename, 'rb')

    def _quote(self, text):
        '''Return text URL-quoted, the way the Name field is quoted.'''
        # Imported here because the function lives in a different
        # place in Python 2 and Python 3.
        try:
            from urllib import quote
        except ImportError:
            from urllib.parse import quote
        return quote(text)

    def _compute_name(self):
        return self._quote(self._pn.normalize(self._filename))

    def _compute_mtime(self):
        return self.format_time(self._stat_result.st_mtime)

    def _compute_mode(self):
        return '%o' % self._stat_result.st_mode

    def _compute_ino(self):
        return '%d' % self._nn.get_ino(self._stat_result.st_ino)

    def _compute_dev(self):
        return '%d' % self._nn.get_dev(self._stat_result.st_dev)

    def _compute_nlink(self):
        return '%d' % self._stat_result.st_nlink

    def _compute_size(self):
        # No size is reported for directories: None means "no value".
        if not stat.S_ISDIR(self._stat_result.st_mode):
            return '%d' % self._stat_result.st_size

    def _compute_uid(self):
        return '%d' % self._stat_result.st_uid

    def _compute_username(self):
        return self.lookup_username(self._stat_result.st_uid)

    def _compute_gid(self):
        return '%d' % self._stat_result.st_gid

    def _compute_group(self):
        return self.lookup_group(self._stat_result.st_gid)

    def _compute_md5(self):
        return self.compute_checksum(self._filename, self._md5)

    def _compute_sha1(self):
        return self.compute_checksum(self._filename, self._sha1)

    def _compute_sha224(self):
        return self.compute_checksum(self._filename, self._sha224)

    def _compute_sha256(self):
        return self.compute_checksum(self._filename, self._sha256)

    def _compute_sha384(self):
        return self.compute_checksum(self._filename, self._sha384)

    def _compute_sha512(self):
        return self.compute_checksum(self._filename, self._sha512)

    def _compute_target(self):
        # Only symbolic links have a target: None means "no value".
        if stat.S_ISLNK(self._stat_result.st_mode):
            return self.readlink(self._filename)

    def format_time(self, timestamp):
        '''Format timestamp (seconds since the epoch) as UTC text.

        The result looks like '2011-02-03 04:05:06.123456 +0000',
        with the fraction rounded to microseconds.

        '''

        # Split with floor so that the fraction is never negative,
        # also for timestamps before the epoch.
        whole = math.floor(timestamp)
        fract = '%.6f' % (timestamp - whole)
        if fract.startswith('1'):
            # The fraction rounded up to a full second: carry it.
            whole += 1
            fract = '0.000000'
        s = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(whole))
        s += fract[1:] # Skip the leading '0', keep the '.'
        s += ' +0000'
        return s

    def lookup_username(self, uid):
        '''Return the user name for uid.

        pwd.getpwuid raises KeyError if there is no entry for uid.

        '''

        return pwd.getpwuid(uid).pw_name

    def lookup_group(self, gid):
        '''Return the group name for gid.

        grp.getgrgid raises KeyError if there is no entry for gid.

        '''

        return grp.getgrgid(gid).gr_name

    def compute_checksum(self, filename, checksummer):
        '''Return the hex digest of the contents of filename.

        The contents are fed to checksummer, which must have update
        and hexdigest methods. For anything that is not a regular
        file, the result is '' and nothing is read.

        '''

        if stat.S_ISREG(self._stat_result.st_mode):
            with self.open_file(filename) as f:
                while True:
                    data = f.read(64*1024) # 64 KiB seems reasonable.
                    if not data:
                        break
                    checksummer.update(data)
            return checksummer.hexdigest()
        else:
            return ''

    def _normalize_key(self, key):
        '''Return the canonical form of a field name.

        The canonical form is lower case with '-' replaced by '_'.

        '''

        key = key.lower()
        key = '_'.join(key.split('-'))
        return key

    def __getitem__(self, key):
        '''Return the value of field key, or '' if it has no value.

        Raises KeyError if there is no such field.

        '''

        normalized = self._normalize_key(key)
        if normalized in self._exclude:
            return ''
        if key not in self.values:
            if normalized not in self._computed:
                method = '_compute_%s' % normalized
                if not hasattr(self, method):
                    raise KeyError(key)
                # Remember the result under the normalized key. The
                # checksum objects accumulate data, so computing a
                # checksum a second time would give a wrong digest.
                self._computed[normalized] = getattr(self, method)()
            value = self._computed[normalized]
            if value is not None:
                self.values[key] = value
        return self.values.get(key, '')

    def _isdir(self):
        '''Is this a directory?'''

        # Ask the stat result directly. Going via self['Mode'] would
        # fail when the mode field has been excluded.
        return stat.S_ISDIR(self._stat_result.st_mode)

    def relative_path(self, root):
        '''Return a path that is relative to root, if possible.

        If pathname does not start with root, then return it
        unmodified. If pathname is root itself, and is a directory,
        return '.'.

        '''

        if root.endswith(os.sep):
            root2 = root
        else:
            root2 = root + os.sep
        # The Name field is URL-quoted, so root must be quoted the
        # same way before the two can be compared.
        root = self._quote(root)
        root2 = self._quote(root2)
        pathname = self['Name']
        if pathname.startswith(root2):
            return pathname[len(root2):]
        elif pathname == root and self._isdir():
            return '.'
        else:
            return pathname

    def format(self, root=None): # pragma: no cover
        '''Return the manifest entry for this object, as text.

        The entry has one 'Key: value' line per field that has a
        value: Name first, then the standard fields, then the
        checksums. If root is given, Name is relative to it.

        '''

        if root is None:
            name = self['Name']
        else:
            name = self.relative_path(root)

        keys = (['Mtime', 'Mode', 'Ino', 'Dev', 'Nlink', 'Size',
                 'Uid', 'Username', 'Gid', 'Group', 'Target'] +
                list(self._checksums))
        values = [('Name', name)]
        values += [(k, self[k]) for k in keys]
        return ''.join('%s: %s\n' % (k, v) for k, v in values if v != '')