# Copyright (C) 2010, 2011  Lars Wirzenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import base64
import grp
import hashlib
import hmac
import os
import pwd
import stat
import time
import urllib
import errno

import _summain


__version__ = '0.20'


class NumberNormalizer(object):

    '''Normalize inode and device numbers.

    When we make two manifests of the same directory tree, but the
    tree may have been moved to another disk, the inode and device
    numbers may be different. This should not be a cause for concern,
    however. What is important is that if two names were hardlinked
    to the same file before, they still are, and if they weren't,
    they still aren't.

    To achieve this, we normalize the inode and device numbers.
    The input files are fed to the normalizer in a deterministic
    sequence, and the sequence defines the numbers we use. Thus,
    if the input files have inode numbers [42, 13, 105], we produce
    [1, 2, 3]. If one of the input numbers is repeated, that number
    is re-used.

    This is not a perfect solution. If the second manifest has a new
    file, it will throw off the entire remaining sequence, causing a
    big diff. But we'll live with that.

    '''

    def __init__(self):
        self.reset()

    def get(self, input_number, numbers, next_number):
        '''Map input_number to its normalized number.

        numbers is the mapping of already seen input numbers; it is
        updated in place when input_number is new. Returns a tuple of
        (normalized number, next unused number).

        '''

        if input_number in numbers:
            return numbers[input_number], next_number
        numbers[input_number] = next_number
        return next_number, next_number + 1

    def get_ino(self, ino):
        '''Return the normalized number for inode number ino.'''
        output, self.next_ino = self.get(ino, self.ino_numbers,
                                         self.next_ino)
        return output

    def get_dev(self, dev):
        '''Return the normalized number for device number dev.'''
        output, self.next_dev = self.get(dev, self.dev_numbers,
                                         self.next_dev)
        return output

    def reset(self):
        '''This is used by unit tests.'''
        self.ino_numbers = dict()
        self.next_ino = 1
        self.dev_numbers = dict()
        self.next_dev = 1


class PathNormalizer(object):

    '''Normalize a filesystem path.

    Paths are normalized by computing an HMAC, keyed with a secret,
    over the real path. The hexadecimal digest is the normalized path.

    Note that hmac.new is called without an explicit digestmod, so the
    hash function is whatever the hmac module defaults to (MD5 on
    Python 2), not SHA-1. The digest is deliberately left unchanged so
    that previously generated manifests remain comparable.

    '''

    def __init__(self, secret):
        self._secret = secret

    def normalize(self, path):
        '''Return the keyed hex digest that stands in for path.'''
        return hmac.new(self._secret, path).hexdigest()


class SamePath(object):  # pragma: no cover

    '''A path "normalizer" that returns every path unchanged.'''

    def normalize(self, path):
        return path


# Indexes into the stat result sequence used by FilesystemObject (by
# default the value returned by _summain.lstat). RESULT_RET is not
# consulted anywhere in this file; presumably it is the return code of
# the underlying call -- verify against the _summain extension.
RESULT_RET = 0
RESULT_DEV = 1
RESULT_INO = 2
RESULT_MODE = 3
RESULT_NLINK = 4
RESULT_UID = 5
RESULT_GID = 6
RESULT_RDEV = 7
RESULT_SIZE = 8
RESULT_BLKSIZE = 9
RESULT_BLOCKS = 10
RESULT_ATIME_SEC = 11
RESULT_ATIME_NSEC = 12
RESULT_MTIME_SEC = 13
RESULT_MTIME_NSEC = 14
RESULT_CTIME_SEC = 15
RESULT_CTIME_NSEC = 16


class FilesystemObject(object):

    '''An object in the file system.

    Responsible for gathering information and formatting it for
    reporting. Fields are looked up with obj['Field-Name']; each field
    is computed on first access by the matching _compute_* method and
    then cached in self.values.

    filename is the path to examine, nn is a NumberNormalizer-like
    object, pn is a path normalizer (an object with a normalize
    method), and exclude is an iterable of field names that should
    always be reported as empty.

    The optional arguments are intended for unit tests.

    '''

    def __init__(self, filename, nn, pn, exclude,
                 stat_result=None, sha1=None, sha224=None,
                 sha256=None, sha384=None, sha512=None,
                 md5=None, open_file=None, readlink=None,
                 xattrs=None):
        self.filename = filename
        self.relative = None
        self._exclude = set(self._normalize_key(k) for k in exclude)
        self._pn = pn
        self._nn = nn

        self._md5 = md5 or hashlib.md5()
        self._sha1 = sha1 or hashlib.sha1()
        self._sha224 = sha224 or hashlib.sha224()
        self._sha256 = sha256 or hashlib.sha256()
        self._sha384 = sha384 or hashlib.sha384()
        self._sha512 = sha512 or hashlib.sha512()

        self._stat_result = stat_result or _summain.lstat(filename)
        self._xattrs = (xattrs if xattrs is not None
                        else self.get_xattrs(filename))
        self.open_file = open_file or file
        self.readlink = readlink or os.readlink

        # Cache of computed field values, keyed by normalized field name.
        self.values = dict()

    def _compute_name(self):
        if self.relative is None:
            name = self.filename
        else:
            name = self.relative  # pragma: no cover
        return urllib.quote(self._pn.normalize(name))

    def _compute_mtime(self):
        return self.format_time(self._stat_result[RESULT_MTIME_SEC],
                                self._stat_result[RESULT_MTIME_NSEC])

    def _compute_mode(self):
        return '%o' % self._stat_result[RESULT_MODE]

    def _compute_ino(self):
        return '%d' % self._nn.get_ino(self._stat_result[RESULT_INO])

    def _compute_dev(self):
        return '%d' % self._nn.get_dev(self._stat_result[RESULT_DEV])

    def _compute_nlink(self):
        return '%d' % self._stat_result[RESULT_NLINK]

    def _compute_size(self):
        # Directories get no size: returning None makes the field empty.
        if stat.S_ISDIR(self._stat_result[RESULT_MODE]):
            return None
        return '%d' % self._stat_result[RESULT_SIZE]

    def _compute_uid(self):
        return '%d' % self._stat_result[RESULT_UID]

    def _compute_username(self):
        return self.lookup_username(self._stat_result[RESULT_UID])

    def _compute_gid(self):
        return '%d' % self._stat_result[RESULT_GID]

    def _compute_group(self):
        return self.lookup_group(self._stat_result[RESULT_GID])

    def _compute_md5(self):
        return self.compute_checksum(self.filename, self._md5)

    def _compute_sha1(self):
        return self.compute_checksum(self.filename, self._sha1)

    def _compute_sha224(self):
        return self.compute_checksum(self.filename, self._sha224)

    def _compute_sha256(self):
        return self.compute_checksum(self.filename, self._sha256)

    def _compute_sha384(self):
        return self.compute_checksum(self.filename, self._sha384)

    def _compute_sha512(self):
        return self.compute_checksum(self.filename, self._sha512)

    def _compute_target(self):
        # Only symbolic links have a target; others yield an empty field.
        if stat.S_ISLNK(self._stat_result[RESULT_MODE]):
            return self.readlink(self.filename)
        return None

    def _compute_xattrs(self):  # pragma: no cover
        if not self._xattrs:
            return ''

        def quote(s):
            # Alphanumeric values are shown quoted as-is; anything else
            # is base64 encoded and marked with a '0s' prefix.
            if s.isalnum():
                return '"%s"' % s
            else:
                return '0s' + base64.urlsafe_b64encode(s)

        # Sort the names so the output does not depend on dict ordering.
        parts = [' %s=%s' % (k, quote(self._xattrs[k]))
                 for k in sorted(self._xattrs)]
        return '\n' + '\n'.join(parts)

    def format_time(self, secs, nsecs):
        '''Format a timestamp, in UTC, with nanosecond precision.'''
        s = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(secs))
        s += '.%09d' % nsecs
        s += ' +0000'
        return s

    def lookup_username(self, uid):
        '''Return the user name for uid, from the password database.'''
        return pwd.getpwuid(uid).pw_name

    def lookup_group(self, gid):
        '''Return the group name for gid, from the group database.'''
        return grp.getgrgid(gid).gr_name

    def compute_checksum(self, filename, checksummer):
        '''Return the hex digest of a regular file's contents.

        The file is fed to checksummer in chunks. For anything that is
        not a regular file the result is an empty string.

        '''

        if not stat.S_ISREG(self._stat_result[RESULT_MODE]):
            return ''
        with self.open_file(filename) as f:
            while True:
                data = f.read(64 * 1024)  # 64 KiB seems reasonable.
                if not data:
                    break
                checksummer.update(data)
        return checksummer.hexdigest()

    def _normalize_key(self, key):
        '''Lower-case a field name and turn dashes into underscores.'''
        key = key.lower()
        key = '_'.join(key.split('-'))
        return key

    def __getitem__(self, key):
        '''Return the value of the field called key.

        Field names are case-insensitive and may use dashes or
        underscores. Excluded fields, and fields whose value does not
        apply to this object, are returned as an empty string. Raises
        KeyError if there is no such field.

        '''

        normalized = self._normalize_key(key)
        if normalized in self._exclude:
            return ''
        # The cache is keyed by the normalized name so that different
        # spellings of one field ('SHA1', 'sha1') share a single
        # computation. This matters for checksums: the checksummer
        # objects are stateful, so computing twice would feed the file
        # into them twice and give a wrong digest.
        if normalized not in self.values:
            method = getattr(self, '_compute_%s' % normalized, None)
            if method is None:
                raise KeyError(key)
            value = method()
            if value is not None:
                self.values[normalized] = value
        return self.values.get(normalized, '')

    def isdir(self):  # pragma: no cover
        '''Is this a directory?'''
        # Use the stat mode directly rather than self['Mode']: the
        # latter is an empty string when the mode field is excluded,
        # which cannot be parsed as an octal number.
        return stat.S_ISDIR(self._stat_result[RESULT_MODE])

    def get_xattrs(self, filename):  # pragma: no cover
        '''Return a dict of the extended attributes of filename.

        An integer result from _summain.llistxattr is treated as an
        errno value: EOPNOTSUPP gives an empty dict, anything else
        raises OSError. Otherwise the result is split on NUL bytes
        into attribute names, and each value is fetched with
        _summain.lgetxattr.

        '''

        ret = _summain.llistxattr(filename)
        if isinstance(ret, int):
            # Some file types don't support xattr, e.g. named pipes on
            # FreeBSD:
            if ret == errno.EOPNOTSUPP:
                return {}
            # Pass the values as separate arguments so that the errno,
            # strerror and filename attributes of the exception get set.
            raise OSError(ret, os.strerror(ret), filename)

        names = [s for s in ret.split('\0') if s]
        return dict((name, _summain.lgetxattr(filename, name))
                    for name in names)