#!/usr/bin/python
# Copyright 2011 Lars Wirzenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import cliapp
import ConfigParser
import logging
import os
import re
import shutil
import subprocess
import tempfile
import time


__version__ = '1.18'


class Measurement(object):

    '''Resource usage of one measured command.

    time_output is the text written by /usr/bin/time with the format
    used in runcmd below: user CPU seconds, system CPU seconds,
    elapsed wall-clock seconds, and maximum resident set size, one
    value per line.  The repository-related attributes are not
    measured by time(1); callers fill them in afterwards.
    '''

    def __init__(self, time_output):
        fields = [float(x) for x in time_output.splitlines()]
        self.user = fields[0]       # user-mode CPU time, seconds
        self.system = fields[1]     # kernel-mode CPU time, seconds
        self.real = fields[2]       # elapsed real time, seconds
        self.maxrss = fields[3]     # maximum resident set size
        # Filled in later by the benchmark driver, not by time(1).
        self.new_data = 0
        self.repo_size_after = 0
        self.repo_bytes_written = 0
        self.repo_bytes_read = 0
        self.repo_roundtrips = 0


def runcmd(argv, **kwargs):
    '''Run argv under /usr/bin/time and measure its resource usage.

    Extra keyword arguments are passed through to subprocess.Popen.
    Return a tuple (Measurement, stdout_data).  Raise
    cliapp.AppException if the command exits with a non-zero status.
    '''

    logging.debug('run: %s %s' % (argv, kwargs))
    fd, timings = tempfile.mkstemp()
    try:
        # /usr/bin/time writes the four measurements, one per line,
        # into the temporary file; we read them back through the
        # still-open file descriptor from mkstemp.
        time_argv = ['/usr/bin/time', '-o', timings,
                     '--format', '%U\n%S\n%e\n%M']
        p = subprocess.Popen(time_argv + argv, **kwargs)
        out, err = p.communicate()
        data = os.read(fd, 1024**2)
    finally:
        # Always release the temporary file: the original version
        # leaked both the fd and the file when Popen or communicate
        # raised an exception.
        os.close(fd)
        os.remove(timings)
    if p.returncode != 0:
        # NOTE(review): err is None unless the caller passed
        # stderr=subprocess.PIPE, so the message may end in 'None'.
        raise cliapp.AppException('command failed: %s\n%s' % (argv, err))
    return Measurement(data), out


# Clear Linux kernel buffer and inode caches.
# See http://linux-mm.org/Drop_Caches for details.
def drop_caches():
    '''Clear the Linux kernel buffer and inode caches.

    Writes "3" to /proc/sys/vm/drop_caches via "sudo tee", so each
    measured command starts with cold caches.
    '''

    def sudo_tee(status):
        # tee is used (instead of a shell redirection) so that only
        # the actual write to /proc needs root privileges.
        p = subprocess.Popen(['sudo', '-p',
                              'Password (for clearing cache): ',
                              'tee', '/proc/sys/vm/drop_caches'],
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE)
        out, err = p.communicate('%s\n' % status)
        if p.returncode != 0:
            raise cliapp.AppException('failed to clear cache')

    logging.debug('clearing Linux kernel cache')
    sudo_tee(3)


class BackupProgram(object):

    '''Abstract base class for backup programs being benchmarked.

    Subclasses implement the operations that the benchmark measures;
    the no-op method bodies here document the expected contract.
    '''

    # Subclasses set this to the user-visible program name.
    name = None

    def __init__(self, live_data, repo, settings):
        self.live_data = live_data
        self.repo = repo
        self.settings = settings

    def set_meta(self, cp):
        '''Set [meta] fields in report.

        These might depend on the program. For example, if running
        something from a version control branch, a subclass might
        record the revision id here.

        '''

    def prepare(self):
        '''Prepare program for benchmark.

        This might, for example, compile it.

        '''

    def backup(self, nth_gen):
        '''Run a backup from live_data to repo.

        This should start a new generation, in whatever way is most
        appropriate for the backup program.

        '''

    def fsck(self, nth_gen):
        '''Run fsck on the repository, after the nth backup generation.'''

    def verify(self, nth_gen):
        '''Run verify on the repository, after the nth backup generation.'''

    def list_files(self, nth_gen):
        '''This should retrieve a list of all files in a generation.

        The list should be written to /dev/null.

        '''

    def restore(self, nth_gen, target_dir):
        '''Restore all files in a generation, to a target directory.'''

    def forget(self, nth_gen):
        '''Remove a given generation.'''


class Obnam(BackupProgram):

    '''Benchmark driver for the obnam backup program.'''

    name = 'obnam'

    @property
    def _cmd(self):
        # When benchmarking a bzr branch, run the in-tree ./obnam
        # (the commands run with cwd set to the branch); otherwise
        # use whatever obnam is installed on $PATH.
        if self.settings['obnam-branch']:
            return './obnam'
        else:
            return 'obnam'

    @property
    def _branch(self):
        # Directory of the obnam bzr branch, or None if not set.
        return self.settings['obnam-branch'] or None

    @property
    def _revno(self):
        # Current revision number of the obnam branch, via bzr.
        timings, revno = runcmd(['bzr', 'revno'], cwd=self._branch,
                                stdout=subprocess.PIPE)
        return revno.strip()

    @property
    def _larch_branch(self):
        # Directory of the larch bzr branch, or None if not set.
        return self.settings['larch-branch'] or None

    @property
    def _larch_revno(self):
        # Current revision number of the larch branch, via bzr.
        timings, revno = runcmd(['bzr', 'revno'], cwd=self._larch_branch,
                                stdout=subprocess.PIPE)
        return revno.strip()

    def _run(self, args, nth_gen, **kwargs):
        '''Run one obnam command and measure it.

        args is the obnam subcommand plus its arguments; nth_gen is
        used for naming profiling output.  Returns the
        (Measurement, stdout) tuple from runcmd, after filling in
        the repository I/O counters parsed from obnam's log file.
        '''
        # Obnam logs VFS I/O counters to its log file; use a
        # throwaway temporary file and parse it afterwards.
        fd, logfile = tempfile.mkstemp()
        os.close(fd)
        cmd = [self._cmd, '--no-default-configs',
               '--log', logfile,
               '--repository', self.repo,
               '--sftp-delay', str(self.settings['sftp-delay']),
               '--leave-checkpoints',
               '--weak-random']
        if self.settings['obnam-config']:
            cmd.extend(['--config', self.settings['obnam-config']])
        env = dict(os.environ)
        if self.settings['obnam-profile']:
            # The pattern may reference %(op)s and %(gen)s; obnam
            # itself fills in further fields (e.g. order).
            namepattern = {
                'gen': str(nth_gen),
                'op': args[0],
            }
            env['OBNAM_PROFILE'] = self.settings['obnam-profile'] % namepattern
        if self._larch_branch:
            # Prepend the larch branch so it wins over any
            # system-installed larch.
            old = env.get('PYTHONPATH')
            if old:
                new = '%s:%s' % (self._larch_branch, old)
            else:
                new = self._larch_branch
            env['PYTHONPATH'] = new
        if self.settings['encrypt-with']:
            cmd += ['--encrypt-with', self.settings['encrypt-with']]
        result = runcmd(cmd + args, cwd=self._branch, env=env, **kwargs)
        self.extract_repository_io(result[0], logfile)
        os.remove(logfile)
        return result

    def extract_repository_io(self, measurement, logfile):
        '''Parse repository I/O counters from obnam's log file.

        Fills in repo_bytes_read, repo_bytes_written, and
        repo_roundtrips on the given Measurement.  Only lines that
        mention the repository URL are considered, so live-data VFS
        lines are ignored.
        '''
        pat = re.compile(r' VFS:( __del__:)? baseurl=.* '
                         r'read=(?P<read>\d+) written=(?P<written>\d+)')
        pat2 = re.compile(r' VFS: baseurl=.* roundtrips=(?P<n>\d+)')
        f = open(logfile)
        for line in f:
            m = pat.search(line)
            m2 = pat2.search(line)
            # Later matching lines overwrite earlier ones, so the
            # final totals logged by obnam are what gets kept.
            if m and self.repo in line:
                measurement.repo_bytes_written = long(m.group('written'))
                measurement.repo_bytes_read = long(m.group('read'))
            elif m2 and self.repo in line:
                measurement.repo_roundtrips = long(m2.group('n'))
        f.close()

    def prepare(self):
        '''Build obnam from its branch, if one was given.'''
        if self._branch:
            if os.path.exists(os.path.join(self._branch, 'Makefile')):
                logging.info('Building obnam in %s with make' % self._branch)
                runcmd(['make'], cwd=self._branch)
            else:
                logging.info('Building obnam in %s with setup.py' %
                             self._branch)
                # NOTE(review): unlike the make invocation above, this
                # call does not pass cwd=self._branch, so setup.py is
                # run in the current directory — confirm intended.
                runcmd(['python', 'setup.py', 'build_ext', '-i'])

    def backup(self, nth_gen):
        return self._run(['backup', self.live_data], nth_gen)[0]

    def fsck(self, nth_gen):
        return self._run(['fsck'], nth_gen)[0]

    def verify(self, nth_gen):
        return self._run(['verify', self.live_data], nth_gen)[0]

    def _genid(self, nth_gen):
        # Ask obnam for all generation ids and pick the nth one.
        timings, out = self._run(['genids'], nth_gen,
                                 stdout=subprocess.PIPE)
        return out.splitlines()[nth_gen]

    def list_files(self, nth_gen):
        # Write the listing to /dev/null: we measure the listing
        # cost, not the output.
        devnull = os.open('/dev/null', os.O_WRONLY)
        timings, out = self._run(['ls', self._genid(nth_gen)], nth_gen,
                                 stdout=devnull)
        os.close(devnull)
        return timings

    def restore(self, nth_gen, target_dir):
        return self._run(['restore', '--to', target_dir,
                          '--generation', self._genid(nth_gen)],
                         nth_gen)[0]

    def forget(self, nth_gen, gen):
        # NOTE(review): signature has an extra gen argument compared
        # with BackupProgram.forget; Seivot.measure supplies it as a
        # keyword argument (gen=0).
        return self._run(['forget', self._genid(gen)], nth_gen)[0]

    def set_meta(self, cp):
        '''Record branch and revision info in the [meta] section.'''
        if self._branch:
            cp.set('meta', 'obnam-branch', self._branch)
            cp.set('meta', 'obnam-branch-nick',
                   os.path.basename(self._branch))
            cp.set('meta', 'revision', self._revno)
        if self._larch_branch:
            cp.set('meta', 'larch-revision', self._larch_revno)
        cp.set('meta', 'sftp-delay', str(self.settings['sftp-delay']))


class BackupProgramFactory(object):

    '''Create BackupProgram instances by program name.'''

    programs = [Obnam]

    def names(self):
        '''Return the names of all known backup programs.'''
        return [p.name for p in self.programs]

    def new(self, name, **kwargs):
        '''Instantiate the program called name; kwargs go to __init__.

        Returns None if no program matches — callers are expected to
        pass a name validated by the 'program' choice setting.
        '''
        for p in self.programs:
            if p.name == name:
                return p(**kwargs)


class Report(object):

    '''Collect measurements and format them as an INI-style report.

    Measurements are stored per operation name, per generation
    number.
    '''

    def __init__(self, program):
        self.program = program
        # Maps operation name -> {generation number -> Measurement}.
        self.measurements = dict()

    @property
    def generations(self):
        '''Set of all generation numbers with any measurement.'''
        gens = set()
        for op in self.measurements:
            for gen in self.measurements[op]:
                gens.add(gen)
        return gens

    @property
    def operations(self):
        '''All operation names that have been measured.'''
        return self.measurements.keys()

    def add_measurement(self, op, gen, measurement):
        '''Record a measurement for operation op at generation gen.'''
        if op not in self.measurements:
            self.measurements[op] = dict()
        self.measurements[op][gen] = measurement

    def get_measurement(self, op, gen):
        '''Return the measurement for op at gen (KeyError if missing).'''
        return self.measurements[op][gen]

    def format(self, fp):
        '''Write the report to the open file fp in ConfigParser format.

        A [meta] section describes the run; one numbered section per
        generation holds the per-operation figures.
        '''
        cp = ConfigParser.ConfigParser()
        cp.add_section('meta')
        cp.set('meta', 'program', self.program.name)
        if self.program.settings['encrypt-with']:
            cp.set('meta', 'encrypted', 'yes')
        else:
            cp.set('meta', 'encrypted', 'no')
        if self.program.settings['description']:
            cp.set('meta', 'description',
                   self.program.settings['description'])
        if self.program.settings['profile-name']:
            cp.set('meta', 'profile-name',
                   self.program.settings['profile-name'])
        # Let the program add its own metadata (branch, revision, ...).
        self.program.set_meta(cp)
        for gen in self.generations:
            section = str(gen)
            cp.add_section(section)
            for op in self.operations:
                # NOTE(review): assumes every operation has a
                # measurement for every generation; get_measurement
                # raises KeyError otherwise.
                m = self.get_measurement(op, gen)
                for field in ['user', 'system', 'real', 'maxrss']:
                    cp.set(section, '%s.%s' % (op, field),
                           '%.1f' % getattr(m, field))
                cp.set(section, '%s.new-data' % op, m.new_data)
                cp.set(section, '%s.repo-size-after' % op,
                       m.repo_size_after)
                cp.set(section, '%s.repo-bytes-written' % op,
                       m.repo_bytes_written)
                cp.set(section, '%s.repo-bytes-read' % op,
                       m.repo_bytes_read)
                cp.set(section, '%s.repo-roundtrips' % op,
                       m.repo_roundtrips)
        cp.write(fp)


class Seivot(cliapp.Application):

    '''Command line application: benchmark a backup program.

    Generates test live data, runs backup/fsck/verify/ls/restore/
    forget across several generations, measures each operation, and
    writes a ConfigParser-format report.
    '''

    def add_settings(self):
        '''Declare all command line settings.'''
        self.factory = BackupProgramFactory()
        self.settings.choice(['program'], self.factory.names(),
                             'program to benchmark (%default)')
        self.settings.string(['description'],
                             'free-form description of this backup run',
                             metavar='TEXT')
        self.settings.string(['profile-name'],
                             'name of backup use-case profile name '
                             '(for documentation purposes only)')
        self.settings.integer(['generations'],
                              'total number of generations to '
                              'measure (%default)',
                              metavar='COUNT', default=5)
        self.settings.bytesize(['initial-data'],
                               'size of initial live data (%default)',
                               metavar='SIZE', default=1024)
        self.settings.bytesize(['incremental-data'],
                               'add SIZE live data for '
                               'additional generations '
                               '(%default)',
                               metavar='SIZE', default=1024)
        self.settings.string(['use-existing'],
                             'use exiting DIR for initial generation',
                             metavar='DIR')
        self.settings.bytesize(['file-size'], 'size of files to create',
                               default=16*1024)
        self.settings.string(['obnam-branch'],
                             'bzr branch from which to run obnam '
                             '(default is installed obnam)')
        self.settings.string(['larch-branch'],
                             'bzr branch from which to use larch '
                             '(default is installed larch)')
        self.settings.string(['obnam-profile'],
                             'store Python profiling output '
                             'in files named after NAMEPATTERN '
                             '(no profiling, unless set); '
                             '%(foo)s in pattern gets filled '
                             'in, where foo is op (for '
                             'backup/restore/etc), gen, or '
                             'order (cumulative/time)',
                             metavar='NAMEPATTERN', default='')
        self.settings.string(['encrypt-with'],
                             'encrypt backups with KEYID',
                             metavar='KEYID')
        self.settings.boolean(['drop-caches'],
                              'clear Linux kernel cache before '
                              'running commands (will ask for sudo pasword')
        self.settings.boolean(['use-sftp-repository'],
                              'access backup repository over the '
                              'network via sftp')
        self.settings.boolean(['use-sftp-root'],
                              'access backup repository over the '
                              'network via sftp')
        self.settings.integer(['sftp-delay'],
                              'add an artificial delay (in milliseconds) to '
                              'all SFTP transfers')
        self.settings.boolean(['verify'], 'verify restored data')
        self.settings.boolean(['benchmark-fsck'], 'benchmark fsck?')
        self.settings.string(['obnam-config'],
                             'use FILE as an Obnam configuration file',
                             metavar='FILE')

    def process_args(self, args):
        '''Run the whole benchmark (cliapp entry point).'''
        progname = self.settings['program']
        logging.info('Benchmarking: %s' % progname)
        generations = self.settings['generations']
        self.tempdir = tempfile.mkdtemp()
        logging.info('tempdir: %s' % self.tempdir)
        self.live_data = os.path.join(self.tempdir, 'data')
        self.repo = os.path.join(self.tempdir, 'repo')
        # Optionally access live data and/or repository via sftp to
        # localhost, to include network filesystem overhead.
        if self.settings['use-sftp-root']:
            live_url = 'sftp://localhost%s' % self.live_data
        else:
            live_url = self.live_data
        if self.settings['use-sftp-repository']:
            repo_url = 'sftp://localhost%s' % self.repo
        else:
            repo_url = self.repo
        prog = self.factory.new(progname, live_data=live_url,
                                repo=repo_url, settings=self.settings)
        prog.prepare()
        self.report = Report(prog)
        # Initial generation: either copy existing data, or generate
        # synthetic live data of the requested size.
        if self.settings['use-existing']:
            print 'Copying existing data to %s' % self.live_data
            self.runcmd(['cp', '-a', self.settings['use-existing'],
                         self.live_data])
            amount = self.file_sizes(self.live_data)
        else:
            amount = self.settings['initial-data']
            self.generate_live_data(self.live_data, amount)
        if self.settings['verify']:
            self.summain(self.live_data, 'backup-0.summain')
        self.measure(prog.backup, 0, amount)
        if self.settings['benchmark-fsck']:
            self.measure(prog.fsck, 0, amount)
        self.measure(prog.verify, 0, amount)
        # Incremental generations: grow the live data, then back up,
        # optionally fsck, and verify each one.
        for i in range(1, generations):
            self.generate_live_data(self.live_data,
                                    self.settings['incremental-data'])
            if self.settings['verify']:
                self.summain(self.live_data, 'backup-%d.summain' % i)
            self.measure(prog.backup, i,
                         self.settings['incremental-data'])
            if self.settings['benchmark-fsck']:
                self.measure(prog.fsck, i,
                             self.settings['incremental-data'])
            self.measure(prog.verify, i,
                         self.settings['incremental-data'])
        for i in range(generations):
            self.measure(prog.list_files, i, 0)
        for i in range(generations):
            target_dir = os.path.join(self.tempdir, 'restored')
            os.mkdir(target_dir)
            self.measure(prog.restore, i, 0, target_dir=target_dir)
            if self.settings['verify']:
                self.verify(target_dir, i)
            shutil.rmtree(target_dir)
        for i in range(generations):
            # Since we remove oldest first, we always remove the 0th
            # generation, not the ith one.
            self.measure(prog.forget, i, 0, gen=0)
        self.report.format(self.output)

    def generate_live_data(self, where, size):
        '''Add size bytes of synthetic live data under where.'''
        logging.info('Generating %d bytes live data' % size)
        runcmd(['genbackupdata', where, '--create', str(size),
                '--file-size', str(self.settings['file-size'])])

    def summain(self, dirname, basename):
        '''Remember state of dirname at this time.

        This runs the summain(1) utility against dirname and stores
        the result in a file called basename in the temporary
        directory.

        '''
        # We exclude mtime from summain output, because there are very
        # small time differences that I choose to ignore at this time.
        self.runcmd(['summain', '--relative', '--exclude=mtime',
                     '--output', os.path.join(self.tempdir, basename),
                     dirname])

    def verify(self, dirname, generation):
        '''Verify that the generation was restored correctly.'''
        # The restore re-creates the live data path under target_dir;
        # compare its summain manifest with the pre-backup one.
        root = os.path.join(dirname, './' + self.live_data)
        self.summain(root, 'restored-%d.summain' % generation)
        orig = os.path.join(self.tempdir,
                            'backup-%d.summain' % generation)
        rest = os.path.join(self.tempdir,
                            'restored-%d.summain' % generation)
        # diff exits non-zero on differences, which makes runcmd fail.
        self.runcmd(['diff', '-u', orig, rest])

    def file_sizes(self, dirname):
        '''Return total size in bytes of all regular files in dirname.'''
        bytes = 0
        for dirname, subdirs, basenames in os.walk(dirname):
            for filename in [os.path.join(dirname, x)
                             for x in basenames]:
                if os.path.isfile(filename):
                    bytes += os.path.getsize(filename)
        return bytes

    def measure(self, func, nth_gen, new_data, **kwargs):
        '''Run func(nth_gen, **kwargs) and record its Measurement.

        new_data is the number of live-data bytes added before this
        operation; the repository size afterwards is recorded too.
        '''
        logging.info('Measuring %s gen %d' % (func.__name__, nth_gen))
        print 'Measuring %s generation %s' % (func.__name__, nth_gen)
        if self.settings['drop-caches']:
            drop_caches()
        measurement = func(nth_gen, **kwargs)
        measurement.new_data = new_data
        measurement.repo_size_after = self.disk_usage(self.repo)
        self.report.add_measurement(func.__name__, nth_gen, measurement)

    def cleanup(self):
        '''Remove the temporary directory (cliapp cleanup hook).'''
        if os.path.exists(self.tempdir):
            logging.info('Removing temporary directory %s' % self.tempdir)
            shutil.rmtree(self.tempdir)

    def disk_usage(self, pathname):
        '''Simulate du(1).

        Return disk usage in bytes.'''
        def getsize(filename):
            # Count allocated blocks (like du), not apparent size;
            # st_blocks is in 512-byte units.
            return os.lstat(filename).st_blocks * 512
        bytes = 0
        for dirname, subdirs, basenames in os.walk(pathname):
            bytes += getsize(dirname)
            bytes += sum(getsize(os.path.join(dirname, x))
                         for x in basenames)
        return bytes


if __name__ == '__main__':
    Seivot(version=__version__).run()