#!/usr/bin/python
# Copyright 2011 Lars Wirzenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import cliapp
import ConfigParser
import logging
import os
import re
import shutil
import subprocess
import tempfile
import time
# Version string reported by cliapp's --version option (see main guard).
__version__ = '1.18'
class Measurement(object):

    '''Resource usage measured for one benchmark operation.

    Parses the output written by /usr/bin/time with the format
    '%U\n%S\n%e\n%M' (one float per line) into the user, system,
    real (seconds) and maxrss attributes.  The repository statistics
    are initialised to zero; callers fill them in afterwards.
    '''

    def __init__(self, time_output):
        values = [float(line) for line in time_output.splitlines()]
        for index, attr in enumerate(['user', 'system', 'real', 'maxrss']):
            setattr(self, attr, values[index])
        # Filled in later by the benchmark driver.
        for attr in ['new_data', 'repo_size_after', 'repo_bytes_written',
                     'repo_bytes_read', 'repo_roundtrips']:
            setattr(self, attr, 0)
def runcmd(argv, **kwargs):
    '''Run argv under /usr/bin/time and return (Measurement, stdout).

    kwargs are passed straight to subprocess.Popen.  stdout is None
    unless the caller passed stdout=subprocess.PIPE.  Raises
    cliapp.AppException if the command exits non-zero; note that the
    stderr text in the message is None unless the caller piped stderr.

    Fix over the original: the temporary timings file and its file
    descriptor are now released even if Popen or communicate raises.
    '''
    logging.debug('run: %s %s' % (argv, kwargs))
    # /usr/bin/time writes its measurements into this temporary file;
    # we read them back through our own fd, which starts at offset 0.
    fd, timings = tempfile.mkstemp()
    try:
        time_argv = ['/usr/bin/time',
                     '-o', timings,
                     '--format', '%U\n%S\n%e\n%M']
        p = subprocess.Popen(time_argv + argv, **kwargs)
        out, err = p.communicate()
        data = os.read(fd, 1024**2)
    finally:
        os.close(fd)
        os.remove(timings)
    if p.returncode != 0:
        raise cliapp.AppException('command failed: %s\n%s' % (argv, err))
    return Measurement(data), out
# Flush the Linux kernel buffer and inode caches so each benchmark run
# starts cold.  See http://linux-mm.org/Drop_Caches for details.
def drop_caches():
    logging.debug('clearing Linux kernel cache')
    # Writing "3" to drop_caches clears pagecache, dentries and inodes.
    # tee is run via sudo because the file is root-only.
    argv = ['sudo', '-p', 'Password (for clearing cache): ',
            'tee', '/proc/sys/vm/drop_caches']
    p = subprocess.Popen(argv, stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE)
    p.communicate('%s\n' % 3)
    if p.returncode != 0:
        raise cliapp.AppException('failed to clear cache')
class BackupProgram(object):

    '''Abstract interface for a backup program being benchmarked.

    Subclasses set the name class attribute and implement the
    operation methods; all of them are no-ops here.
    '''

    # Short identifier for the program; subclasses must override.
    name = None

    def __init__(self, live_data, repo, settings):
        self.live_data = live_data
        self.repo = repo
        self.settings = settings

    def set_meta(self, cp):
        '''Fill in program-specific [meta] fields in the report.

        For example, a subclass running from a version control branch
        might record the revision id here.
        '''

    def prepare(self):
        '''Get the program ready to run, e.g. by compiling it.'''

    def backup(self, nth_gen):
        '''Back up live_data into repo as a new generation.'''

    def fsck(self, nth_gen):
        '''Check repository integrity after the nth backup generation.'''

    def verify(self, nth_gen):
        '''Verify the repository after the nth backup generation.'''

    def list_files(self, nth_gen):
        '''List every file in a generation, writing to /dev/null.'''

    def restore(self, nth_gen, target_dir):
        '''Restore every file in a generation into target_dir.'''

    def forget(self, nth_gen):
        '''Remove a given generation from the repository.'''
class Obnam(BackupProgram):

    '''Benchmark the obnam backup program.

    Runs either the installed obnam or ./obnam from a bzr branch given
    with --obnam-branch, optionally with larch from --larch-branch on
    PYTHONPATH.  Repository I/O statistics are scraped from obnam's
    log file after each run.
    '''

    name = 'obnam'

    @property
    def _cmd(self):
        '''Command to invoke: ./obnam when running from a branch.'''
        if self.settings['obnam-branch']:
            return './obnam'
        else:
            return 'obnam'

    @property
    def _branch(self):
        '''Obnam bzr branch directory, or None for the installed obnam.'''
        return self.settings['obnam-branch'] or None

    @property
    def _revno(self):
        '''bzr revision number of the obnam branch.'''
        timings, revno = runcmd(['bzr', 'revno'], cwd=self._branch,
                                stdout=subprocess.PIPE)
        return revno.strip()

    @property
    def _larch_branch(self):
        '''Larch bzr branch directory, or None for the installed larch.'''
        return self.settings['larch-branch'] or None

    @property
    def _larch_revno(self):
        '''bzr revision number of the larch branch.'''
        timings, revno = runcmd(['bzr', 'revno'], cwd=self._larch_branch,
                                stdout=subprocess.PIPE)
        return revno.strip()

    def _run(self, args, nth_gen, **kwargs):
        '''Run one obnam operation, measured by runcmd.

        args is the obnam subcommand plus its arguments; nth_gen is
        used only to name profiling output.  Returns whatever runcmd
        returns, with repository I/O statistics filled in on the
        Measurement.
        '''
        fd, logfile = tempfile.mkstemp()
        os.close(fd)
        cmd = [self._cmd,
               '--no-default-configs',
               '--log', logfile,
               '--repository', self.repo,
               '--sftp-delay', str(self.settings['sftp-delay']),
               '--leave-checkpoints',
               '--weak-random']
        if self.settings['obnam-config']:
            cmd.extend(['--config', self.settings['obnam-config']])
        env = dict(os.environ)
        if self.settings['obnam-profile']:
            # Pattern may use %(op)s and %(gen)s to name profile files.
            namepattern = {
                'gen': str(nth_gen),
                'op': args[0],
            }
            env['OBNAM_PROFILE'] = self.settings['obnam-profile'] % namepattern
        if self._larch_branch:
            # Prefer larch from the given branch over any installed copy.
            old = env.get('PYTHONPATH')
            if old:
                new = '%s:%s' % (self._larch_branch, old)
            else:
                new = self._larch_branch
            env['PYTHONPATH'] = new
        if self.settings['encrypt-with']:
            cmd += ['--encrypt-with', self.settings['encrypt-with']]
        result = runcmd(cmd + args, cwd=self._branch, env=env, **kwargs)
        self.extract_repository_io(result[0], logfile)
        os.remove(logfile)
        return result

    def extract_repository_io(self, measurement, logfile):
        '''Scrape repository I/O statistics from obnam's log file.

        Sets repo_bytes_written, repo_bytes_read and repo_roundtrips
        on measurement from the VFS log lines mentioning self.repo.
        '''
        # The patterns must declare the named groups read by the
        # m.group(...) calls below; the originals lacked the names
        # and would not even compile.
        pat = re.compile(r' VFS:( __del__:)? baseurl=.* '
                         r'read=(?P<read>\d+) written=(?P<written>\d+)')
        pat2 = re.compile(r' VFS: baseurl=.* roundtrips=(?P<n>\d+)')
        f = open(logfile)
        for line in f:
            m = pat.search(line)
            m2 = pat2.search(line)
            if m and self.repo in line:
                # int() auto-promotes to long in Python 2 when needed.
                measurement.repo_bytes_written = int(m.group('written'))
                measurement.repo_bytes_read = int(m.group('read'))
            elif m2 and self.repo in line:
                measurement.repo_roundtrips = int(m2.group('n'))
        f.close()

    def prepare(self):
        '''Build obnam inside its branch, if a branch was given.'''
        if self._branch:
            if os.path.exists(os.path.join(self._branch, 'Makefile')):
                logging.info('Building obnam in %s with make' % self._branch)
                runcmd(['make'], cwd=self._branch)
            else:
                logging.info('Building obnam in %s with setup.py' %
                             self._branch)
                # Build in the branch directory, not wherever seivot
                # happens to be running from (the original forgot cwd).
                runcmd(['python', 'setup.py', 'build_ext', '-i'],
                       cwd=self._branch)

    def backup(self, nth_gen):
        return self._run(['backup', self.live_data], nth_gen)[0]

    def fsck(self, nth_gen):
        return self._run(['fsck'], nth_gen)[0]

    def verify(self, nth_gen):
        return self._run(['verify', self.live_data], nth_gen)[0]

    def _genid(self, nth_gen):
        '''Return the generation id of the nth generation (0-based).'''
        timings, out = self._run(['genids'], nth_gen, stdout=subprocess.PIPE)
        return out.splitlines()[nth_gen]

    def list_files(self, nth_gen):
        devnull = os.open('/dev/null', os.O_WRONLY)
        timings, out = self._run(['ls', self._genid(nth_gen)], nth_gen,
                                 stdout=devnull)
        os.close(devnull)
        return timings

    def restore(self, nth_gen, target_dir):
        return self._run(['restore', '--to', target_dir,
                          '--generation', self._genid(nth_gen)], nth_gen)[0]

    def forget(self, nth_gen, gen):
        # nth_gen counts the forget operation; gen picks which
        # generation to remove (the caller always removes the oldest).
        return self._run(['forget', self._genid(gen)], nth_gen)[0]

    def set_meta(self, cp):
        if self._branch:
            cp.set('meta', 'obnam-branch', self._branch)
            cp.set('meta', 'obnam-branch-nick', os.path.basename(self._branch))
            cp.set('meta', 'revision', self._revno)
        if self._larch_branch:
            cp.set('meta', 'larch-revision', self._larch_revno)
        cp.set('meta', 'sftp-delay', str(self.settings['sftp-delay']))
class BackupProgramFactory(object):

    '''Look up and instantiate BackupProgram subclasses by name.'''

    # All benchmarkable programs known to seivot.
    programs = [Obnam]

    def names(self):
        '''Return the names of all known backup programs.'''
        return [program.name for program in self.programs]

    def new(self, name, **kwargs):
        '''Instantiate the program called name, or return None.'''
        for program in self.programs:
            if program.name == name:
                return program(**kwargs)
        return None
class Report(object):

    '''Collect benchmark measurements and write them out as an INI file.

    Measurements are stored per operation name and per generation
    number; format() serialises everything via ConfigParser.
    '''

    def __init__(self, program):
        self.program = program
        # Maps operation name -> {generation number -> Measurement}.
        self.measurements = dict()

    @property
    def generations(self):
        '''Set of every generation number with at least one measurement.'''
        result = set()
        for per_gen in self.measurements.values():
            result.update(per_gen)
        return result

    @property
    def operations(self):
        '''All operation names that have measurements.'''
        return self.measurements.keys()

    def add_measurement(self, op, gen, measurement):
        self.measurements.setdefault(op, dict())[gen] = measurement

    def get_measurement(self, op, gen):
        return self.measurements[op][gen]

    def format(self, fp):
        '''Write the report to the open file fp in INI format.'''
        cp = ConfigParser.ConfigParser()
        cp.add_section('meta')
        cp.set('meta', 'program', self.program.name)
        encrypted = 'yes' if self.program.settings['encrypt-with'] else 'no'
        cp.set('meta', 'encrypted', encrypted)
        # These are optional, purely descriptive settings.
        for key in ['description', 'profile-name']:
            if self.program.settings[key]:
                cp.set('meta', key, self.program.settings[key])
        self.program.set_meta(cp)
        for gen in self.generations:
            section = str(gen)
            cp.add_section(section)
            for op in self.operations:
                m = self.get_measurement(op, gen)
                for field in ['user', 'system', 'real', 'maxrss']:
                    cp.set(section, '%s.%s' % (op, field),
                           '%.1f' % getattr(m, field))
                cp.set(section, '%s.new-data' % op, m.new_data)
                cp.set(section, '%s.repo-size-after' % op, m.repo_size_after)
                cp.set(section, '%s.repo-bytes-written' % op,
                       m.repo_bytes_written)
                cp.set(section, '%s.repo-bytes-read' % op,
                       m.repo_bytes_read)
                cp.set(section, '%s.repo-roundtrips' % op,
                       m.repo_roundtrips)
        cp.write(fp)
class Seivot(cliapp.Application):
def add_settings(self):
self.factory = BackupProgramFactory()
self.settings.choice(['program'], self.factory.names(),
'program to benchmark (%default)')
self.settings.string(['description'],
'free-form description of this backup run',
metavar='TEXT')
self.settings.string(['profile-name'],
'name of backup use-case profile name '
'(for documentation purposes only)')
self.settings.integer(['generations'],
'total number of generations to '
'measure (%default)',
metavar='COUNT',
default=5)
self.settings.bytesize(['initial-data'],
'size of initial live data (%default)',
metavar='SIZE',
default=1024)
self.settings.bytesize(['incremental-data'],
'add SIZE live data for '
'additional generations '
'(%default)',
metavar='SIZE',
default=1024)
self.settings.string(['use-existing'],
'use exiting DIR for initial generation',
metavar='DIR')
self.settings.bytesize(['file-size'], 'size of files to create',
default=16*1024)
self.settings.string(['obnam-branch'],
'bzr branch from which to run obnam '
'(default is installed obnam)')
self.settings.string(['larch-branch'],
'bzr branch from which to use larch '
'(default is installed larch)')
self.settings.string(['obnam-profile'],
'store Python profiling output '
'in files named after NAMEPATTERN '
'(no profiling, unless set); '
'%(foo)s in pattern gets filled '
'in, where foo is op (for '
'backup/restore/etc), gen, or '
'order (cumulative/time)',
metavar='NAMEPATTERN',
default='')
self.settings.string(['encrypt-with'],
'encrypt backups with KEYID',
metavar='KEYID')
self.settings.boolean(['drop-caches'],
'clear Linux kernel cache before '
'running commands (will ask for sudo pasword')
self.settings.boolean(['use-sftp-repository'],
'access backup repository over the '
'network via sftp')
self.settings.boolean(['use-sftp-root'],
'access backup repository over the '
'network via sftp')
self.settings.integer(['sftp-delay'],
'add an artificial delay (in milliseconds) to '
'all SFTP transfers')
self.settings.boolean(['verify'], 'verify restored data')
self.settings.boolean(['benchmark-fsck'], 'benchmark fsck?')
self.settings.string(['obnam-config'],
'use FILE as an Obnam configuration file',
metavar='FILE')
def process_args(self, args):
progname = self.settings['program']
logging.info('Benchmarking: %s' % progname)
generations = self.settings['generations']
self.tempdir = tempfile.mkdtemp()
logging.info('tempdir: %s' % self.tempdir)
self.live_data = os.path.join(self.tempdir, 'data')
self.repo = os.path.join(self.tempdir, 'repo')
if self.settings['use-sftp-root']:
live_url = 'sftp://localhost%s' % self.live_data
else:
live_url = self.live_data
if self.settings['use-sftp-repository']:
repo_url = 'sftp://localhost%s' % self.repo
else:
repo_url = self.repo
prog = self.factory.new(progname, live_data=live_url,
repo=repo_url,
settings=self.settings)
prog.prepare()
self.report = Report(prog)
if self.settings['use-existing']:
print 'Copying existing data to %s' % self.live_data
self.runcmd(['cp', '-a', self.settings['use-existing'],
self.live_data])
amount = self.file_sizes(self.live_data)
else:
amount = self.settings['initial-data']
self.generate_live_data(self.live_data, amount)
if self.settings['verify']:
self.summain(self.live_data, 'backup-0.summain')
self.measure(prog.backup, 0, amount)
if self.settings['benchmark-fsck']:
self.measure(prog.fsck, 0, amount)
self.measure(prog.verify, 0, amount)
for i in range(1, generations):
self.generate_live_data(self.live_data,
self.settings['incremental-data'])
if self.settings['verify']:
self.summain(self.live_data, 'backup-%d.summain' % i)
self.measure(prog.backup, i, self.settings['incremental-data'])
if self.settings['benchmark-fsck']:
self.measure(prog.fsck, i, self.settings['incremental-data'])
self.measure(prog.verify, i, self.settings['incremental-data'])
for i in range(generations):
self.measure(prog.list_files, i, 0)
for i in range(generations):
target_dir = os.path.join(self.tempdir, 'restored')
os.mkdir(target_dir)
self.measure(prog.restore, i, 0, target_dir=target_dir)
if self.settings['verify']:
self.verify(target_dir, i)
shutil.rmtree(target_dir)
for i in range(generations):
# Since we remove oldest first, we always remove the 0th
# generation, not the ith one.
self.measure(prog.forget, i, 0, gen=0)
self.report.format(self.output)
def generate_live_data(self, where, size):
logging.info('Generating %d bytes live data' % size)
runcmd(['genbackupdata', where, '--create', str(size),
'--file-size', str(self.settings['file-size'])])
def summain(self, dirname, basename):
'''Remember state of dirname at this time.
This runs the summain(1) utility against dirname and stores
the result in a file called basename in the temporary directory.
'''
# We exclude mtime from summain output, because there are very
# small time differences that I choose to ignore at this time.
self.runcmd(['summain', '--relative', '--exclude=mtime', '--output',
os.path.join(self.tempdir, basename), dirname])
def verify(self, dirname, generation):
'''Verify that the generation was restored correctly.'''
root = os.path.join(dirname, './' + self.live_data)
self.summain(root, 'restored-%d.summain' % generation)
orig = os.path.join(self.tempdir, 'backup-%d.summain' % generation)
rest = os.path.join(self.tempdir, 'restored-%d.summain' % generation)
self.runcmd(['diff', '-u', orig, rest])
def file_sizes(self, dirname):
bytes = 0
for dirname, subdirs, basenames in os.walk(dirname):
for filename in [os.path.join(dirname, x) for x in basenames]:
if os.path.isfile(filename):
bytes += os.path.getsize(filename)
return bytes
def measure(self, func, nth_gen, new_data, **kwargs):
logging.info('Measuring %s gen %d' % (func.__name__, nth_gen))
print 'Measuring %s generation %s' % (func.__name__, nth_gen)
if self.settings['drop-caches']:
drop_caches()
measurement = func(nth_gen, **kwargs)
measurement.new_data = new_data
measurement.repo_size_after = self.disk_usage(self.repo)
self.report.add_measurement(func.__name__, nth_gen, measurement)
def cleanup(self):
if os.path.exists(self.tempdir):
logging.info('Removing temporary directory %s' % self.tempdir)
shutil.rmtree(self.tempdir)
def disk_usage(self, pathname):
'''Simulate du(1). Return disk usage in bytes.'''
def getsize(filename):
return os.lstat(filename).st_blocks * 512
bytes = 0
for dirname, subdirs, basenames in os.walk(pathname):
bytes += getsize(dirname)
bytes += sum(getsize(os.path.join(dirname, x)) for x in basenames)
return bytes
# Script entry point: cliapp parses options and reports __version__.
if __name__ == '__main__':
    Seivot(version=__version__).run()