From 0a52be884af28f8564ecbe0caa544f732040eb10 Mon Sep 17 00:00:00 2001
From: Lars Wirzenius
Date: Fri, 12 Feb 2016 11:44:10 +0200
Subject: Add a manual and yarn test suite

---
 manual.yarn | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 manual.yarn

diff --git a/manual.yarn b/manual.yarn
new file mode 100644
index 0000000..c117520
--- /dev/null
+++ b/manual.yarn
@@ -0,0 +1,174 @@
+---
+title: genbackupdata---generate data for backups
+author: Lars Wirzenius
+date: SEE GIT
+...
+
+
+# Introduction
+
+`genbackupdata` is a utility for generating data for testing backup
+software, specifically [Obnam][]. It is particularly intended for
+generating reproducible synthetic data for benchmarking. It is often
+desirable for a benchmark to be run by multiple parties, such that
+all but one variable is controlled for: for example, comparing the
+same version of the backup software on two different computers. This
+requires the benchmark data to be the same as well.
+
+[Obnam]: http://obnam.org/
+
+Any reasonable benchmarking will require a lot of data, and sharing
+that is expensive. Thus, genbackupdata could be considered a
+specialised compression tool: if two parties run the same version of
+genbackupdata with the same arguments, the output should be bitwise
+identical. In this way, a program of a few tens of kilobytes of
+source code can replace a data set of any size.
+
+The data genbackupdata produces is random binary junk, generated
+using the RC4 algorithm. It is meant to be uncompressible and
+non-repeating, to be a worst case for backup software like Obnam.
+
+In addition to just generating data, genbackupdata generates a
+directory tree, with files of a desired size. It turns out that
+backup software has a per-file cost, and thus backing up a single
+one-gigabyte file is likely to be less expensive than backing up a
+billion one-byte files.
+
+genbackupdata is about the simplest possible implementation of these
+ideas. It could be improved in many ways, such as by producing
+different kinds of data (text in various languages, completely or
+partially duplicated files) or file sizes drawn from a suitable
+statistical distribution. However, it suffices for Obnam development,
+and thus the author has no incentive to develop it further. If
+someone wants to take over and make the software more versatile, they
+should feel free to do so.
+
+## About this manual
+
+This manual gives an overview of how genbackupdata can be used. For
+detailed usage information, please see the manual page or the output
+of `genbackupdata --help`.
+
+The other purpose of this manual is to act as an automated
+integration test suite for genbackupdata: running the manual source
+code through the [yarn][] tool executes the tests.
+
+[yarn]: http://liw.fi/cmdtest/
+
+
+# Simple usage
+
+The simplest way to use genbackupdata is to tell it to generate the
+desired amount of data. The amount is given with the `--create`
+option, which takes an argument giving the size in bytes.
+
+    SCENARIO generate some data
+    WHEN user runs genbackupdata --create=100 foo
+    THEN directory foo contains 100 bytes in files
+
+The `--create` size may also be given in bigger units (kilobytes,
+megabytes, etc.), using suffixes such as `k` for kilobyte (1000
+bytes).
+
+    WHEN user runs genbackupdata --create=100k bar
+    THEN directory bar contains 100000 bytes in files
+
+Further, the data is mostly uncompressible.
+
+    AND directory bar is about 100000 bytes when compressed
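+
+The data is also reproducible: as the introduction explains, it is
+generated using the RC4 algorithm, so the randomness comes from a
+deterministic keystream. To illustrate how such a keystream can be
+both repeatable and uncompressible, here is a rough sketch of an RC4
+keystream generator. The sketch is written for this manual only: it
+is not taken from genbackupdata's source code, and the key in it is
+made up.
+
+    # Illustration only; this block is not a yarn scenario.
+    def rc4_stream(key):
+        # Key scheduling: mix the key into a 256-entry state table.
+        state = list(range(256))
+        j = 0
+        for i in range(256):
+            j = (j + state[i] + ord(key[i % len(key)])) % 256
+            state[i], state[j] = state[j], state[i]
+        # Output generation: keep permuting the table and yield one
+        # byte value (0..255) at a time, forever.
+        i = j = 0
+        while True:
+            i = (i + 1) % 256
+            j = (j + state[i]) % 256
+            state[i], state[j] = state[j], state[i]
+            yield state[(state[i] + state[j]) % 256]
+
+    # With a fixed key, every run yields exactly the same bytes.
+    stream = rc4_stream('example key, not the one genbackupdata uses')
+    first_kilobyte = [next(stream) for _ in range(1024)]
+
+Two parties who start from the same key and draw the same amount of
+data from the stream therefore get bitwise identical output, which is
+the property the benchmarks described in the introduction rely on.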
+
+
+# Multiple runs
+
+Every run of genbackupdata produces the same sequence of random
+bytes, so running it twice with the same arguments produces the same
+data twice. Since genbackupdata does not overwrite existing files, a
+second run into the same directory adds a duplicate copy of every
+file, and the resulting data set is now highly compressible.
+
+    SCENARIO run genbackupdata twice
+    WHEN user runs genbackupdata --create=100k foo
+    AND user runs genbackupdata --create=100k foo
+    THEN directory foo contains 200000 bytes in files
+    AND all files in foo are duplicates
+
+
+# Control file size
+
+The maximum size of the output files can be specified with the
+`--file-size` option. This allows the user to generate a single, very
+large file, or a large number of small files.
+
+    SCENARIO control file size
+    WHEN user runs genbackupdata --create=100k --file-size=1m bigfile
+    THEN directory bigfile contains 1 file
+
+    WHEN user runs genbackupdata --create=1000 --file-size=1 manyfiles
+    THEN directory manyfiles contains 1000 files
+
+
+# Appendix: scenario step implementations
+
+This chapter implements the various scenario steps used in this
+manual.
+
+    IMPLEMENTS WHEN user runs genbackupdata --create=(\S+) (.+)
+    import os
+    import cliapp
+    size = os.environ['MATCH_1']
+    args = os.environ['MATCH_2'].split()
+    opts = args[:-1]
+    dirname = os.path.join(os.environ['DATADIR'], args[-1])
+    bin = os.path.join(os.environ['SRCDIR'], 'genbackupdata')
+    cliapp.runcmd([bin, '--create', size] + opts + [dirname])
+
+    IMPLEMENTS THEN directory (\S+) contains (\d+) bytes in files
+    import os
+    root = os.path.join(os.environ['DATADIR'], os.environ['MATCH_1'])
+    wanted_bytes = int(os.environ['MATCH_2'])
+    total_bytes = 0
+    for dirname, subdirs, filenames in os.walk(root):
+        for filename in filenames:
+            pathname = os.path.join(dirname, filename)
+            print pathname, os.path.getsize(pathname)
+            total_bytes += os.path.getsize(pathname)
+    assert wanted_bytes == total_bytes, \
+        '%s != %s' % (wanted_bytes, total_bytes)
+
+    IMPLEMENTS THEN directory (\S+) is about (\d+) bytes when compressed
+    import os
+    import zlib
+    root = os.path.join(os.environ['DATADIR'], os.environ['MATCH_1'])
+    wanted_bytes = int(os.environ['MATCH_2'])
+    data = ''
+    for dirname, subdirs, filenames in os.walk(root):
+        for filename in filenames:
+            pathname = os.path.join(dirname, filename)
+            with open(pathname) as f:
+                data += f.read()
+    compressed = zlib.compress(data)
+    size_delta = len(compressed) - wanted_bytes
+    print 'data:', len(data)
+    print 'compressed:', len(compressed)
+    print 'size_delta:', size_delta
+    assert abs(size_delta) < 1000
+
+    IMPLEMENTS THEN all files in (\S+) are duplicates
+    import collections
+    import os
+    root = os.path.join(os.environ['DATADIR'], os.environ['MATCH_1'])
+    files = collections.Counter()
+    for dirname, subdirs, filenames in os.walk(root):
+        for filename in filenames:
+            pathname = os.path.join(dirname, filename)
+            with open(pathname) as f:
+                data = f.read()
+            files[data] += 1
+    for data in files:
+        assert files[data] == 2
+
+    IMPLEMENTS THEN directory (\S+) contains (\d+) files?
+    import os
+    root = os.path.join(os.environ['DATADIR'], os.environ['MATCH_1'])
+    wanted_count = int(os.environ['MATCH_2'])
+    file_count = 0
+    for dirname, subdirs, filenames in os.walk(root):
+        file_count += len(filenames)
+    assert file_count == wanted_count
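+
+
+# Appendix: checking reproducibility by hand
+
+The "Multiple runs" scenario above checks reproducibility indirectly,
+by verifying that two runs into the same directory leave every file
+duplicated. A reader who wants to compare two separate output trees
+can do so outside of yarn by hashing both trees in a fixed order. The
+following sketch is not part of the test suite, and the directory
+names in it are made up for the example.
+
+    # Illustration only; this block is not a yarn scenario.
+    import hashlib
+    import os
+
+    def tree_digest(root):
+        # Hash each file's relative path and contents, visiting the
+        # tree in sorted order so the result is deterministic.
+        digest = hashlib.sha1()
+        for dirname, subdirs, filenames in os.walk(root):
+            subdirs.sort()
+            for filename in sorted(filenames):
+                pathname = os.path.join(dirname, filename)
+                digest.update(os.path.relpath(pathname, root))
+                with open(pathname, 'rb') as f:
+                    digest.update(f.read())
+        return digest.hexdigest()
+
+    # run-one and run-two stand for two directories created by
+    # identical genbackupdata invocations.
+    print tree_digest('run-one') == tree_digest('run-two')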