diff options
Diffstat (limited to 'genbackupdata.py')
-rw-r--r-- | genbackupdata.py | 660 |
1 files changed, 0 insertions, 660 deletions
# Copyright (C) 2007  Lars Wirzenius <liw@iki.fi>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.


"""Generate backup test data

The code is written to run unchanged on Python 2.6+ and Python 3.

"""


import hashlib
import optparse
import os
import random
import sys


KiB = 2 ** 10   # A kibibyte
MiB = 2 ** 20   # A mebibyte
GiB = 2 ** 30   # A gibibyte
TiB = 2 ** 40   # A tebibyte

# Defaults for various settings in the BackupData class.
DEFAULT_SEED = 0
DEFAULT_BINARY_CHUNK_SIZE = KiB
DEFAULT_TEXT_FILE_SIZE = 10 * KiB
DEFAULT_BINARY_FILE_SIZE = 10 * MiB
DEFAULT_TEXT_DATA_PERCENTAGE = 10.0
DEFAULT_MAX_FILES_PER_DIRECTORY = 256
DEFAULT_MODIFY_PERCENTAGE = 10

# Random filler text for generating text data.
if True: #pragma: no cover:
    LOREM_IPSUM = """
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim
veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea
commodo consequat. Duis aute irure dolor in reprehenderit in voluptate
velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint
occaecat cupidatat non proident, sunt in culpa qui officia deserunt
mollit anim id est laborum.
"""


def _as_bytes(data):
    """Return DATA as a byte string, encoding text as ASCII if needed

    All file contents are written in binary mode so that the number of
    bytes on disk is exactly the number of bytes generated, regardless
    of platform newline translation.

    """

    if isinstance(data, bytes):
        return data
    return data.encode("ascii")


class BackupData:

    """This class represents the directory with backup data"""

    def __init__(self):
        self._dirname = None
        self._seed = 0
        self._prng = None
        self._chunk_size = DEFAULT_BINARY_CHUNK_SIZE
        self._text_file_size = DEFAULT_TEXT_FILE_SIZE
        self._binary_file_size = DEFAULT_BINARY_FILE_SIZE
        self._text_data_percentage = DEFAULT_TEXT_DATA_PERCENTAGE
        self._max_files_per_directory = DEFAULT_MAX_FILES_PER_DIRECTORY
        self._modify_percentage = DEFAULT_MODIFY_PERCENTAGE
        self._preexisting_file_count = 0
        self._preexisting_data_size = 0
        self._filename_counter = 0
        self._current_dir_no = 0
        self._next_filecount = 0

        # The zlib compression algorithm gives up if it gets a block of
        # 32 KiB bytes it can't find in its dictionary. It completely
        # ignores such a block, meaning that if it is repeated, then
        # it ignores it repeatedly. Most importantly for us, it doesn't
        # compress the repeats, either. Thus, to generate lots of
        # uncompressible binary data, we can generate a blob and repeat
        # that. Thanks to Richard Braakman for the idea.
        self._binary_blob_size = 64 * 1024 # Safety factor of 2
        self._binary_blob = None

    def set_directory(self, dirname):
        """Set the directory to be operated on

        Note that this must be set exactly once. Setting it twice will cause
        an assertion error, and not setting it will cause other errors.

        """

        assert self._dirname is None
        self._dirname = dirname

    def get_directory(self):
        """Return the directory being operated on, or None if not set"""
        return self._dirname

    def create_directory(self):
        """Create the backup data directory, if it doesn't exist already"""
        if not os.path.exists(self._dirname):
            os.mkdir(self._dirname)

    def get_seed(self):
        """Return the initial seed for the pseudo-random number generator"""
        return self._seed

    def set_seed(self, seed):
        """Set the initial seed for the pseudo-random number generator

        The seed will be used when the generator is first initialized.
        It is initialized implicitly as soon as something in this class
        needs randomness (choosing files, or generating binary data).
        Setting the seed after the generator has been initialized
        causes an assertion failure.

        """

        assert self.get_prng() is None
        self._seed = seed

    def get_prng(self):
        """Return reference to the pseudo-random number generator being used

        Return None, if one hasn't been initialized yet.

        """

        return self._prng

    def init_prng(self):
        """Initialize the pseudo-random number generator (using seed)

        Does nothing if the generator already exists.

        """

        if self._prng is None:
            self._prng = random.Random()
            self._prng.seed(self._seed)

    def get_text_file_size(self):
        """Return size of newly created text files"""
        return self._text_file_size

    def set_text_file_size(self, size):
        """Set size of newly created text files"""
        self._text_file_size = size

    def get_binary_file_size(self):
        """Return size of newly created binary files"""
        return self._binary_file_size

    def set_binary_file_size(self, size):
        """Set size of newly created binary files"""
        self._binary_file_size = size

    def get_text_data_percentage(self):
        """Return percentage of text data of new data that gets created"""
        return self._text_data_percentage

    def set_text_data_percentage(self, percent):
        """Set percentage of text data of new data that gets created"""
        self._text_data_percentage = percent

    def get_max_files_per_directory(self):
        """Return current setting of maximum number of files per directory"""
        return self._max_files_per_directory

    def set_max_files_per_directory(self, count):
        """Set maximum number of files per directory"""
        self._max_files_per_directory = count

    def get_preexisting_file_count(self):
        """Return count of files that existed in directory in the beginning"""
        return self._preexisting_file_count

    def set_preexisting_file_count(self, count):
        """Set count of files that existed in directory in the beginning

        This is useful only for unit tests.

        """
        self._preexisting_file_count = count

    def get_preexisting_data_size(self):
        """Return size of data that existed in directory in the beginning"""
        return self._preexisting_data_size

    def set_preexisting_data_size(self, size):
        """Set size of data that existed in directory in the beginning

        This is useful only for unit tests.

        """
        self._preexisting_data_size = size

    def get_relative_file_count(self, percent):
        """Return PERCENT percent of pre-existing file count"""
        return int(0.01 * percent * self.get_preexisting_file_count())

    def get_relative_data_size(self, percent):
        """Return PERCENT percent of pre-existing data"""
        return int(0.01 * percent * self.get_preexisting_data_size())

    def find_preexisting_files(self):
        """Find all the files that exist in the directory right now

        Records their count and total size as the pre-existing file
        count and data size. Both are zero if the directory is missing.

        """

        count = 0
        size = 0
        if os.path.exists(self._dirname):
            for root, dirs, filenames in os.walk(self._dirname):
                count += len(filenames)
                for filename in filenames:
                    size += os.path.getsize(os.path.join(root, filename))
        self.set_preexisting_file_count(count)
        self.set_preexisting_data_size(size)

    def _choose_directory(self):
        """Choose directory in which to create the next file

        A directory that does not exist yet is always acceptable. An
        existing one is reused only while we have created at least one
        file in it ourselves and it is not yet full; otherwise we move
        on to the next directory number.

        """

        while True:
            dirname = os.path.join(self._dirname,
                                   "dir%d" % self._current_dir_no)
            if not os.path.exists(dirname):
                self._next_filecount = 0
                break
            if (self._next_filecount > 0 and
                self._next_filecount < self._max_files_per_directory):
                break
            self._current_dir_no += 1
        return dirname

    def next_filename(self):
        """Choose the name of the next filename

        The file does not currently exist. This is not, however, a guarantee
        that no other process won't create it before we do. Thus, this
        is NOT a secure way to create temporary files. But it's good enough
        for our intended purpose.

        For simplified unit testing, the names are very easily predictable,
        but it is probably a bad idea for external code to rely on this.

        """

        dirname = self._choose_directory()
        while True:
            filename = os.path.join(dirname,
                                    "file%d" % self._filename_counter)
            if not os.path.exists(filename):
                self._next_filecount += 1
                return filename
            self._filename_counter += 1

    def generate_text_data(self, size):
        """Generate SIZE characters of text data

        The text is LOREM_IPSUM, repeated and truncated as needed.

        """

        if size <= len(LOREM_IPSUM):
            return LOREM_IPSUM[:size]
        # divmod keeps the repeat count an integer on Python 3 as well.
        full, rest = divmod(size, len(LOREM_IPSUM))
        return LOREM_IPSUM * full + LOREM_IPSUM[:rest]

    def generate_binary_data_well(self, size):
        """Generate SIZE bytes of more or less random binary junk

        The data is a chain of SHA-1 digests. The randomness comes from
        the seeded pseudo-random number generator (which is initialized
        here if necessary), so the same seed yields the same data.
        Returns a byte string.

        """

        # The following code has had some fine manual fine tuning done
        # to it. This has made it a bit ugly, but faster. On a
        # "Intel(R) Core(TM)2 Duo CPU L9400 @ 1.86GHz", it produces
        # about 25 MB/s.

        self.init_prng()
        getrandbits = self._prng.getrandbits

        chunks = []
        digest = hashlib.sha1()
        chunk_size = len(digest.digest())

        # Only the first few chunks are fed fresh random bytes; the
        # rest are produced by re-hashing with a constant byte, which
        # is cheaper.
        initial_bytes = min(size, 128)
        for i in range(initial_bytes // chunk_size):
            # bytes(bytearray([n])) is a one-byte string on 2.6+ and 3.
            digest.update(bytes(bytearray([getrandbits(8)])))
            chunks.append(digest.digest())

        size -= len(chunks) * chunk_size
        for i in range(size // chunk_size):
            digest.update(b"a")
            chunks.append(digest.digest())

        remainder = size % chunk_size
        if remainder > 0:
            digest.update(bytes(bytearray([getrandbits(8)])))
            chunks.append(digest.digest()[:remainder])

        return b"".join(chunks)

    def generate_binary_data(self, size):
        """Generate SIZE bytes of binary junk.

        This is different from generate_binary_data_well in that
        it makes use of _binary_blob (and generates that if it does
        not yet exist). Returns a byte string.

        """

        if self._binary_blob is None:
            self._binary_blob = self.generate_binary_data_well(
                                    self._binary_blob_size)
        blob = self._binary_blob
        if size <= len(blob):
            return blob[:size]
        full, rest = divmod(size, len(blob))
        return blob * full + blob[:rest]

    def create_subdirectories(self, filename):
        """Create the sub-directories that are needed to create filename"""
        subdir = os.path.dirname(filename)
        if not os.path.exists(subdir):
            os.makedirs(subdir)

    def create_text_file(self, size):
        """Create a new text file of the desired size"""
        filename = self.next_filename()
        self.create_subdirectories(filename)
        with open(filename, "wb") as f:
            f.write(_as_bytes(self.generate_text_data(size)))

    def get_binary_chunk_size(self):
        """Return the size of chunks used when writing binary data"""
        return self._chunk_size

    def set_binary_chunk_size(self, size):
        """Set the size of chunks used when writing binary data"""
        self._chunk_size = size

    def create_binary_file(self, size):
        """Create a new binary file of the desired size"""
        filename = self.next_filename()
        self.create_subdirectories(filename)
        with open(filename, "wb") as f:
            # We write the data in chunks, so as not to keep the entire file
            # contents in memory at a time. Since the size may be very large,
            # we might otherwise run out of swap.
            while size >= self._chunk_size:
                f.write(self.generate_binary_data(self._chunk_size))
                size -= self._chunk_size
            f.write(self.generate_binary_data(size))

    def _create_files_of_a_kind(self, size, file_size, create_one):
        """Create files with create_one

        SIZE is the total amount of data, FILE_SIZE the maximum size of
        one file. NOTE: FILE_SIZE must be positive, otherwise this loop
        never terminates.

        """

        while size > 0:
            this_size = min(size, file_size)
            create_one(this_size)
            size -= this_size

    def create_files(self, size):
        """Create new files, totalling SIZE bytes in size"""
        text_size = int(0.01 * self._text_data_percentage * size)
        bin_size = size - text_size

        self._create_files_of_a_kind(text_size, self.get_text_file_size(),
                                     self.create_text_file)
        self._create_files_of_a_kind(bin_size, self.get_binary_file_size(),
                                     self.create_binary_file)

    def find_files(self):
        """Find all non-directory files in the test data set

        The result is sorted: os.walk yields names in arbitrary order,
        and a stable order is needed for a given seed to pick the same
        files every time.

        """

        files = []
        for root, dirs, filenames in os.walk(self._dirname):
            for filename in filenames:
                files.append(os.path.join(root, filename))
        files.sort()
        return files

    def choose_files_randomly(self, count):
        """Choose COUNT files randomly

        If there are fewer than COUNT files, all of them are returned.

        """

        files = self.find_files()
        if len(files) >= count:
            self.init_prng()
            files = self._prng.sample(files, count)
        return files

    def delete_files(self, count):
        """Delete COUNT files"""
        if os.path.exists(self._dirname):
            for path in self.choose_files_randomly(count):
                os.remove(path)

    def rename_files(self, count):
        """Rename COUNT files to new names"""
        if os.path.exists(self._dirname):
            for path in self.choose_files_randomly(count):
                new_path = self.next_filename()
                self.create_subdirectories(new_path)
                os.rename(path, new_path)

    def link_files(self, count):
        """Create COUNT new filenames that are hard links to existing files"""
        if os.path.exists(self._dirname):
            for path in self.choose_files_randomly(count):
                new_path = self.next_filename()
                self.create_subdirectories(new_path)
                os.link(path, new_path)

    def get_modify_percentage(self):
        """Return how many percent to grow each file with modify_files()"""
        return self._modify_percentage

    def set_modify_percentage(self, percent):
        """Set how many percent to grow each file with modify_files()"""
        self._modify_percentage = percent

    def append_data(self, filename, data):
        """Append data (text or byte string) to a file"""
        with open(filename, "ab") as f:
            f.write(_as_bytes(data))

    def _modify_files_of_a_kind(self, filenames, size, generate_data):
        """Modify files by appending data to them

        Appends SIZE bytes in total, spread over randomly chosen files
        from FILENAMES. Does nothing if FILENAMES is empty.

        """

        if not filenames:
            return
        self.init_prng()
        while size > 0:
            filename = self._prng.choice(filenames)
            this_size = os.path.getsize(filename)
            growth = int(0.01 * self._modify_percentage * this_size)
            # Always append at least one byte: for empty or tiny files
            # the percentage rounds down to zero, and appending nothing
            # would keep this loop from ever finishing.
            amount = min(max(growth, 1), size)
            self.append_data(filename, generate_data(amount))
            size -= amount

    def modify_files(self, size):
        """Modify existing files by appending to them

        SIZE gives the total amount of new data for all files.
        Files are chosen at random, and new data is appended to them.
        The amount appended to each file is set by
        set_modify_percentage. The data is split between text and
        binary data according to set_text_data_percentage.

        """

        if os.path.exists(self._dirname):
            files = self.find_files()

            text_size = int(0.01 * self._text_data_percentage * size)
            bin_size = size - text_size

            self.init_prng()
            self._modify_files_of_a_kind(files, text_size,
                                         self.generate_text_data)
            self._modify_files_of_a_kind(files, bin_size,
                                         self.generate_binary_data)


class CommandLineParser:

    """Parse the command line for the genbackupdata utility"""

    # Suffixes for SIZE arguments are binary multiples ...
    _SIZE_SUFFIXES = (("k", KiB), ("m", MiB), ("g", GiB), ("t", TiB))
    # ... while suffixes for COUNT arguments are decimal multiples.
    _COUNT_SUFFIXES = (("k", 10**3), ("m", 10**6), ("g", 10**9),
                       ("t", 10**12))

    def __init__(self, backup_data):
        self._bd = backup_data
        self._parser = self._create_option_parser()

    def _create_option_parser(self):
        """Create the OptionParser we need"""

        p = optparse.OptionParser()

        p.add_option("--seed",
                     help="Set pseudo-random number generator seed to SEED")

        p.add_option("--max-count",
                     action="store",
                     metavar="COUNT",
                     help="Allow at most COUNT files per directory")

        p.add_option("-p", "--percentage-text-data",
                     action="store",
                     metavar="PERCENT",
                     help="Make PERCENT of new data textual, not binary")

        p.add_option("-t", "--text-file-size",
                     action="store",
                     metavar="SIZE",
                     help="Make new text files be of size SIZE")

        p.add_option("-b", "--binary-file-size",
                     action="store",
                     metavar="SIZE",
                     help="Make new binary files be of size SIZE")

        p.add_option("-c", "--create",
                     action="store",
                     metavar="SIZE",
                     help="Create SIZE amount of new files")

        p.add_option("-d", "--delete",
                     action="store",
                     metavar="COUNT",
                     help="Delete COUNT files")

        p.add_option("-r", "--rename",
                     action="store",
                     metavar="COUNT",
                     help="Rename COUNT files")

        p.add_option("-l", "--link",
                     action="store",
                     metavar="COUNT",
                     help="Create COUNT new hard links")

        p.add_option("-m", "--modify",
                     action="store",
                     metavar="SIZE",
                     help="Grow total data size by SIZE")

        p.add_option("--modify-percentage",
                     action="store",
                     metavar="PERCENT",
                     help="Increase file size by PERCENT")

        return p

    def _parse_scaled(self, text, suffixes, base):
        """Parse TEXT as a number with an optional suffix or percent sign

        SUFFIXES is a sequence of (suffix, factor) pairs, matched
        case-insensitively. A trailing "%" makes the value relative to
        BASE; if BASE is None, a relative value is 0. The result is
        truncated to an integer.

        """

        lowered = text.lower()
        for suffix, factor in suffixes:
            if lowered.endswith(suffix):
                return int(float(text[:-len(suffix)]) * factor)

        if text.endswith("%"):
            if base is None:
                return 0
            return int(float(text[:-1]) * 0.01 * base)

        return int(text)

    def parse_size(self, size, base_size=None):
        """Parse a SIZE argument (absolute, relative, with/without suffix)"""
        return self._parse_scaled(size, self._SIZE_SUFFIXES, base_size)

    def parse_count(self, count, base_count=None):
        """Parse a COUNT argument (absolute, relative, with/without suffix)"""
        return self._parse_scaled(count, self._COUNT_SUFFIXES, base_count)

    def parse(self, args):
        """Parse command line arguments

        Settings are applied to the BackupData object directly. The
        operation options (create, modify, delete, rename, link) are
        converted to integers in place. Returns (options, args).

        """

        options, args = self._parser.parse_args(args)

        if len(args) == 1:
            self._bd.set_directory(args[0])
            self._bd.find_preexisting_files()

        if options.seed:
            self._bd.set_seed(int(options.seed))

        if options.max_count:
            self._bd.set_max_files_per_directory(int(options.max_count))

        if options.percentage_text_data:
            self._bd.set_text_data_percentage(
                float(options.percentage_text_data))

        if options.modify_percentage:
            self._bd.set_modify_percentage(float(options.modify_percentage))

        if options.text_file_size:
            self._bd.set_text_file_size(
                self.parse_size(options.text_file_size))

        if options.binary_file_size:
            self._bd.set_binary_file_size(
                self.parse_size(options.binary_file_size))

        if options.create:
            options.create = self.parse_size(options.create,
                                self._bd.get_preexisting_data_size())

        if options.modify:
            options.modify = self.parse_size(options.modify,
                                self._bd.get_preexisting_data_size())

        if options.delete:
            options.delete = self.parse_count(options.delete,
                                self._bd.get_preexisting_file_count())

        if options.rename:
            options.rename = self.parse_count(options.rename,
                                self._bd.get_preexisting_file_count())

        if options.link:
            options.link = self.parse_count(options.link,
                                self._bd.get_preexisting_file_count())

        return options, args


class AppException(Exception):

    """Base class for errors reported to the user by Application.run

    Subclasses set self._str to the message.

    """

    def __str__(self):
        return self._str


class NeedExactlyOneDirectoryName(AppException):

    """The command line did not name exactly one directory"""

    def __init__(self):
        self._str = ("Need exactly one command line argument, "
                     "giving directory name")


class Application:

    """The main program"""

    def __init__(self, args):
        self._args = args
        self._bd = BackupData()
        self._clp = CommandLineParser(self._bd)
        self._error = sys.stderr.write

    def set_error_writer(self, writer):
        """Set the callable used for writing error messages"""
        self._error = writer

    def run(self):
        """Execute the desired operations

        Operations run in a fixed order: delete, rename, link, modify,
        create. On an AppException the message is written with the
        error writer and the process exits with status 1.

        """

        try:
            options, args = self._clp.parse(self._args)

            if len(args) != 1:
                raise NeedExactlyOneDirectoryName()

            if options.delete:
                self._bd.delete_files(options.delete)

            if options.rename:
                self._bd.rename_files(options.rename)

            if options.link:
                self._bd.link_files(options.link)

            if options.modify:
                self._bd.modify_files(options.modify)

            if options.create:
                self._bd.create_files(options.create)

        except AppException as e:
            self._error(str(e) + "\n")
            sys.exit(1)


if __name__ == "__main__": #pragma: no cover
    Application(sys.argv[1:]).run()