diff options
author | Lars Wirzenius <liw@liw.fi> | 2015-03-27 20:25:14 +0200 |
---|---|---|
committer | Lars Wirzenius <liw@liw.fi> | 2015-03-27 20:25:14 +0200 |
commit | 1c5fa8116401a9c886aa042ad16160814976d510 (patch) | |
tree | 0edd708b6cdcefe28e96a99fd1c87995bf33a64c | |
parent | 2cf66368b9ee638d8505ead3e59a9f5524355a67 (diff) | |
parent | cc5f5029ba8ba32034f47b70ff3d1cfb3f56581d (diff) | |
download | genbackupdata-1c5fa8116401a9c886aa042ad16160814976d510.tar.gz |
Use RC4 to generate less repetitive junk
-rw-r--r-- | NEWS | 6 | ||||
-rwxr-xr-x | genbackupdata | 77 | ||||
-rw-r--r-- | genbackupdatalib/generator.py | 57 |
3 files changed, 65 insertions, 75 deletions
@@ -1,6 +1,12 @@ NEWS for genbackupdata ====================== +Version 1.8, released UNRELEASED +-------------------------------- + +* Change how binary data gets generated. It is now much less repetitive. + Suggested by Rob Kendrick. + Version 1.7, released 2012-09-29 -------------------------------- diff --git a/genbackupdata b/genbackupdata index aa52903..168961b 100755 --- a/genbackupdata +++ b/genbackupdata @@ -26,34 +26,39 @@ import genbackupdatalib class GenbackupdataApp(cliapp.Application): def add_settings(self): - self.settings.bytesize(['create', 'c'], - 'how much data to create (default: %default)') - self.settings.bytesize(['file-size'], - 'size of one file (default: %default)', - default=16*1024) - self.settings.bytesize(['chunk-size'], - 'generate data in chunks of this size ' - '(default: %default)', - default=16*1024) - self.settings.integer(['depth'], - 'depth of directory tree (default: %default)', - default=3) - self.settings.integer(['max-files'], - 'max files/dirs per dir (default: %default)', - default=128) - self.settings.integer(['seed'], - 'seed for random number generator ' - '(default: %default)', - default=0) - self.settings.boolean(['quiet'], 'do not report progress') + self.settings.bytesize( + ['create', 'c'], + 'how much data to create (default: %default)') + self.settings.bytesize( + ['file-size'], + 'size of one file', + default=16*1024) + self.settings.bytesize( + ['chunk-size'], + 'generate data in chunks of this size', + default=16*1024) + self.settings.integer( + ['depth'], + 'depth of directory tree', + default=3) + self.settings.integer( + ['max-files'], + 'max files/dirs per dir', + default=128) + self.settings.integer( + ['seed'], + 'seed for random number generator', + default=0) + self.settings.boolean( + ['quiet'], + 'do not report progress') def process_args(self, args): outputdir = args[0] bytes = self.settings['create'] self.gen = genbackupdatalib.DataGenerator(self.settings['seed']) - self.names = 
genbackupdatalib.NameGenerator(outputdir, - self.settings['depth'], - self.settings['max-files']) + self.names = genbackupdatalib.NameGenerator( + outputdir, self.settings['depth'], self.settings['max-files']) self.setup_ttystatus() self.status['total'] = bytes @@ -72,13 +77,12 @@ class GenbackupdataApp(cliapp.Application): dirname = os.path.dirname(pathname) if not os.path.exists(dirname): os.makedirs(dirname) - f = open(pathname, 'wb') - while bytes >= chunk_size: - self.write_bytes(f, chunk_size) - bytes -= chunk_size - if bytes > 0: - self.write_bytes(f, bytes) - f.close() + with open(pathname, 'wb') as f: + while bytes >= chunk_size: + self.write_bytes(f, chunk_size) + bytes -= chunk_size + if bytes > 0: + self.write_bytes(f, bytes) def write_bytes(self, f, bytes): chunk = self.gen.generate(bytes) @@ -91,17 +95,10 @@ class GenbackupdataApp(cliapp.Application): self.status.disable() self.status['written'] = 0 self.status['total'] = 0 - self.status.add(ttystatus.Literal('Generating: ')) - self.status.add(ttystatus.ByteSize('written')) - self.status.add(ttystatus.Literal(' of ')) - self.status.add(ttystatus.ByteSize('total')) - self.status.add(ttystatus.Literal(' ')) - self.status.add(ttystatus.PercentDone('written', 'total')) - self.status.add(ttystatus.Literal(' (')) - self.status.add(ttystatus.ByteSpeed('written')) - self.status.add(ttystatus.Literal(')')) + self.status.format( + 'Generating %ByteSize(written) of %ByteSize(total) ' + '%PercentDone(written,total) (%ByteSpeed(written))') if __name__ == '__main__': GenbackupdataApp().run() - diff --git a/genbackupdatalib/generator.py b/genbackupdatalib/generator.py index 8cf349c..9e3dea2 100644 --- a/genbackupdatalib/generator.py +++ b/genbackupdatalib/generator.py @@ -14,49 +14,36 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. 
-import random import struct +import Crypto.Cipher.ARC4 + class DataGenerator(object): '''Generate random binary data.''' - # We generate data by using a blob of suitable size. The output - # sequence repeats the blob, where each repetition is preceded by - # a 64-bit counter. - # - # We need to be relatively prime with obnam's chunk size, which - # defaults to 64 KiB (65536 bytes). This is so that obnam does not - # notice a lot of duplicated data, resulting in unrealistically - # high amounts of compression in the backup store. - # - # Ideally, we would not generate any repeating data, but the random - # number generator is not fast enough for that. We need to generate - # data about as fast as the disk can write it, and the random number - # generator is orders of magnitude slower than that. - - _blob_size = 65521 - _blob_size = 1021 + _data = 'x' * 1024**2 def __init__(self, seed): - self._random = random.Random(seed) - self._blob = self._generate_blob() - self._counter = 0 - self._buffer = '' - - def _generate_blob(self): - return ''.join(chr(self._random.randint(0, 255)) - for i in range(self._blob_size)) - + key = struct.pack('!Q', seed) + self._arc4 = Crypto.Cipher.ARC4.new(key) + self._buffer = [] + self._buffer_length = 0 + def generate(self, size): - while size > len(self._buffer): - self._buffer += self._generate_more_data() - data = self._buffer[:size] - self._buffer = self._buffer[size:] + while self._buffer_length < size: + self._generate_junk() + return self._split_off_data(size) + + def _generate_junk(self): + junk = self._arc4.encrypt(self._data) + self._buffer.append(junk) + self._buffer_length += len(junk) + + def _split_off_data(self, size): + self._buffer = [''.join(self._buffer)] + data = self._buffer[0][:size] + self._buffer[0] = self._buffer[0][size:] + self._buffer_length -= len(data) return data - - def _generate_more_data(self): - self._counter += 1 - return struct.pack('!Q', self._counter) + self._blob - |