summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS 6
-rwxr-xr-x genbackupdata 77
-rw-r--r-- genbackupdatalib/generator.py 57
3 files changed, 65 insertions, 75 deletions
diff --git a/NEWS b/NEWS
index 882cb8a..e3569ae 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,12 @@
NEWS for genbackupdata
======================
+Version 1.8, released UNRELEASED
+--------------------------------
+
+* Change how binary data gets generated. It is now much less repetitive.
+ Suggested by Rob Kendrick.
+
Version 1.7, released 2012-09-29
--------------------------------
diff --git a/genbackupdata b/genbackupdata
index aa52903..168961b 100755
--- a/genbackupdata
+++ b/genbackupdata
@@ -26,34 +26,39 @@ import genbackupdatalib
class GenbackupdataApp(cliapp.Application):
def add_settings(self):
- self.settings.bytesize(['create', 'c'],
- 'how much data to create (default: %default)')
- self.settings.bytesize(['file-size'],
- 'size of one file (default: %default)',
- default=16*1024)
- self.settings.bytesize(['chunk-size'],
- 'generate data in chunks of this size '
- '(default: %default)',
- default=16*1024)
- self.settings.integer(['depth'],
- 'depth of directory tree (default: %default)',
- default=3)
- self.settings.integer(['max-files'],
- 'max files/dirs per dir (default: %default)',
- default=128)
- self.settings.integer(['seed'],
- 'seed for random number generator '
- '(default: %default)',
- default=0)
- self.settings.boolean(['quiet'], 'do not report progress')
+ self.settings.bytesize(
+ ['create', 'c'],
+ 'how much data to create (default: %default)')
+ self.settings.bytesize(
+ ['file-size'],
+ 'size of one file',
+ default=16*1024)
+ self.settings.bytesize(
+ ['chunk-size'],
+ 'generate data in chunks of this size',
+ default=16*1024)
+ self.settings.integer(
+ ['depth'],
+ 'depth of directory tree',
+ default=3)
+ self.settings.integer(
+ ['max-files'],
+ 'max files/dirs per dir',
+ default=128)
+ self.settings.integer(
+ ['seed'],
+ 'seed for random number generator',
+ default=0)
+ self.settings.boolean(
+ ['quiet'],
+ 'do not report progress')
def process_args(self, args):
outputdir = args[0]
bytes = self.settings['create']
self.gen = genbackupdatalib.DataGenerator(self.settings['seed'])
- self.names = genbackupdatalib.NameGenerator(outputdir,
- self.settings['depth'],
- self.settings['max-files'])
+ self.names = genbackupdatalib.NameGenerator(
+ outputdir, self.settings['depth'], self.settings['max-files'])
self.setup_ttystatus()
self.status['total'] = bytes
@@ -72,13 +77,12 @@ class GenbackupdataApp(cliapp.Application):
dirname = os.path.dirname(pathname)
if not os.path.exists(dirname):
os.makedirs(dirname)
- f = open(pathname, 'wb')
- while bytes >= chunk_size:
- self.write_bytes(f, chunk_size)
- bytes -= chunk_size
- if bytes > 0:
- self.write_bytes(f, bytes)
- f.close()
+ with open(pathname, 'wb') as f:
+ while bytes >= chunk_size:
+ self.write_bytes(f, chunk_size)
+ bytes -= chunk_size
+ if bytes > 0:
+ self.write_bytes(f, bytes)
def write_bytes(self, f, bytes):
chunk = self.gen.generate(bytes)
@@ -91,17 +95,10 @@ class GenbackupdataApp(cliapp.Application):
self.status.disable()
self.status['written'] = 0
self.status['total'] = 0
- self.status.add(ttystatus.Literal('Generating: '))
- self.status.add(ttystatus.ByteSize('written'))
- self.status.add(ttystatus.Literal(' of '))
- self.status.add(ttystatus.ByteSize('total'))
- self.status.add(ttystatus.Literal(' '))
- self.status.add(ttystatus.PercentDone('written', 'total'))
- self.status.add(ttystatus.Literal(' ('))
- self.status.add(ttystatus.ByteSpeed('written'))
- self.status.add(ttystatus.Literal(')'))
+ self.status.format(
+ 'Generating %ByteSize(written) of %ByteSize(total) '
+ '%PercentDone(written,total) (%ByteSpeed(written))')
if __name__ == '__main__':
GenbackupdataApp().run()
-
diff --git a/genbackupdatalib/generator.py b/genbackupdatalib/generator.py
index 8cf349c..9e3dea2 100644
--- a/genbackupdatalib/generator.py
+++ b/genbackupdatalib/generator.py
@@ -14,49 +14,36 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-import random
import struct
+import Crypto.Cipher.ARC4
+
class DataGenerator(object):
'''Generate random binary data.'''
- # We generate data by using a blob of suitable size. The output
- # sequence repeats the blob, where each repetition is preceded by
- # a 64-bit counter.
- #
- # We need to be relatively prime with obnam's chunk size, which
- # defaults to 64 KiB (65536 bytes). This is so that obnam does not
- # notice a lot of duplicated data, resulting in unrealistically
- # high amounts of compression in the backup store.
- #
- # Ideally, we would not generate any repeating data, but the random
- # number generator is not fast enough for that. We need to generate
- # data about as fast as the disk can write it, and the random number
- # generator is orders of magnitude slower than that.
-
- _blob_size = 65521
- _blob_size = 1021
+ _data = 'x' * 1024**2
def __init__(self, seed):
- self._random = random.Random(seed)
- self._blob = self._generate_blob()
- self._counter = 0
- self._buffer = ''
-
- def _generate_blob(self):
- return ''.join(chr(self._random.randint(0, 255))
- for i in range(self._blob_size))
-
+ key = struct.pack('!Q', seed)
+ self._arc4 = Crypto.Cipher.ARC4.new(key)
+ self._buffer = []
+ self._buffer_length = 0
+
def generate(self, size):
- while size > len(self._buffer):
- self._buffer += self._generate_more_data()
- data = self._buffer[:size]
- self._buffer = self._buffer[size:]
+ while self._buffer_length < size:
+ self._generate_junk()
+ return self._split_off_data(size)
+
+ def _generate_junk(self):
+ junk = self._arc4.encrypt(self._data)
+ self._buffer.append(junk)
+ self._buffer_length += len(junk)
+
+ def _split_off_data(self, size):
+ self._buffer = [''.join(self._buffer)]
+ data = self._buffer[0][:size]
+ self._buffer[0] = self._buffer[0][size:]
+ self._buffer_length -= len(data)
return data
-
- def _generate_more_data(self):
- self._counter += 1
- return struct.pack('!Q', self._counter) + self._blob
-