# Source: root/genbackupdata.py
# Blob: 2fbb4168d19582c4cef2ad70063bb5559104bb1a
# Copyright (C) 2007  Lars Wirzenius <liw@iki.fi>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.


"""Generate backup test data"""


import hashlib
import optparse
import os
import random
import sys


# Binary size units, expressed in bytes.
KiB = 1 << 10   # A kibibyte
MiB = 1 << 20   # A mebibyte
GiB = 1 << 30   # A gibibyte
TiB = 1 << 40   # A tebibyte

# Initial values of the settings held by the BackupData class.
DEFAULT_SEED = 0
DEFAULT_BINARY_CHUNK_SIZE = KiB
DEFAULT_TEXT_FILE_SIZE = KiB * 10
DEFAULT_BINARY_FILE_SIZE = MiB * 10
DEFAULT_TEXT_DATA_PERCENTAGE = 10.0
DEFAULT_MAX_FILES_PER_DIRECTORY = 256
DEFAULT_MODIFY_PERCENTAGE = 10

# Random filler text for generating text data.
# The literal starts and ends with a newline. generate_text_data() slices
# and repeats it verbatim, so any change to the text changes the contents
# of every generated text file.
# NOTE(review): the "if True:" wrapper presumably exists only so that the
# "no cover" pragma can exclude this statement from coverage measurement;
# confirm before removing it.
if True: #pragma: no cover:
    LOREM_IPSUM = """
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim
veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea
commodo consequat. Duis aute irure dolor in reprehenderit in voluptate
velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint
occaecat cupidatat non proident, sunt in culpa qui officia deserunt
mollit anim id est laborum.
"""


class BackupData:

    """This class represents the directory with backup data

    It holds the name of the directory being operated on, the settings
    that control how new data is generated, and the pseudo-random number
    generator used for generating binary data and for choosing files.

    """

    def __init__(self):
        self._dirname = None
        self._seed = DEFAULT_SEED
        self._prng = None
        self._chunk_size = DEFAULT_BINARY_CHUNK_SIZE
        self._text_file_size = DEFAULT_TEXT_FILE_SIZE
        self._binary_file_size = DEFAULT_BINARY_FILE_SIZE
        self._text_data_percentage = DEFAULT_TEXT_DATA_PERCENTAGE
        self._max_files_per_directory = DEFAULT_MAX_FILES_PER_DIRECTORY
        self._modify_percentage = DEFAULT_MODIFY_PERCENTAGE
        self._preexisting_file_count = 0
        self._preexisting_data_size = 0
        self._filename_counter = 0
        self._current_dir_no = 0
        self._next_filecount = 0

        # The zlib compression algorithm gives up if it gets a block of
        # 32 KiB bytes it can't find in its dictionary. It completely
        # ignores such a block, meaning that if it is repeated, then
        # it ignores it repeatedly. Most importantly for us, it doesn't
        # compress the repeats, either. Thus, to generate lots of
        # uncompressible binary data, we can generate a blob and repeat
        # that. Thanks to Richard Braakman for the idea.
        self._binary_blob_size = 64 * 1024 # Safety factor of 2
        self._binary_blob = None

    def set_directory(self, dirname):
        """Set the directory to be operated on

        Note that this must be set exactly once. Setting it twice will cause
        an assertion error, and not setting it will cause other errors.

        """

        # Raised explicitly (rather than with an assert statement) so that
        # the check is not stripped when Python runs with -O.
        if self._dirname is not None:
            raise AssertionError("directory has already been set")
        self._dirname = dirname

    def get_directory(self):
        """Return the directory being operated on, or None if not set"""
        return self._dirname

    def create_directory(self):
        """Create the backup data directory, if it doesn't exist already"""
        if not os.path.exists(self._dirname):
            os.mkdir(self._dirname)

    def get_seed(self):
        """Return the initial seed for the pseudo-random number generator"""
        return self._seed

    def set_seed(self, seed):
        """Set the initial seed for the pseudo-random number generator

        The seed will be used when the generator is first initialized.
        It is initialized implicitly as soon as something in this class
        needs randomness. Setting the seed after the generator has been
        initialized causes an assertion failure.

        """

        # Explicit raise so the check survives running Python with -O.
        if self.get_prng() is not None:
            raise AssertionError("seed must be set before the generator "
                                 "is initialized")
        self._seed = seed

    def get_prng(self):
        """Return reference to the pseudo-random number generator being used

        Return None, if one hasn't been initialized yet.

        """

        return self._prng

    def init_prng(self):
        """Initialize the pseudo-random number generator (using seed)

        Does nothing if the generator has already been initialized.

        """
        if self._prng is None:
            self._prng = random.Random()
            self._prng.seed(self._seed)

    def get_text_file_size(self):
        """Return size of newly created text files"""
        return self._text_file_size

    def set_text_file_size(self, size):
        """Set size of newly created text files"""
        self._text_file_size = size

    def get_binary_file_size(self):
        """Return size of newly created binary files"""
        return self._binary_file_size

    def set_binary_file_size(self, size):
        """Set size of newly created binary files"""
        self._binary_file_size = size

    def get_text_data_percentage(self):
        """Return percentage of text data of new data that gets created"""
        return self._text_data_percentage

    def set_text_data_percentage(self, percent):
        """Set percentage of text data of new data that gets created"""
        self._text_data_percentage = percent

    def get_max_files_per_directory(self):
        """Return current setting of maximum number of files per directory"""
        return self._max_files_per_directory

    def set_max_files_per_directory(self, count):
        """Set maximum number of files per directory"""
        self._max_files_per_directory = count

    def get_preexisting_file_count(self):
        """Return count of files that existed in directory in the beginning"""
        return self._preexisting_file_count

    def set_preexisting_file_count(self, count):
        """Set count of files that existed in directory in the beginning

        This is useful only for unit tests.

        """
        self._preexisting_file_count = count

    def get_preexisting_data_size(self):
        """Return size of data that existed in directory in the beginning"""
        return self._preexisting_data_size

    def set_preexisting_data_size(self, size):
        """Set size of data that existed in directory in the beginning

        This is useful only for unit tests.

        """
        self._preexisting_data_size = size

    def get_relative_file_count(self, percent):
        """Return PERCENT percent of pre-existing file count"""
        return int(0.01 * percent * self.get_preexisting_file_count())

    def get_relative_data_size(self, percent):
        """Return PERCENT percent of pre-existing data"""
        return int(0.01 * percent * self.get_preexisting_data_size())

    def find_preexisting_files(self):
        """Find all the files that exist in the directory right now

        The number of files and their total size are stored as the
        pre-existing file count and data size. Both are zero if the
        directory does not exist.

        """
        count = 0
        size = 0
        if os.path.exists(self._dirname):
            for root, _dirs, filenames in os.walk(self._dirname):
                count += len(filenames)
                for filename in filenames:
                    size += os.path.getsize(os.path.join(root, filename))
        self.set_preexisting_file_count(count)
        self.set_preexisting_data_size(size)

    def _choose_directory(self):
        """Choose directory in which to create the next file

        A directory that does not exist yet is always acceptable. An
        existing one is used only while this object has put at least one,
        but fewer than the maximum number of, files into it; otherwise
        the next directory number is tried.

        """

        while True:
            dirname = os.path.join(self._dirname,
                                   "dir%d" % self._current_dir_no)
            if not os.path.exists(dirname):
                self._next_filecount = 0
                return dirname
            if 0 < self._next_filecount < self._max_files_per_directory:
                return dirname
            self._current_dir_no += 1

    def next_filename(self):
        """Choose the name of the next filename

        The file does not currently exist. This is not, however, a guarantee
        that no other process won't create it before we do. Thus, this
        is NOT a secure way to create temporary files. But it's good enough
        for our intended purpose.

        For simplified unit testing, the names are very easily predictable,
        but it is probably a bad idea for external code to rely on this.

        """

        dirname = self._choose_directory()
        while True:
            filename = os.path.join(dirname,
                                    "file%d" % self._filename_counter)
            if not os.path.exists(filename):
                self._next_filecount += 1
                return filename
            self._filename_counter += 1

    def generate_text_data(self, size):
        """Generate SIZE characters of text data

        The text is LOREM_IPSUM repeated as often as needed and cut off
        at SIZE characters.

        """
        # divmod keeps the repeat count an integer on Python 3 as well,
        # where "/" would produce a float.
        full, rest = divmod(size, len(LOREM_IPSUM))
        return LOREM_IPSUM * full + LOREM_IPSUM[:rest]

    def generate_binary_data_well(self, size):
        """Generate SIZE bytes of more or less random binary junk

        The data is a sequence of SHA-1 digests. Random bytes are drawn
        from this object's own pseudo-random number generator, so the
        output is the same for the same seed. Returns a byte string; it
        is empty if SIZE is zero or negative.

        """

        # The following code has had some fine manual fine tuning done
        # to it. This has made it a bit ugly, but faster. On a
        # "Intel(R) Core(TM)2 Duo CPU L9400 @ 1.86GHz", it produces
        # about 25 MB/s.

        if size <= 0:
            return b""

        self.init_prng()
        getrandbits = self._prng.getrandbits

        def random_byte():
            # bytes(bytearray(...)) gives a one-byte string on both
            # Python 2 and Python 3.
            return bytes(bytearray([getrandbits(8)]))

        chunks = []
        digest = hashlib.sha1()
        chunk_size = digest.digest_size

        # Only the first few chunks get fresh random input.
        initial_bytes = min(size, 128)
        for _ in range(initial_bytes // chunk_size):
            digest.update(random_byte())
            chunks.append(digest.digest())

        # The rest is produced by feeding a constant into the running
        # hash, which is cheaper than drawing random numbers.
        size -= len(chunks) * chunk_size
        for _ in range(size // chunk_size):
            digest.update(b"a")
            chunks.append(digest.digest())

        # Fill up to the exact size with part of one more digest.
        rest = size % chunk_size
        if rest > 0:
            digest.update(random_byte())
            chunks.append(digest.digest()[:rest])

        return b"".join(chunks)


    def generate_binary_data(self, size):
        """Generate SIZE bytes of binary junk.

        This is different from generate_binary_data_well in that
        it makes use of _binary_blob (and generates that if it does
        not yet exist). The result is the blob repeated as often as
        needed and cut off at SIZE bytes.

        """

        if self._binary_blob is None:
            self._binary_blob = self.generate_binary_data_well(
                                    self._binary_blob_size)
        blob = self._binary_blob
        full, rest = divmod(size, len(blob))
        return blob * full + blob[:rest]

    def create_subdirectories(self, filename):
        """Create the sub-directories that are needed to create filename"""
        subdir = os.path.dirname(filename)
        if not os.path.exists(subdir):
            os.makedirs(subdir)

    def create_text_file(self, size):
        """Create a new text file of the desired size"""
        filename = self.next_filename()
        self.create_subdirectories(filename)
        with open(filename, "w") as f:
            f.write(self.generate_text_data(size))

    def get_binary_chunk_size(self):
        """Return the size of chunks used when writing binary data"""
        return self._chunk_size

    def set_binary_chunk_size(self, size):
        """Set the size of chunks used when writing binary data"""
        self._chunk_size = size

    def create_binary_file(self, size):
        """Create a new binary file of the desired size"""
        filename = self.next_filename()
        self.create_subdirectories(filename)
        chunk_size = self._chunk_size
        # Binary mode: the data is a byte string.
        with open(filename, "wb") as f:
            # We write the data in chunks, so as not to keep the entire file
            # contents in memory at a time. Since the size may be very large,
            # we might otherwise run out of swap.
            #
            # NOTE(review): generate_binary_data(n) returns the first n
            # bytes of the cached blob, so every full chunk written here is
            # the same prefix of the blob. With a chunk size smaller than
            # the blob (the defaults are 1 KiB and 64 KiB) the file is a
            # short repeating pattern; confirm that this is intended.
            if size >= chunk_size:
                # Every full chunk is identical, so generate it only once.
                chunk = self.generate_binary_data(chunk_size)
                while size >= chunk_size:
                    f.write(chunk)
                    size -= chunk_size
            f.write(self.generate_binary_data(size))

    def _create_files_of_a_kind(self, size, file_size, create_one):
        """Create files with create_one

        Files of at most FILE_SIZE bytes are created by calling
        create_one(this_size) until SIZE bytes have been created in total.
        Raises ValueError if data is to be created but FILE_SIZE is not
        positive, since no number of such files would ever add up to SIZE.

        """
        if size > 0 and file_size <= 0:
            raise ValueError("file size must be positive, not %r" %
                             (file_size,))
        while size > 0:
            this_size = min(size, file_size)
            create_one(this_size)
            size -= this_size

    def create_files(self, size):
        """Create new files, totalling SIZE bytes in size

        The data is split between text and binary files according to
        the text data percentage.

        """
        text_size = int(0.01 * self._text_data_percentage * size)
        bin_size = size - text_size

        self._create_files_of_a_kind(text_size, self.get_text_file_size(),
                                     self.create_text_file)
        self._create_files_of_a_kind(bin_size, self.get_binary_file_size(),
                                     self.create_binary_file)

    def find_files(self):
        """Find all non-directory files in the test data set

        The names are returned in sorted order, so that a given seed
        chooses the same files regardless of the order in which the
        file system happens to list them.

        """
        files = []
        for root, _dirs, filenames in os.walk(self._dirname):
            for filename in filenames:
                files.append(os.path.join(root, filename))
        files.sort()
        return files

    def choose_files_randomly(self, count):
        """Choose COUNT files randomly

        If there are fewer than COUNT files, all of them are returned.

        """
        files = self.find_files()
        if len(files) >= count:
            self.init_prng()
            files = self._prng.sample(files, count)
        return files

    def delete_files(self, count):
        """Delete COUNT files"""
        if os.path.exists(self._dirname):
            for filename in self.choose_files_randomly(count):
                os.remove(filename)

    def rename_files(self, count):
        """Rename COUNT files to new names"""
        if os.path.exists(self._dirname):
            for filename in self.choose_files_randomly(count):
                new_file = self.next_filename()
                self.create_subdirectories(new_file)
                os.rename(filename, new_file)

    def link_files(self, count):
        """Create COUNT new filenames that are hard links to existing files"""
        if os.path.exists(self._dirname):
            for filename in self.choose_files_randomly(count):
                new_file = self.next_filename()
                self.create_subdirectories(new_file)
                os.link(filename, new_file)

    def get_modify_percentage(self):
        """Return how many percent to grow each file with modify_files()"""
        return self._modify_percentage

    def set_modify_percentage(self, percent):
        """Set how many percent to grow each file with modify_files()"""
        self._modify_percentage = percent

    def append_data(self, filename, data):
        """Append data to a file

        A byte string is appended in binary mode, anything else in
        text mode.

        """
        mode = "ab" if isinstance(data, bytes) else "a"
        with open(filename, mode) as f:
            f.write(data)

    def _modify_files_of_a_kind(self, filenames, size, generate_data):
        """Modify files by appending data to them

        Randomly chosen files from FILENAMES are grown by the modify
        percentage of their current size, using data from
        generate_data(amount), until SIZE units have been appended in
        total. Does nothing if FILENAMES is empty.

        """
        if not filenames:
            # Nothing to append to; choice() would fail on an empty list.
            return
        self.init_prng()
        while size > 0:
            filename = self._prng.choice(filenames)
            this_size = os.path.getsize(filename)
            grow_by = int(0.01 * self._modify_percentage * this_size)
            # Always append at least one unit: for tiny or empty files the
            # percentage rounds down to zero, and the loop would otherwise
            # never finish.
            amount = min(max(grow_by, 1), size)
            self.append_data(filename, generate_data(amount))
            size -= amount

    def modify_files(self, size):
        """Modify existing files by appending to them

        SIZE gives the total amount of new data for all files.
        Files are chosen at random, and new data is appended to them.
        The amount appended to each file is set by
        set_modify_percentage. The data is split between text and
        binary data according to set_text_data_percentage.

        """

        if os.path.exists(self._dirname):
            files = self.find_files()

            text_size = int(0.01 * self._text_data_percentage * size)
            bin_size = size - text_size

            self.init_prng()
            self._modify_files_of_a_kind(files, text_size,
                                         self.generate_text_data)
            self._modify_files_of_a_kind(files, bin_size,
                                         self.generate_binary_data)



class CommandLineParser:

    """Parse the command line for the genbackupdata utility

    Settings given on the command line are stored directly into the
    backup data object passed to the constructor.

    """

    def __init__(self, backup_data):
        self._bd = backup_data
        self._parser = self._create_option_parser()

    def _create_option_parser(self):
        """Create the OptionParser we need"""

        p = optparse.OptionParser()

        p.add_option("--seed",
                     help="Set pseudo-random number generator seed to SEED")

        p.add_option("--max-count",
                     action="store",
                     metavar="COUNT",
                     help="Allow at most COUNT files per directory")

        p.add_option("-p", "--percentage-text-data",
                     action="store",
                     metavar="PERCENT",
                     help="Make PERCENT of new data textual, not binary")

        p.add_option("-t", "--text-file-size",
                     action="store",
                     metavar="SIZE",
                     help="Make new text files be of size SIZE")

        p.add_option("-b", "--binary-file-size",
                     action="store",
                     metavar="SIZE",
                     help="Make new binary files be of size SIZE")

        p.add_option("-c", "--create",
                     action="store",
                     metavar="SIZE",
                     help="Create SIZE amount of new files")

        p.add_option("-d", "--delete",
                     action="store",
                     metavar="COUNT",
                     help="Delete COUNT files")

        p.add_option("-r", "--rename",
                     action="store",
                     metavar="COUNT",
                     help="Rename COUNT files")

        p.add_option("-l", "--link",
                     action="store",
                     metavar="COUNT",
                     help="Create COUNT new hard links")

        p.add_option("-m", "--modify",
                     action="store",
                     metavar="SIZE",
                     help="Grow total data size by SIZE")

        p.add_option("--modify-percentage",
                     action="store",
                     metavar="PERCENT",
                     help="Increase file size by PERCENT")

        return p

    def _parse_scaled(self, text, suffixes, base):
        """Parse a number with an optional unit suffix or percent sign

        SUFFIXES is a sequence of (suffix, factor) pairs; suffixes are
        matched case-insensitively and the number before the suffix is
        multiplied by the factor. A value ending in "%" is taken relative
        to BASE, and is 0 if BASE is None. Anything else must be a plain
        integer. The result is always an int; ValueError is raised if the
        numeric part cannot be parsed.

        """

        lowered = text.lower()
        for suffix, factor in suffixes:
            if lowered.endswith(suffix):
                return int(float(text[:-len(suffix)]) * factor)

        if text.endswith("%"):
            if base is None:
                return 0
            return int(float(text[:-1]) * 0.01 * base)

        return int(text)

    def parse_size(self, size, base_size=None):
        """Parse a SIZE argument (absolute, relative, with/without suffix)

        The suffixes k, m, g and t stand for KiB, MiB, GiB and TiB.

        """

        suffixes = (("k", KiB), ("m", MiB), ("g", GiB), ("t", TiB))
        return self._parse_scaled(size, suffixes, base_size)

    def parse_count(self, count, base_count=None):
        """Parse a COUNT argument (absolute, relative, with/without suffix)

        The suffixes k, m, g and t stand for powers of a thousand.

        """

        suffixes = (("k", 10**3), ("m", 10**6), ("g", 10**9), ("t", 10**12))
        return self._parse_scaled(count, suffixes, base_count)

    def parse(self, args):
        """Parse command line arguments

        Returns the (options, args) pair from the option parser. The
        settings options are applied to the backup data object. The
        create, modify, delete, rename and link options, when given, are
        replaced in OPTIONS by their parsed integer values; percentages
        are relative to what existed in the directory beforehand.

        """
        options, args = self._parser.parse_args(args)

        # The directory is only set when exactly one was named; the
        # caller is expected to check the number of arguments itself.
        if len(args) == 1:
            self._bd.set_directory(args[0])
            self._bd.find_preexisting_files()

        if options.seed:
            self._bd.set_seed(int(options.seed))

        if options.max_count:
            self._bd.set_max_files_per_directory(int(options.max_count))

        if options.percentage_text_data:
            self._bd.set_text_data_percentage(
                float(options.percentage_text_data))

        if options.modify_percentage:
            self._bd.set_modify_percentage(float(options.modify_percentage))

        if options.text_file_size:
            self._bd.set_text_file_size(
                self.parse_size(options.text_file_size))

        if options.binary_file_size:
            self._bd.set_binary_file_size(
                self.parse_size(options.binary_file_size))

        if options.create:
            options.create = self.parse_size(options.create,
                                        self._bd.get_preexisting_data_size())

        if options.modify:
            options.modify = self.parse_size(options.modify,
                                        self._bd.get_preexisting_data_size())

        if options.delete:
            options.delete = self.parse_count(options.delete,
                                        self._bd.get_preexisting_file_count())

        if options.rename:
            options.rename = self.parse_count(options.rename,
                                        self._bd.get_preexisting_file_count())

        if options.link:
            options.link = self.parse_count(options.link,
                                        self._bd.get_preexisting_file_count())

        return options, args


class AppException(Exception):

    """Base class for this program's own errors

    Subclasses store their message in self._str. If no message has
    been stored, the standard exception message is used instead.

    """

    # Class-level default, so that __str__ works even when a subclass
    # (or the base class itself) never assigned a message.
    _str = None

    def __str__(self):
        if self._str is None:
            return Exception.__str__(self)
        return self._str
        
        
class NeedExactlyOneDirectoryName(AppException):

    """Error: the command line did not name exactly one directory"""

    def __init__(self):
        parts = ["Need exactly one command line argument, ",
                 "giving directory name"]
        self._str = "".join(parts)


class Application:

    """The main program

    ARGS is the list of command line arguments, without the program name.

    """

    def __init__(self, args):
        self._args = args
        self._bd = BackupData()
        self._clp = CommandLineParser(self._bd)
        self._error = sys.stderr.write

    def set_error_writer(self, writer):
        """Set the callable that error messages are written with

        WRITER is called with the message as its only argument. The
        default is sys.stderr.write.

        """
        self._error = writer

    def run(self):
        """Execute the desired operations

        The operations run in a fixed order: delete, rename, link,
        modify, create. If an AppException is raised, its message is
        written with the error writer and the program exits with
        status 1.

        """
        try:
            options, args = self._clp.parse(self._args)

            if len(args) != 1:
                raise NeedExactlyOneDirectoryName()

            if options.delete:
                self._bd.delete_files(options.delete)

            if options.rename:
                self._bd.rename_files(options.rename)

            if options.link:
                self._bd.link_files(options.link)

            if options.modify:
                self._bd.modify_files(options.modify)

            if options.create:
                self._bd.create_files(options.create)

        # "except X as e" is accepted by Python 2.6+ and Python 3; the
        # older "except X, e" form is a syntax error on Python 3.
        except AppException as e:
            self._error(str(e) + "\n")
            sys.exit(1)


if __name__ == "__main__": #pragma: no cover
    # Run the utility on the command line arguments, program name excluded.
    application = Application(sys.argv[1:])
    application.run()