summaryrefslogtreecommitdiff
path: root/analyze-repository-files
blob: 90a9771693a6f2a8ebbd1946924d27ea0c207c6d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/python
# Copyright 2011  Lars Wirzenius
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


'''Analyze the files in an Obnam backup repository.

For performance reasons, it is best if Obnam does not write too many
files per directory, or too large or too small files. This program
analyzes all the files in an Obnam backup repository, or, indeed, any
local directory, and reports the following:

* total number of files
* sum of lengths of files
* number of entries (files and subdirectories) per directory:
  fewest, most, average, median
  (for fewest and most, both the count and the name of the directory)
* size of files: smallest, largest, average, median
  (for smallest and largest, both the size and the name of the file)

'''


import os
import stat
import sys


class Stats(object):

    '''Collect per-directory entry counts and per-file sizes.

    Directories are kept as (count, dirname) tuples and files as
    (size, filename) tuples, so that sorting either list orders it by
    the number first and uses the name only as a tie-breaker.

    '''

    def __init__(self):
        self.dirs = list()
        self.files = list()

    def add_dir(self, dirname, count):
        '''Record that directory dirname has count entries.'''
        self.dirs.append((count, dirname))

    def add_file(self, filename, size):
        '''Record that file filename has the given size.'''
        self.files.append((size, filename))

    @property
    def total_files(self):
        '''Number of files recorded with add_file.'''
        return len(self.files)

    @property
    def sum_of_sizes(self):
        '''Sum of the sizes of all files recorded with add_file.'''
        return sum(size for size, name in self.files)

    @staticmethod
    def _summarize(entries):
        '''Sort entries in place and summarize them.

        entries is a list of (value, name) tuples. Returns the tuple
        (lowest, lowest_name, highest, highest_name, average, median).

        If entries is empty there is nothing to index or divide by,
        so (0, None, 0, None, 0, 0) is returned instead of raising
        IndexError.

        '''

        if not entries:
            return 0, None, 0, None, 0, 0

        entries.sort()
        count = len(entries)

        lowest, lowest_name = entries[0]
        highest, highest_name = entries[-1]
        # Floor division is spelled out so that the result is the same
        # integer on Python 2 and Python 3, and so that the median
        # index is always an int (a float index raises TypeError).
        average = sum(value for value, name in entries) // count
        # For an even number of entries this picks the upper of the
        # two middle values.
        median = entries[count // 2][0]

        return lowest, lowest_name, highest, highest_name, average, median

    @property
    def dirsizes(self):
        '''Summary of entry counts per directory.

        Returns (fewest, fewest_name, most, most_name, average, median).
        Sorts self.dirs in place as a side effect.

        '''
        return self._summarize(self.dirs)

    @property
    def filesizes(self):
        '''Summary of file sizes.

        Returns (smallest, smallest_name, largest, largest_name,
        average, median). Sorts self.files in place as a side effect.

        '''
        return self._summarize(self.files)


def _emit(*items):
    '''Write items to stdout, space-separated, followed by a newline.

    This reproduces what the Python 2 statement "print a, b, c" writes
    for the values used in this program, but is also valid syntax on
    Python 3, where print is a function rather than a statement.

    '''
    sys.stdout.write(' '.join(str(item) for item in items) + '\n')


def main():
    '''Scan the paths named on the command line and print statistics.

    Each argument that is a directory is walked recursively: every
    directory visited is recorded with the number of files plus
    subdirectories in it, and every regular file found is recorded
    with its size. An argument that is itself a regular file is
    recorded directly. Arguments of any other type are ignored; since
    lstat is used, that includes symbolic links.

    Raises OSError if lstat fails on an argument or on a file found
    during the walk.

    '''

    stats = Stats()
    for name in sys.argv[1:]:
        stat_info = os.lstat(name)
        if stat.S_ISDIR(stat_info.st_mode):
            for dirname, subdirs, filenames in os.walk(name):
                stats.add_dir(dirname, len(filenames) + len(subdirs))
                for filename in filenames:
                    pathname = os.path.join(dirname, filename)
                    # A separate name keeps the argument's own stat
                    # result from being overwritten inside the walk.
                    file_info = os.lstat(pathname)
                    if stat.S_ISREG(file_info.st_mode):
                        stats.add_file(pathname, file_info.st_size)
        elif stat.S_ISREG(stat_info.st_mode):
            stats.add_file(name, stat_info.st_size)

    _emit("total_files:", stats.total_files)
    _emit("sum of sizes:", stats.sum_of_sizes)

    fewest, fewest_name, most, most_name, average, median = stats.dirsizes
    _emit("files per dir:")
    _emit("  fewest:", fewest, fewest_name)
    _emit("  most:", most, most_name)
    _emit("  average:", average)
    _emit("  median:", median)

    smallest, smallest_name, largest, largest_name, average, median = \
        stats.filesizes
    _emit("file sizes:")
    _emit("  smallest:", smallest, smallest_name)
    _emit("  largest:", largest, largest_name)
    _emit("  average:", average)
    _emit("  median:", median)


# Run the analysis only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main()