diff options
author | Lars Wirzenius <liw@liw.fi> | 2010-04-18 14:21:56 +1200 |
---|---|---|
committer | Lars Wirzenius <liw@liw.fi> | 2010-04-18 14:21:56 +1200 |
commit | 7feaddceb111be033d0f773621c3bb8b13cf0fed (patch) | |
tree | 87b9379c989a1bd2cff07a8b77a42b8212d30e45 | |
parent | 93dd590836516f560fcdc476f68670467d5db2bf (diff) | |
download | dupfiles-7feaddceb111be033d0f773621c3bb8b13cf0fed.tar.gz |
Improve progress reporting.
-rwxr-xr-x | dupfiles | 54 |
1 file changed, 32 insertions, 22 deletions
@@ -77,32 +77,42 @@ class DuplicateFileFinder(object): # groups. If a group becomes just a single file, that file is not # a duplicate and can be dropped completely. def duplicates(self): - total_files = sum(len(x) for x in self.by_size.itervalues()) - done_files = 0 + skip = [size for size in self.by_size if len(self.by_size[size]) == 1] + for size in skip: + del self.by_size[size] + + total_bytes = sum(len(tuples) * size + for size, tuples in self.by_size.iteritems()) + done_bytes = 0 result = [] - for tuples in self.by_size.itervalues(): - if len(tuples) > 1: - by_checksum = dict() - for dev, ino, pathname in tuples: - checksum = self.file_checksum(pathname) - if checksum not in by_checksum: - by_checksum[checksum] = set() - by_checksum[checksum].add(pathname) - done_files += 1 - self.duplicates_progress(done_files, total_files) - for names in by_checksum.itervalues(): - if len(names) > 1: - result.append(names) - else: - done_files += 1 - self.duplicates_progress(done_files, total_files) + for size, tuples in self.by_size.iteritems(): + by_checksum = dict() + for dev, ino, pathname in tuples: + checksum = self.file_checksum(pathname) + if checksum not in by_checksum: + by_checksum[checksum] = set() + by_checksum[checksum].add(pathname) + done_bytes += size + self.duplicates_progress(done_bytes, total_bytes) + for names in by_checksum.itervalues(): + if len(names) > 1: + result.append(names) self.progress.finished() return result - def duplicates_progress(self, done_files, total_files): - self.progress.write('%d/%d (%.1f%%) files done' % - (done_files, total_files, - 100.0 * float(done_files) / total_files)) + def duplicates_progress(self, done, total): + self.progress.write('%s/%s (%.1f%%) bytes done' % - (self.human_size(done), self.human_size(total), + 100.0 * float(done) / float(total))) + + def human_size(self, size): + tab = [(1024**3, 'GiB'), + (1024**2, 'MiB'), + (1024**1, 'KiB')] + for limit, unit in tab: + if size >= limit: + return '%.1f %s' % (float(size) / float(limit), unit) + return '0 B' def file_checksum(self, pathname): return hashlib.md5(file(pathname, 'rb').read()).digest() |