From 7feaddceb111be033d0f773621c3bb8b13cf0fed Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sun, 18 Apr 2010 14:21:56 +1200 Subject: Improve progress reporting. --- dupfiles | 54 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/dupfiles b/dupfiles index cc1ff47..93287f5 100755 --- a/dupfiles +++ b/dupfiles @@ -77,32 +77,42 @@ class DuplicateFileFinder(object): # groups. If a group becomes just a single file, that file is not # a duplicate and can be dropped completely. def duplicates(self): - total_files = sum(len(x) for x in self.by_size.itervalues()) - done_files = 0 + skip = [size for size in self.by_size if len(self.by_size[size]) == 1] + for size in skip: + del self.by_size[size] + + total_bytes = sum(len(tuples) * size + for size, tuples in self.by_size.iteritems()) + done_bytes = 0 result = [] - for tuples in self.by_size.itervalues(): - if len(tuples) > 1: - by_checksum = dict() - for dev, ino, pathname in tuples: - checksum = self.file_checksum(pathname) - if checksum not in by_checksum: - by_checksum[checksum] = set() - by_checksum[checksum].add(pathname) - done_files += 1 - self.duplicates_progress(done_files, total_files) - for names in by_checksum.itervalues(): - if len(names) > 1: - result.append(names) - else: - done_files += 1 - self.duplicates_progress(done_files, total_files) + for size, tuples in self.by_size.iteritems(): + by_checksum = dict() + for dev, ino, pathname in tuples: + checksum = self.file_checksum(pathname) + if checksum not in by_checksum: + by_checksum[checksum] = set() + by_checksum[checksum].add(pathname) + done_bytes += size + self.duplicates_progress(done_bytes, total_bytes) + for names in by_checksum.itervalues(): + if len(names) > 1: + result.append(names) self.progress.finished() return result - def duplicates_progress(self, done_files, total_files): - self.progress.write('%d/%d (%.1f%%) files done' % - (done_files, total_files, - 100.0 * float(done_files) / total_files)) + def duplicates_progress(self, done, total): + self.progress.write('%s/%s (%.1f%%) bytes done' % + (self.human_size(done), self.human_size(total), + 100.0 * float(done) / float(total))) + + def human_size(self, size): + tab = [(1024**3, 'GiB'), + (1024**2, 'MiB'), + (1024**1, 'KiB')] + for limit, unit in tab: + if size >= limit: + return '%.1f %s' % (float(size) / float(limit), unit) + return '0 B' def file_checksum(self, pathname): return hashlib.md5(file(pathname, 'rb').read()).digest() -- cgit v1.2.1