From 93dd590836516f560fcdc476f68670467d5db2bf Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sun, 18 Apr 2010 10:13:08 +1200 Subject: Do not compute checksums when only one file has a given size. --- dupfiles | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/dupfiles b/dupfiles index be36f0c..cc1ff47 100755 --- a/dupfiles +++ b/dupfiles @@ -81,22 +81,29 @@ class DuplicateFileFinder(object): done_files = 0 result = [] for tuples in self.by_size.itervalues(): - by_checksum = dict() - for dev, ino, pathname in tuples: - checksum = self.file_checksum(pathname) - if checksum not in by_checksum: - by_checksum[checksum] = set() - by_checksum[checksum].add(pathname) - for names in by_checksum.itervalues(): - if len(names) > 1: - result.append(names) - done_files += len(names) - self.progress.write('%d/%d (%.1f%%) files done' % - (done_files, total_files, - 100.0 * float(done_files) / total_files)) + if len(tuples) > 1: + by_checksum = dict() + for dev, ino, pathname in tuples: + checksum = self.file_checksum(pathname) + if checksum not in by_checksum: + by_checksum[checksum] = set() + by_checksum[checksum].add(pathname) + done_files += 1 + self.duplicates_progress(done_files, total_files) + for names in by_checksum.itervalues(): + if len(names) > 1: + result.append(names) + else: + done_files += 1 + self.duplicates_progress(done_files, total_files) self.progress.finished() return result + def duplicates_progress(self, done_files, total_files): + self.progress.write('%d/%d (%.1f%%) files done' % + (done_files, total_files, + 100.0 * float(done_files) / total_files)) + def file_checksum(self, pathname): return hashlib.md5(file(pathname, 'rb').read()).digest() -- cgit v1.2.1