summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lars Wirzenius <liw@liw.fi> 2010-04-18 10:13:08 +1200
committer: Lars Wirzenius <liw@liw.fi> 2010-04-18 10:13:08 +1200
commit: 93dd590836516f560fcdc476f68670467d5db2bf (patch)
tree: b8875719553cb1c88b87aa14fda8eb2f6de0da77
parent: 25f74b1cd85ea624e6dd588b3ba19f4738d5c2d4 (diff)
download: dupfiles-93dd590836516f560fcdc476f68670467d5db2bf.tar.gz
Do not compute checksums when only one file has a given size.
-rwxr-xr-x dupfiles 33
1 files changed, 20 insertions, 13 deletions
diff --git a/dupfiles b/dupfiles
index be36f0c..cc1ff47 100755
--- a/dupfiles
+++ b/dupfiles
@@ -81,22 +81,29 @@ class DuplicateFileFinder(object):
done_files = 0
result = []
for tuples in self.by_size.itervalues():
- by_checksum = dict()
- for dev, ino, pathname in tuples:
- checksum = self.file_checksum(pathname)
- if checksum not in by_checksum:
- by_checksum[checksum] = set()
- by_checksum[checksum].add(pathname)
- for names in by_checksum.itervalues():
- if len(names) > 1:
- result.append(names)
- done_files += len(names)
- self.progress.write('%d/%d (%.1f%%) files done' %
- (done_files, total_files,
- 100.0 * float(done_files) / total_files))
+ if len(tuples) > 1:
+ by_checksum = dict()
+ for dev, ino, pathname in tuples:
+ checksum = self.file_checksum(pathname)
+ if checksum not in by_checksum:
+ by_checksum[checksum] = set()
+ by_checksum[checksum].add(pathname)
+ done_files += 1
+ self.duplicates_progress(done_files, total_files)
+ for names in by_checksum.itervalues():
+ if len(names) > 1:
+ result.append(names)
+ else:
+ done_files += 1
+ self.duplicates_progress(done_files, total_files)
self.progress.finished()
return result
+ def duplicates_progress(self, done_files, total_files):
+ self.progress.write('%d/%d (%.1f%%) files done' %
+ (done_files, total_files,
+ 100.0 * float(done_files) / total_files))
+
def file_checksum(self, pathname):
return hashlib.md5(file(pathname, 'rb').read()).digest()