diff options
author | Lars Wirzenius <liw@liw.fi> | 2010-04-18 10:13:08 +1200 |
---|---|---|
committer | Lars Wirzenius <liw@liw.fi> | 2010-04-18 10:13:08 +1200 |
commit | 93dd590836516f560fcdc476f68670467d5db2bf (patch) | |
tree | b8875719553cb1c88b87aa14fda8eb2f6de0da77 /dupfiles | |
parent | 25f74b1cd85ea624e6dd588b3ba19f4738d5c2d4 (diff) | |
download | dupfiles-93dd590836516f560fcdc476f68670467d5db2bf.tar.gz |
Do not compute checksums when only one file has a given size.
Diffstat (limited to 'dupfiles')
-rwxr-xr-x | dupfiles | 33 |
1 file changed, 20 insertions, 13 deletions
```diff
@@ -81,22 +81,29 @@ class DuplicateFileFinder(object):
         done_files = 0
         result = []
         for tuples in self.by_size.itervalues():
-            by_checksum = dict()
-            for dev, ino, pathname in tuples:
-                checksum = self.file_checksum(pathname)
-                if checksum not in by_checksum:
-                    by_checksum[checksum] = set()
-                by_checksum[checksum].add(pathname)
-            for names in by_checksum.itervalues():
-                if len(names) > 1:
-                    result.append(names)
-                done_files += len(names)
-                self.progress.write('%d/%d (%.1f%%) files done' %
-                                    (done_files, total_files,
-                                     100.0 * float(done_files) / total_files))
+            if len(tuples) > 1:
+                by_checksum = dict()
+                for dev, ino, pathname in tuples:
+                    checksum = self.file_checksum(pathname)
+                    if checksum not in by_checksum:
+                        by_checksum[checksum] = set()
+                    by_checksum[checksum].add(pathname)
+                    done_files += 1
+                    self.duplicates_progress(done_files, total_files)
+                for names in by_checksum.itervalues():
+                    if len(names) > 1:
+                        result.append(names)
+            else:
+                done_files += 1
+                self.duplicates_progress(done_files, total_files)
         self.progress.finished()
         return result
 
+    def duplicates_progress(self, done_files, total_files):
+        self.progress.write('%d/%d (%.1f%%) files done' %
+                            (done_files, total_files,
+                             100.0 * float(done_files) / total_files))
+
     def file_checksum(self, pathname):
         return hashlib.md5(file(pathname, 'rb').read()).digest()
 
```