summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xdupfiles17
1 files changed, 16 insertions, 1 deletions
diff --git a/dupfiles b/dupfiles
index 039ecdd..31d485b 100755
--- a/dupfiles
+++ b/dupfiles
@@ -67,7 +67,16 @@ class DuplicateFileFinder(object):
set([pathname]))
self.progress.finished()
+ # FIXME: This computes the checksum of each file over its full contents.
+ # It might be faster to read all files (of the same size) in parallel,
+ # in suitably small chunks, and compute checksums for all of them in
+ # parallel. When checksums diverge, split the files into smaller
+ # groups. If a group becomes just a single file, that file is not
+ # a duplicate and can be dropped completely.
def duplicates(self):
+ total_files = sum(len(x[2]) for x in self.by_size.itervalues())
+ done_files = 0
+ result = []
for dev, ino, pathnames in self.by_size.itervalues():
by_checksum = dict()
for pathname in pathnames:
@@ -77,7 +86,13 @@ class DuplicateFileFinder(object):
by_checksum[checksum].add(pathname)
for names in by_checksum.itervalues():
if len(names) > 1:
- yield names
+ result.append(names)
+ done_files += len(names)
+ self.progress.write('%d/%d (%.1f%%) files done' %
+ (done_files, total_files,
+ 100.0 * float(done_files) / total_files))
+ self.progress.finished()
+ return result
def file_checksum(self, pathname):
return hashlib.md5(file(pathname, 'rb').read()).digest()