From 21e14e7ed7daeb4abf065b200256db765f133eb6 Mon Sep 17 00:00:00 2001
From: Lars Wirzenius
Date: Sun, 18 Apr 2010 08:55:25 +1200
Subject: Add some progress reporting.

Add a FIXME comment with an idea for an optimization. I am not going to
implement it right now, since I can currently afford to wait for the
script (it runs in the background while I do other things).
---
 dupfiles | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/dupfiles b/dupfiles
index 039ecdd..31d485b 100755
--- a/dupfiles
+++ b/dupfiles
@@ -67,7 +67,16 @@ class DuplicateFileFinder(object):
                                         set([pathname]))
         self.progress.finished()
 
+    # FIXME: This computes the checksum for each file, over the full file.
+    # It might be faster to read all files (of the same size) in parallel,
+    # in suitably small chunks, and compute checksums for all of them in
+    # parallel. When checksums diverge, split the files into smaller
+    # groups. If a group becomes just a single file, that file is not
+    # a duplicate and can be dropped completely.
     def duplicates(self):
+        total_files = sum(len(x[2]) for x in self.by_size.itervalues())
+        done_files = 0
+        result = []
         for dev, ino, pathnames in self.by_size.itervalues():
             by_checksum = dict()
             for pathname in pathnames:
@@ -77,7 +86,13 @@ class DuplicateFileFinder(object):
                 by_checksum[checksum].add(pathname)
             for names in by_checksum.itervalues():
                 if len(names) > 1:
-                    yield names
+                    result.append(names)
+                done_files += len(names)
+                self.progress.write('%d/%d (%.1f%%) files done' %
+                                    (done_files, total_files,
+                                     100.0 * float(done_files) / total_files))
+        self.progress.finished()
+        return result
 
     def file_checksum(self, pathname):
         return hashlib.md5(file(pathname, 'rb').read()).digest()
--
cgit v1.2.1
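
[Editor's note: the FIXME above sketches a chunked, lockstep comparison of
same-size files. Below is a minimal sketch of that idea in Python 3 (the
script itself is Python 2); the function name, chunk size, and grouping
details are my own assumptions, not code from dupfiles.]

    import hashlib

    def find_duplicates_in_size_group(pathnames, chunk_size=64 * 1024):
        """Yield sets of identical files from pathnames, which must all
        have the same size. Files are read in lockstep, one chunk at a
        time; a group splits as soon as chunk checksums diverge, and a
        group that shrinks to one file is dropped without reading further.

        Caveat: this keeps one open file descriptor per candidate file.
        """
        handles = {}
        try:
            for name in pathnames:
                handles[name] = open(name, 'rb')
            groups = [set(pathnames)]
            while groups:
                next_groups = []
                for group in groups:
                    by_digest = {}
                    at_eof = True
                    for name in group:
                        chunk = handles[name].read(chunk_size)
                        if chunk:
                            at_eof = False
                        digest = hashlib.md5(chunk).digest()
                        by_digest.setdefault(digest, set()).add(name)
                    for subgroup in by_digest.values():
                        if len(subgroup) < 2:
                            continue  # unique prefix: cannot be a duplicate
                        elif at_eof:
                            yield subgroup  # identical all the way to EOF
                        else:
                            next_groups.append(subgroup)
                groups = next_groups
        finally:
            for f in handles.values():
                f.close()

Hooked into duplicates(), this would be called once per size group, after
the by-size grouping the script already does, so unique files are abandoned
as early as their first differing chunk.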
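
[Editor's note: separately, file_checksum() reads each file into memory in
one call, which is fine for small files but not for large ones. A chunked
variant keeps memory use flat; a sketch under the same assumptions
(Python 3, hypothetical chunk size), not part of the patch:]

    import hashlib

    def file_checksum(pathname, chunk_size=64 * 1024):
        # Hash the file a chunk at a time instead of slurping it whole.
        md5 = hashlib.md5()
        with open(pathname, 'rb') as f:
            for chunk in iter(lambda: f.read(chunk_size), b''):
                md5.update(chunk)
        return md5.digest()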