From 88cbe745cd55eca07407b6a7798fafaa4dab225c Mon Sep 17 00:00:00 2001
From: Lars Wirzenius <liw@liw.fi>
Date: Fri, 30 Apr 2010 19:31:17 +1200
Subject: Use progressbar library instead of custom code.

---
 dupfiles | 147 ++++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 80 insertions(+), 67 deletions(-)

diff --git a/dupfiles b/dupfiles
index 3822eaa..b1de7fc 100755
--- a/dupfiles
+++ b/dupfiles
@@ -17,34 +17,71 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 
+import errno
 import hashlib
 import optparse
 import os
+import progressbar
+import random
 import stat
 import sys
 import time
 
 
-class ProgressReporter(object):
+class ProgressBarValue(progressbar.ProgressBarWidget):
 
-    def __init__(self, do_report):
-        self.written = ''
-        self.when = 0
-        self.do_report = do_report
-
-    def write(self, msg):
-        if self.do_report and time.time() - self.when >= 1:
-            sys.stdout.flush()
-            sys.stderr.write('\b \b' * len(self.written))
-            msg = msg[:79] # FIXME: use real screen width
-            sys.stderr.write(msg)
-            sys.stderr.flush()
-            self.written = msg
-            self.when = time.time()
+    def update(self, pbar):
+        return '%s' % pbar.currval
+
+
+class ProgressBarMaxval(progressbar.ProgressBarWidget):
+
+    def update(self, pbar):
+        return '%s' % pbar.maxval
+
+
+class FileStats(object):
 
-    def finished(self):
-        self.when = 0
-        self.write('')
+    def __init__(self):
+        self.open_count = 0
+        self.close_count = 0
+        self.hit_count = 0
+
+filestats = FileStats()
+filepool = dict()
+
+class File(object):
+
+    def __init__(self, pathname):
+        self.pathname = pathname
+        self.offset = 0
+
+    def read(self, num_bytes):
+        if self.pathname in filepool:
+            f = filepool[self.pathname]
+            filestats.hit_count += 1
+        else:
+            try:
+                f = file(self.pathname)
+            except IOError, e:
+                if e.errno != errno.EMFILE:
+                    raise
+                victim = random.choice(filepool.keys())
+                filepool[victim].close()
+                del filepool[victim]
+                filestats.close_count += 1
+                f = file(self.pathname)
+            f.seek(self.offset)
+            filepool[self.pathname] = f
+            filestats.open_count += 1
+        data = f.read(num_bytes)
+        self.offset += len(data)
+        return data
+
+    def close(self):
+        if self.pathname in filepool:
+            filepool[self.pathname].close()
+            del filepool[self.pathname]
 
 
 class DuplicateFileFinder(object):
@@ -52,10 +89,12 @@ class DuplicateFileFinder(object):
     def __init__(self, progress):
         self.by_size = dict()
         self.progress = progress
-        
+
     def collect(self, root):
+        if self.progress:
+            sys.stderr.write('Scanning %s\n' % root)
+
         for dirname, subdirs, filenames in os.walk(root):
-            self.progress.write(dirname)
             subdirs.sort()
             filenames.sort()
             pathnames = [os.path.join(dirname, f) for f in filenames]
@@ -67,7 +106,6 @@ class DuplicateFileFinder(object):
                 self.by_size[st.st_size].append(t)
             else:
                 self.by_size[st.st_size] = [t]
-        self.progress.finished()
 
     def duplicates(self):
         skip = [size for size in self.by_size if len(self.by_size[size]) == 1]
@@ -75,21 +113,33 @@ class DuplicateFileFinder(object):
             del self.by_size[size]
 
         total_bytes = sum(len(tuples) * size
-                            for size, tuples in self.by_size.iteritems())
-        done_bytes = 0
-        start_time = time.time()
+                          for size, tuples in self.by_size.iteritems())
+        if self.progress:
+            widgets = [
+                progressbar.FileTransferSpeed(), ' ',
+                progressbar.Percentage(), ' ',
+                progressbar.Bar(), ' ',
+                progressbar.ETA(),
+            ]
+            pbar = progressbar.ProgressBar(maxval=total_bytes, widgets=widgets)
+            pbar.start()
         result = []
-        for size, tuples in self.by_size.iteritems():
+        done_bytes = 0
+        for size, tuples in sorted(self.by_size.iteritems()):
             if len(set((dev, ino) for dev, ino, pathname in
                        tuples)) == 1:
                 # All duplicates are hardlinks to the same inode. Skip.
                 done_bytes += len(tuples) * size
             else:
                 new_dups = self.find_duplicates([p for d, i, p in tuples])
                 result += new_dups
-                done_bytes += len(new_dups) * size
-            self.duplicates_progress(done_bytes, total_bytes, start_time)
-        self.progress.finished()
+                done_bytes += len(tuples) * size
+
+            if self.progress:
+                pbar.update(done_bytes)
+
+        if self.progress:
+            pbar.finish()
         return result
 
     def find_duplicates(self, pathnames):
@@ -100,15 +150,8 @@ class DuplicateFileFinder(object):
 
         '''
 
-        # FIXME: This assumes it can open every file at the same time.
-        # If there are a lot of files, that's not going to be possible.
-        # This might work: keep a pool of open files, and record how
-        # far you got with each file. Then close and re-open files as
-        # necessary, if the pool is too small. When re-opening, seek
-        # to the remembered position.
-
         result = []
-        identical_groups = [[(x, file(x)) for x in pathnames]]
+        identical_groups = [[(x, File(x)) for x in pathnames]]
 
         while identical_groups:
             new = []
@@ -151,35 +194,6 @@ class DuplicateFileFinder(object):
                 not_done.append((pathname, f, data))
         return done, not_done
 
-    def duplicates_progress(self, done, total, started):
-        duration = time.time() - started
-        self.progress.write('%s/%s (%.1f%%) done (%s)' %
-                            (self.human_size(done), self.human_size(total),
-                             100.0 * float(done) / float(total),
-                             self.human_duration(duration)))
-
-    def human_size(self, size):
-        tab = [(1024**3, 'GiB'),
-               (1024**2, 'MiB'),
-               (1024**1, 'KiB')]
-        for limit, unit in tab:
-            if size >= limit:
-                return '%.1f %s' % (float(size) / float(limit), unit)
-        return '0 B'
-
-    def human_duration(self, duration):
-        units = [(3600, 'h'),
-                 (60, 'min'),
-                 (1, 's')]
-
-        parts = []
-        for limit, unit in units:
-            count = int(duration) / limit
-            duration %= limit
-            if count > 0:
-                parts.append('%d %s' % (count, unit))
-        return ' '.join(parts or ['0 s'])
-
 
 def make_hardlinks(duplicates):
     canonical = duplicates.pop()
@@ -202,8 +216,7 @@ def main():
 
     opts, args = parser.parse_args()
 
-    progress = ProgressReporter(opts.progress)
-    dupfinder = DuplicateFileFinder(progress)
+    dupfinder = DuplicateFileFinder(opts.progress)
 
     for dirname in sorted(args):
         dupfinder.collect(dirname)
-- 
cgit v1.2.1
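
The duplicates() hunk drives the third-party progressbar module through its
start/update/finish cycle. A minimal standalone sketch of that usage, limited
to the widgets and calls that actually appear in the patch (the byte counts
below are invented for illustration):

import time

import progressbar

total_bytes = 10 * 1024 * 1024          # invented workload size
widgets = [
    progressbar.FileTransferSpeed(), ' ',
    progressbar.Percentage(), ' ',
    progressbar.Bar(), ' ',
    progressbar.ETA(),
]
pbar = progressbar.ProgressBar(maxval=total_bytes, widgets=widgets)
pbar.start()
done_bytes = 0
while done_bytes < total_bytes:
    time.sleep(0.01)                    # stands in for hashing a chunk
    done_bytes += 1024 * 1024           # pretend a megabyte was processed
    pbar.update(done_bytes)
pbar.finish()

Note that the ProgressBarValue and ProgressBarMaxval widgets the patch defines
are not placed in the widget list above; they appear to be left available for
other progress-bar layouts.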
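
The new File class implements what the removed FIXME comment sketched: a
shared pool of open file objects, where a randomly chosen pool member is
evicted when open() fails with EMFILE, and each logical file remembers its
offset so a reopened file can seek back to where it left off. A usage sketch,
assuming the File class and the filestats counters from the patched dupfiles
are in scope (the pathnames are hypothetical):

paths = ['/tmp/a', '/tmp/b', '/tmp/c']  # hypothetical example files
files = [File(p) for p in paths]

# Read the files in lockstep, as find_duplicates() does; the pool
# transparently closes and reopens descriptors behind the scenes.
chunks = [f.read(4096) for f in files]
for f in files:
    f.close()

print 'opens: %d, closes: %d, pool hits: %d' % (
    filestats.open_count, filestats.close_count, filestats.hit_count)

Evicting a random victim rather than, say, the least recently used file keeps
the bookkeeping trivial, at the cost of occasionally closing a file that is
about to be read again.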