From 88cbe745cd55eca07407b6a7798fafaa4dab225c Mon Sep 17 00:00:00 2001
From: Lars Wirzenius <liw@liw.fi>
Date: Fri, 30 Apr 2010 19:31:17 +1200
Subject: Use progressbar library instead of custom code.

---
 dupfiles | 147 ++++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 80 insertions(+), 67 deletions(-)

diff --git a/dupfiles b/dupfiles
index 3822eaa..b1de7fc 100755
--- a/dupfiles
+++ b/dupfiles
@@ -17,34 +17,71 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 
+import errno
 import hashlib
 import optparse
 import os
+import progressbar
+import random
 import stat
 import sys
 import time
 
 
-class ProgressReporter(object):
+class ProgressBarValue(progressbar.ProgressBarWidget):
 
-    def __init__(self, do_report):
-        self.written = ''
-        self.when = 0
-        self.do_report = do_report
-
-    def write(self, msg):
-        if self.do_report and time.time() - self.when >= 1:
-            sys.stdout.flush()
-            sys.stderr.write('\b \b' * len(self.written))
-            msg = msg[:79] # FIXME: use real screen width
-            sys.stderr.write(msg)
-            sys.stderr.flush()
-            self.written = msg
-            self.when = time.time()
+    def update(self, pbar):
+        return '%s' % pbar.currval
+
+
+class ProgressBarMaxval(progressbar.ProgressBarWidget):
+
+    def update(self, pbar):
+        return '%s' % pbar.maxval
+
+
+class FileStats(object):
 
-    def finished(self):
-        self.when = 0
-        self.write('')
+    def __init__(self):
+        self.open_count = 0
+        self.close_count = 0
+        self.hit_count = 0
+
+filestats = FileStats()
+filepool = dict()
+
+class File(object):
+
+    def __init__(self, pathname):
+        self.pathname = pathname
+        self.offset = 0
+
+    def read(self, num_bytes):
+        if self.pathname in filepool:
+            f = filepool[self.pathname]
+            filestats.hit_count += 1
+        else:
+            try:
+                f = file(self.pathname)
+            except IOError, e:
+                if e.errno != errno.EMFILE:
+                    raise
+                victim = random.choice(filepool.keys())
+                filepool[victim].close()
+                del filepool[victim]
+                filestats.close_count += 1
+                f = file(self.pathname)
+            f.seek(self.offset)
+            filepool[self.pathname] = f
+            filestats.open_count += 1
+        data = f.read(num_bytes)
+        self.offset += len(data)
+        return data
+
+    def close(self):
+        if self.pathname in filepool:
+            filepool[self.pathname].close()
+            del filepool[self.pathname]
 
 
 class DuplicateFileFinder(object):
@@ -52,10 +89,12 @@ class DuplicateFileFinder(object):
     def __init__(self, progress):
         self.by_size = dict()
         self.progress = progress
-        
+
     def collect(self, root):
+        if self.progress:
+            sys.stderr.write('Scanning %s\n' % root)
+
         for dirname, subdirs, filenames in os.walk(root):
-            self.progress.write(dirname)
             subdirs.sort()
             filenames.sort()
             pathnames = [os.path.join(dirname, f) for f in filenames]
@@ -67,7 +106,6 @@ class DuplicateFileFinder(object):
                 self.by_size[st.st_size].append(t)
             else:
                 self.by_size[st.st_size] = [t]
-        self.progress.finished()
 
     def duplicates(self):
         skip = [size for size in self.by_size if len(self.by_size[size]) == 1]
@@ -75,21 +113,33 @@ class DuplicateFileFinder(object):
             del self.by_size[size]
 
         total_bytes = sum(len(tuples) * size
-                            for size, tuples in self.by_size.iteritems())
-        done_bytes = 0
-        start_time = time.time()
+                          for size, tuples in self.by_size.iteritems())
+        if self.progress:
+            widgets = [
+                progressbar.FileTransferSpeed(), ' ',
+                progressbar.Percentage(), ' ',
+                progressbar.Bar(), ' ',
+                progressbar.ETA(),
+            ]
+            pbar = progressbar.ProgressBar(maxval=total_bytes, widgets=widgets)
+            pbar.start()
         result = []
-        for size, tuples in self.by_size.iteritems():
+        done_bytes = 0
+        for size, tuples in sorted(self.by_size.iteritems()):
             if len(set((dev, ino) for dev, ino, pathname in
                        tuples)) == 1:
                 # All duplicates are hardlinks to the same inode. Skip.
                 done_bytes += len(tuples) * size
             else:
                 new_dups = self.find_duplicates([p for d, i, p in tuples])
                 result += new_dups
-                done_bytes += len(new_dups) * size
-            self.duplicates_progress(done_bytes, total_bytes, start_time)
-        self.progress.finished()
+                done_bytes += len(tuples) * size
+
+            if self.progress:
+                pbar.update(done_bytes)
+
+        if self.progress:
+            pbar.finish()
         return result
 
     def find_duplicates(self, pathnames):
@@ -100,15 +150,8 @@ class DuplicateFileFinder(object):
 
         '''
 
-        # FIXME: This assumes it can open every file at the same time.
-        # If there are a lot of files, that's not going to be possible.
-        # This might work: keep a pool of open files, and record how
-        # far you got with each file. Then close and re-open files as
-        # necessary, if the pool is too small. When re-opening, seek
-        # to the remembered position.
-
         result = []
-        identical_groups = [[(x, file(x)) for x in pathnames]]
+        identical_groups = [[(x, File(x)) for x in pathnames]]
 
         while identical_groups:
             new = []
@@ -151,35 +194,6 @@ class DuplicateFileFinder(object):
                 not_done.append((pathname, f, data))
         return done, not_done
 
-    def duplicates_progress(self, done, total, started):
-        duration = time.time() - started
-        self.progress.write('%s/%s (%.1f%%) done (%s)' %
-                            (self.human_size(done), self.human_size(total),
-                             100.0 * float(done) / float(total),
-                             self.human_duration(duration)))
-
-    def human_size(self, size):
-        tab = [(1024**3, 'GiB'),
-               (1024**2, 'MiB'),
-               (1024**1, 'KiB')]
-        for limit, unit in tab:
-            if size >= limit:
-                return '%.1f %s' % (float(size) / float(limit), unit)
-        return '0 B'
-
-    def human_duration(self, duration):
-        units = [(3600, 'h'),
-                 (60, 'min'),
-                 (1, 's')]
-
-        parts = []
-        for limit, unit in units:
-            count = int(duration) / limit
-            duration %= limit
-            if count > 0:
-                parts.append('%d %s' % (count, unit))
-        return ' '.join(parts or ['0 s'])
-
 
 def make_hardlinks(duplicates):
     canonical = duplicates.pop()
@@ -202,8 +216,7 @@ def main():
 
     opts, args = parser.parse_args()
 
-    progress = ProgressReporter(opts.progress)
-    dupfinder = DuplicateFileFinder(progress)
+    dupfinder = DuplicateFileFinder(opts.progress)
 
     for dirname in sorted(args):
         dupfinder.collect(dirname)
-- 
cgit v1.2.1
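
The duplicates() hunk drives the third-party progressbar module through its
start/update/finish cycle. A minimal standalone sketch of that usage, limited
to the widgets and calls that actually appear in the patch (the byte counts
below are invented for illustration):

import time

import progressbar

total_bytes = 10 * 1024 * 1024          # invented workload size
widgets = [
    progressbar.FileTransferSpeed(), ' ',
    progressbar.Percentage(), ' ',
    progressbar.Bar(), ' ',
    progressbar.ETA(),
]
pbar = progressbar.ProgressBar(maxval=total_bytes, widgets=widgets)
pbar.start()
done_bytes = 0
while done_bytes < total_bytes:
    time.sleep(0.01)                    # stands in for hashing a chunk
    done_bytes += 1024 * 1024           # pretend a megabyte was processed
    pbar.update(done_bytes)
pbar.finish()

Note that the ProgressBarValue and ProgressBarMaxval widgets the patch defines
are not placed in the widget list above; they appear to be left available for
other progress-bar layouts.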
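
The new File class implements what the removed FIXME comment sketched: a
shared pool of open file objects, where a randomly chosen pool member is
evicted when open() fails with EMFILE, and each logical file remembers its
offset so a reopened file can seek back to where it left off. A usage sketch,
assuming the File class and the filestats counters from the patched dupfiles
are in scope (the pathnames are hypothetical):

paths = ['/tmp/a', '/tmp/b', '/tmp/c']  # hypothetical example files
files = [File(p) for p in paths]

# Read the files in lockstep, as find_duplicates() does; the pool
# transparently closes and reopens descriptors behind the scenes.
chunks = [f.read(4096) for f in files]
for f in files:
    f.close()

print 'opens: %d, closes: %d, pool hits: %d' % (
    filestats.open_count, filestats.close_count, filestats.hit_count)

Evicting a random victim rather than, say, the least recently used file keeps
the bookkeeping trivial, at the cost of occasionally closing a file that is
about to be read again.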