author    Lars Wirzenius <liw@liw.fi>  2010-04-30 19:31:17 +1200
committer Lars Wirzenius <liw@liw.fi>  2010-04-30 19:31:17 +1200
commit    88cbe745cd55eca07407b6a7798fafaa4dab225c (patch)
tree      69a58991aa8068706137b0f2bc60611f7c841b66
parent    42bc05a46c3ecfcb8923317ee791c8cf5cab71fa (diff)
download  dupfiles-88cbe745cd55eca07407b6a7798fafaa4dab225c.tar.gz
Use progressbar library instead of custom code.
-rwxr-xr-x  dupfiles  147
1 file changed, 80 insertions(+), 67 deletions(-)
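The hand-rolled ProgressReporter below is replaced with the third-party
progressbar module. For orientation, a minimal sketch of the progressbar
2.x API exactly as this commit uses it; the widget names and calls are
taken from the diff, the byte total is a made-up placeholder:

    import progressbar

    total_bytes = 10 * 1024 * 1024   # hypothetical total, for illustration only
    widgets = [
        progressbar.FileTransferSpeed(), ' ',
        progressbar.Percentage(), ' ',
        progressbar.Bar(), ' ',
        progressbar.ETA(),
    ]
    pbar = progressbar.ProgressBar(maxval=total_bytes, widgets=widgets)
    pbar.start()
    for done in xrange(0, total_bytes + 1, 1024 * 1024):
        pbar.update(done)   # update() takes the absolute value, not an increment
    pbar.finish()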
diff --git a/dupfiles b/dupfiles
index 3822eaa..b1de7fc 100755
--- a/dupfiles
+++ b/dupfiles
@@ -17,34 +17,71 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
+import errno
import hashlib
import optparse
import os
+import progressbar
+import random
import stat
import sys
import time
-class ProgressReporter(object):
+class ProgressBarValue(progressbar.ProgressBarWidget):
- def __init__(self, do_report):
- self.written = ''
- self.when = 0
- self.do_report = do_report
-
- def write(self, msg):
- if self.do_report and time.time() - self.when >= 1:
- sys.stdout.flush()
- sys.stderr.write('\b \b' * len(self.written))
- msg = msg[:79] # FIXME: use real screen width
- sys.stderr.write(msg)
- sys.stderr.flush()
- self.written = msg
- self.when = time.time()
+ def update(self, pbar):
+ return '%s' % pbar.currval
+
+
+class ProgressBarMaxval(progressbar.ProgressBarWidget):
+
+ def update(self, pbar):
+ return '%s' % pbar.maxval
+
+
+class FileStats(object):
- def finished(self):
- self.when = 0
- self.write('')
+ def __init__(self):
+ self.open_count = 0
+ self.close_count = 0
+ self.hit_count = 0
+
+filestats = FileStats()
+filepool = dict()
+
+class File(object):
+
+ def __init__(self, pathname):
+ self.pathname = pathname
+ self.offset = 0
+
+ def read(self, num_bytes):
+ if self.pathname in filepool:
+ f = filepool[self.pathname]
+ filestats.hit_count += 1
+ else:
+ try:
+ f = file(self.pathname)
+ except IOError, e:
+ if e.errno != errno.EMFILE:
+ raise
+ victim = random.choice(filepool.keys())
+ filepool[victim].close()
+ del filepool[victim]
+ filestats.close_count += 1
+ f = file(self.pathname)
+ f.seek(self.offset)
+ filepool[self.pathname] = f
+ filestats.open_count += 1
+ data = f.read(num_bytes)
+ self.offset += len(data)
+ return data
+
+ def close(self):
+ if self.pathname in filepool:
+ filepool[self.pathname].close()
+ del filepool[self.pathname]
class DuplicateFileFinder(object):
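The new File class above implements the pooling strategy that the old
FIXME comment (removed further down) only described: any number of File
objects may be live at once, while the shared filepool caps real file
descriptors by evicting a random victim when open() fails with EMFILE,
then reopening and seeking back to the remembered offset on the next
read. A minimal usage sketch, not part of the commit; the pathnames are
made up:

    paths = ['/tmp/a', '/tmp/b', '/tmp/c']   # hypothetical pathnames
    files = [File(p) for p in paths]
    for f in files:
        data = f.read(4096)   # reopens and seeks transparently if evicted
    for f in files:
        f.close()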
@@ -52,10 +89,12 @@ class DuplicateFileFinder(object):
def __init__(self, progress):
self.by_size = dict()
self.progress = progress
-
+
def collect(self, root):
+ if self.progress:
+ sys.stderr.write('Scanning %s\n' % root)
+
for dirname, subdirs, filenames in os.walk(root):
- self.progress.write(dirname)
subdirs.sort()
filenames.sort()
pathnames = [os.path.join(dirname, f) for f in filenames]
@@ -67,7 +106,6 @@ class DuplicateFileFinder(object):
self.by_size[st.st_size].append(t)
else:
self.by_size[st.st_size] = [t]
- self.progress.finished()
def duplicates(self):
skip = [size for size in self.by_size if len(self.by_size[size]) == 1]
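The two context lines above are the size prefilter: only files of equal
st_size can be byte-identical, so any size bucket holding a single entry
is dropped before a single byte of file contents is read. A standalone
sketch of the same idea, with made-up dev/ino/pathname tuples:

    by_size = {
        100: [(1, 10, 'only-one-this-size')],
        200: [(1, 11, 'maybe-dup-a'), (1, 12, 'maybe-dup-b')],
    }
    skip = [size for size in by_size if len(by_size[size]) == 1]
    for size in skip:
        del by_size[size]
    # by_size now only holds buckets with two or more candidates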
@@ -75,21 +113,33 @@ class DuplicateFileFinder(object):
del self.by_size[size]
total_bytes = sum(len(tuples) * size
- for size, tuples in self.by_size.iteritems())
- done_bytes = 0
- start_time = time.time()
+ for size, tuples in self.by_size.iteritems())
+ if self.progress:
+ widgets = [
+ progressbar.FileTransferSpeed(), ' ',
+ progressbar.Percentage(), ' ',
+ progressbar.Bar(), ' ',
+ progressbar.ETA(),
+ ]
+ pbar = progressbar.ProgressBar(maxval=total_bytes, widgets=widgets)
+ pbar.start()
result = []
- for size, tuples in self.by_size.iteritems():
+ done_bytes = 0
+ for size, tuples in sorted(self.by_size.iteritems()):
if len(set((dev, ino) for dev, ino, pathname in tuples)) == 1:
# All duplicates are hardlinks to the same inode. Skip.
done_bytes += len(tuples) * size
else:
new_dups = self.find_duplicates([p for d, i, p in tuples])
result += new_dups
- done_bytes += len(new_dups) * size
- self.duplicates_progress(done_bytes, total_bytes, start_time)
- self.progress.finished()
+ done_bytes += len(tuples) * size
+
+ if self.progress:
+ pbar.update(done_bytes)
+
+ if self.progress:
+ pbar.finish()
return result
def find_duplicates(self, pathnames):
@@ -100,15 +150,8 @@ class DuplicateFileFinder(object):
'''
- # FIXME: This assumes it can open every file at the same time.
- # If there are a lot of files, that's not going to be possible.
- # This might work: keep a pool of open files, and record how
- # far you got with each file. Then close and re-open files as
- # necessary, if the pool is too small. When re-opening, seek
- # to the remembered position.
-
result = []
- identical_groups = [[(x, file(x)) for x in pathnames]]
+ identical_groups = [[(x, File(x)) for x in pathnames]]
while identical_groups:
new = []
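The loop starting here narrows identical_groups by reading candidates in
lock-step: a group splits whenever its members disagree on the next
chunk of data, and a group whose data runs out while still together is a
confirmed set of duplicates. A simplified sketch of one narrowing step,
not the commit's exact code; the chunk size is arbitrary:

    CHUNK = 64 * 1024   # arbitrary chunk size for the sketch

    def narrow(group):
        # group is a list of (pathname, File) pairs at the same offset;
        # bucket members by their next chunk so differing files separate.
        by_chunk = {}
        for pathname, f in group:
            by_chunk.setdefault(f.read(CHUNK), []).append((pathname, f))
        return by_chunk.values()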
@@ -151,35 +194,6 @@ class DuplicateFileFinder(object):
not_done.append((pathname, f, data))
return done, not_done
- def duplicates_progress(self, done, total, started):
- duration = time.time() - started
- self.progress.write('%s/%s (%.1f%%) done (%s)' %
- (self.human_size(done), self.human_size(total),
- 100.0 * float(done) / float(total),
- self.human_duration(duration)))
-
- def human_size(self, size):
- tab = [(1024**3, 'GiB'),
- (1024**2, 'MiB'),
- (1024**1, 'KiB')]
- for limit, unit in tab:
- if size >= limit:
- return '%.1f %s' % (float(size) / float(limit), unit)
- return '0 B'
-
- def human_duration(self, duration):
- units = [(3600, 'h'),
- (60, 'min'),
- (1, 's')]
-
- parts = []
- for limit, unit in units:
- count = int(duration) / limit
- duration %= limit
- if count > 0:
- parts.append('%d %s' % (count, unit))
- return ' '.join(parts or ['0 s'])
-
def make_hardlinks(duplicates):
canonical = duplicates.pop()
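Only the first line of make_hardlinks is visible in this hunk. For
context, a hypothetical sketch of what replacing duplicates with
hardlinks to one canonical copy typically looks like; the body below is
an assumption, not the commit's code:

    import os

    def make_hardlinks_sketch(duplicates):    # hypothetical stand-in
        canonical = duplicates.pop()
        for pathname in duplicates:
            os.remove(pathname)               # assumed: unlink the copy first
            os.link(canonical, pathname)      # then hardlink to the canonical file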
@@ -202,8 +216,7 @@ def main():
opts, args = parser.parse_args()
- progress = ProgressReporter(opts.progress)
- dupfinder = DuplicateFileFinder(progress)
+ dupfinder = DuplicateFileFinder(opts.progress)
for dirname in sorted(args):
dupfinder.collect(dirname)