diff options
-rw-r--r-- | debian/changelog | 19 | ||||
-rw-r--r-- | debian/control | 3 | ||||
-rwxr-xr-x | dupfiles | 83 | ||||
-rw-r--r-- | dupfiles.1 | 6 |
4 files changed, 70 insertions, 41 deletions
diff --git a/debian/changelog b/debian/changelog index 87d1549..76a87d2 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,22 @@ +dupfiles (5.1) squeeze; urgency=low + + * Add --remove option. + + -- Lars Wirzenius <liw@liw.fi> Wed, 22 Dec 2010 09:04:17 +0000 + +dupfiles (5) squeeze; urgency=low + + * Use python-ttystatus for progress reporting. + + -- Lars Wirzenius <liw@liw.fi> Sat, 09 Oct 2010 21:01:52 +0100 + +dupfiles (4) squeeze; urgency=low + + * Add dependency on python-progressbar. + * Only look at regular files, ignore symlinks etc. + + -- Lars Wirzenius <liw@liw.fi> Sat, 01 May 2010 06:03:14 +1200 + dupfiles (3) squeeze; urgency=low * Progress reporting fix. diff --git a/debian/control b/debian/control index e2b592d..a951e5c 100644 --- a/debian/control +++ b/debian/control @@ -7,7 +7,8 @@ Build-Depends: debhelper (>= 7.3.8), python-support (>= 1.0.3), python (>= 2.5) Package: dupfiles Architecture: all -Depends: ${python:Depends}, ${misc:Depends}, python (>= 2.5) +Depends: ${python:Depends}, ${misc:Depends}, python (>= 2.5), + python-progressbar Description: find and optionally hard-link duplicate files dupfiles finds files with identical content, and either reports them, or hardlinks them to each other. @@ -21,23 +21,11 @@ import errno import hashlib import optparse import os -import progressbar import random import stat import sys import time - - -class ProgressBarValue(progressbar.ProgressBarWidget): - - def update(self, pbar): - return '%s' % pbar.currval - - -class ProgressBarMaxval(progressbar.ProgressBarWidget): - - def update(self, pbar): - return '%s' % pbar.maxval +import ttystatus class FileStats(object): @@ -92,10 +80,13 @@ class DuplicateFileFinder(object): self.progress = progress def collect(self, root): + ts = ttystatus.TerminalStatus() if self.progress: - sys.stderr.write('Scanning %s\n' % root) + ts.add(ttystatus.Literal('Scanning ')) + ts.add(ttystatus.Pathname('dirname')) for dirname, subdirs, filenames in os.walk(root): + ts['dirname'] = dirname subdirs.sort() filenames.sort() pathnames = [os.path.join(dirname, f) for f in filenames] @@ -107,40 +98,44 @@ class DuplicateFileFinder(object): self.by_size[st.st_size].append(t) else: self.by_size[st.st_size] = [t] + ts.clear() def duplicates(self): - skip = [size for size in self.by_size if len(self.by_size[size]) == 1] - for size in skip: - del self.by_size[size] - total_bytes = sum(len(tuples) * size - for size, tuples in self.by_size.iteritems()) + for size, tuples in self.by_size.iteritems()) + + ts = ttystatus.TerminalStatus(period=0.5) + ts['done'] = 0 if self.progress: - widgets = [ - progressbar.FileTransferSpeed(), ' ', - progressbar.Percentage(), ' ', - progressbar.Bar(), ' ', - progressbar.ETA(), - ] - pbar = progressbar.ProgressBar(maxval=total_bytes, widgets=widgets) - pbar.start() + ts.add(ttystatus.Literal('Comparing: ')) + ts.add(ttystatus.ByteSize('done')) + ts.add(ttystatus.Literal('/')) + ts.add(ttystatus.ByteSize('total')) + ts.add(ttystatus.Literal(' (')) + ts.add(ttystatus.PercentDone('done', 'total')) + ts.add(ttystatus.Literal('), group ')) + ts.add(ttystatus.Counter('size')) + ts.add(ttystatus.Literal('/')) + ts.add(ttystatus.Literal(str(len(self.by_size)))) + ts.add(ttystatus.Literal(' (')) + ts.add(ttystatus.ByteSize('size')) + ts.add(ttystatus.Literal(')')) result = [] - done_bytes = 0 + ith = 0 for size, tuples in sorted(self.by_size.iteritems()): - if len(set((dev, ino) for dev, ino, pathname in tuples)) == 1: - # All duplicates are hardlinks to the same inode. Skip. - done_bytes += len(tuples) * size - else: - new_dups = self.find_duplicates([p for d, i, p in tuples]) - result += new_dups - done_bytes += len(tuples) * size - - if self.progress: - pbar.update(done_bytes) + ith += 1 + if len(set((dev, ino) for dev, ino, pathname in tuples)) > 1: + # All files are not hardlinks to the same inode. + # (This also excludes groups with just one file.) + result += self.find_duplicates([p for d, i, p in tuples]) + ts['size'] = size + ts['done'] += len(tuples) * size + ts['total'] = total_bytes if self.progress: - pbar.finish() + ts.finish() + return result def find_duplicates(self, pathnames): @@ -203,6 +198,12 @@ def make_hardlinks(duplicates): os.link(canonical, pathname) +def remove_all_but_one(duplicates): + keep = duplicates.pop() + for pathname in duplicates: + os.remove(pathname) + + def report(duplicates): sys.stdout.write('\n'.join(duplicates)) sys.stdout.write('\n\n') @@ -214,6 +215,8 @@ def main(): help='hardlink duplicate files to each other') parser.add_option('--progress', action='store_true', help='report progress') + parser.add_option('--remove', action='store_true', + help='remove all but one copy of identical files') opts, args = parser.parse_args() @@ -225,6 +228,8 @@ def main(): for duplicates in dupfinder.duplicates(): if opts.make_hardlinks: make_hardlinks(duplicates) + elif opts.remove: + remove_all_but_one(duplicates) else: report(duplicates) @@ -20,6 +20,7 @@ dupfiles \- find identical files and optionally hardlink them .B dupfiles .RB [ --make-hardlinks ] .RB [ --progress ] +.RB [ --remove ] .RI [ file ]... .SH DESCRIPTION .B dupfiles @@ -43,7 +44,10 @@ Report progress while program is running. .BR --make-hardlinks Instead of reporting files that have been found, make sure all names point at the same content. -FIXME. +.TP +.BR --remove +Remove all but one copy of identical files. +The copy that gets kept is chosen arbitrarily. .SH EXAMPLE To find all duplicate packages in Debian and Ubuntu: .IP |