summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--debian/changelog19
-rw-r--r--debian/control3
-rwxr-xr-xdupfiles83
-rw-r--r--dupfiles.16
4 files changed, 70 insertions, 41 deletions
diff --git a/debian/changelog b/debian/changelog
index 87d1549..76a87d2 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,22 @@
+dupfiles (5.1) squeeze; urgency=low
+
+ * Add --remove option.
+
+ -- Lars Wirzenius <liw@liw.fi> Wed, 22 Dec 2010 09:04:17 +0000
+
+dupfiles (5) squeeze; urgency=low
+
+ * Use python-ttystatus for progress reporting.
+
+ -- Lars Wirzenius <liw@liw.fi> Sat, 09 Oct 2010 21:01:52 +0100
+
+dupfiles (4) squeeze; urgency=low
+
+ * Add dependency on python-progressbar.
+ * Only look at regular files, ignore symlinks etc.
+
+ -- Lars Wirzenius <liw@liw.fi> Sat, 01 May 2010 06:03:14 +1200
+
dupfiles (3) squeeze; urgency=low
* Progress reporting fix.
diff --git a/debian/control b/debian/control
index e2b592d..a951e5c 100644
--- a/debian/control
+++ b/debian/control
@@ -7,7 +7,8 @@ Build-Depends: debhelper (>= 7.3.8), python-support (>= 1.0.3), python (>= 2.5)
Package: dupfiles
Architecture: all
-Depends: ${python:Depends}, ${misc:Depends}, python (>= 2.5)
+Depends: ${python:Depends}, ${misc:Depends}, python (>= 2.5),
+ python-progressbar
Description: find and optionally hard-link duplicate files
dupfiles finds files with identical content, and either reports them,
or hardlinks them to each other.
diff --git a/dupfiles b/dupfiles
index 7c8af79..78e4b5f 100755
--- a/dupfiles
+++ b/dupfiles
@@ -21,23 +21,11 @@ import errno
import hashlib
import optparse
import os
-import progressbar
import random
import stat
import sys
import time
-
-
-class ProgressBarValue(progressbar.ProgressBarWidget):
-
- def update(self, pbar):
- return '%s' % pbar.currval
-
-
-class ProgressBarMaxval(progressbar.ProgressBarWidget):
-
- def update(self, pbar):
- return '%s' % pbar.maxval
+import ttystatus
class FileStats(object):
@@ -92,10 +80,13 @@ class DuplicateFileFinder(object):
self.progress = progress
def collect(self, root):
+ ts = ttystatus.TerminalStatus()
if self.progress:
- sys.stderr.write('Scanning %s\n' % root)
+ ts.add(ttystatus.Literal('Scanning '))
+ ts.add(ttystatus.Pathname('dirname'))
for dirname, subdirs, filenames in os.walk(root):
+ ts['dirname'] = dirname
subdirs.sort()
filenames.sort()
pathnames = [os.path.join(dirname, f) for f in filenames]
@@ -107,40 +98,44 @@ class DuplicateFileFinder(object):
self.by_size[st.st_size].append(t)
else:
self.by_size[st.st_size] = [t]
+ ts.clear()
def duplicates(self):
- skip = [size for size in self.by_size if len(self.by_size[size]) == 1]
- for size in skip:
- del self.by_size[size]
-
total_bytes = sum(len(tuples) * size
- for size, tuples in self.by_size.iteritems())
+ for size, tuples in self.by_size.iteritems())
+
+ ts = ttystatus.TerminalStatus(period=0.5)
+ ts['done'] = 0
if self.progress:
- widgets = [
- progressbar.FileTransferSpeed(), ' ',
- progressbar.Percentage(), ' ',
- progressbar.Bar(), ' ',
- progressbar.ETA(),
- ]
- pbar = progressbar.ProgressBar(maxval=total_bytes, widgets=widgets)
- pbar.start()
+ ts.add(ttystatus.Literal('Comparing: '))
+ ts.add(ttystatus.ByteSize('done'))
+ ts.add(ttystatus.Literal('/'))
+ ts.add(ttystatus.ByteSize('total'))
+ ts.add(ttystatus.Literal(' ('))
+ ts.add(ttystatus.PercentDone('done', 'total'))
+ ts.add(ttystatus.Literal('), group '))
+ ts.add(ttystatus.Counter('size'))
+ ts.add(ttystatus.Literal('/'))
+ ts.add(ttystatus.Literal(str(len(self.by_size))))
+ ts.add(ttystatus.Literal(' ('))
+ ts.add(ttystatus.ByteSize('size'))
+ ts.add(ttystatus.Literal(')'))
result = []
- done_bytes = 0
+ ith = 0
for size, tuples in sorted(self.by_size.iteritems()):
- if len(set((dev, ino) for dev, ino, pathname in tuples)) == 1:
- # All duplicates are hardlinks to the same inode. Skip.
- done_bytes += len(tuples) * size
- else:
- new_dups = self.find_duplicates([p for d, i, p in tuples])
- result += new_dups
- done_bytes += len(tuples) * size
-
- if self.progress:
- pbar.update(done_bytes)
+ ith += 1
+ if len(set((dev, ino) for dev, ino, pathname in tuples)) > 1:
+            # Not all files are hardlinks to the same inode.
+ # (This also excludes groups with just one file.)
+ result += self.find_duplicates([p for d, i, p in tuples])
+ ts['size'] = size
+ ts['done'] += len(tuples) * size
+ ts['total'] = total_bytes
if self.progress:
- pbar.finish()
+ ts.finish()
+
return result
def find_duplicates(self, pathnames):
@@ -203,6 +198,12 @@ def make_hardlinks(duplicates):
os.link(canonical, pathname)
+def remove_all_but_one(duplicates):
+ keep = duplicates.pop()
+ for pathname in duplicates:
+ os.remove(pathname)
+
+
def report(duplicates):
sys.stdout.write('\n'.join(duplicates))
sys.stdout.write('\n\n')
@@ -214,6 +215,8 @@ def main():
help='hardlink duplicate files to each other')
parser.add_option('--progress', action='store_true',
help='report progress')
+ parser.add_option('--remove', action='store_true',
+ help='remove all but one copy of identical files')
opts, args = parser.parse_args()
@@ -225,6 +228,8 @@ def main():
for duplicates in dupfinder.duplicates():
if opts.make_hardlinks:
make_hardlinks(duplicates)
+ elif opts.remove:
+ remove_all_but_one(duplicates)
else:
report(duplicates)
diff --git a/dupfiles.1 b/dupfiles.1
index 1bcf159..ee66d9b 100644
--- a/dupfiles.1
+++ b/dupfiles.1
@@ -20,6 +20,7 @@ dupfiles \- find identical files and optionally hardlink them
.B dupfiles
.RB [ --make-hardlinks ]
.RB [ --progress ]
+.RB [ --remove ]
.RI [ file ]...
.SH DESCRIPTION
.B dupfiles
@@ -43,7 +44,10 @@ Report progress while program is running.
.BR --make-hardlinks
Instead of reporting files that have been found,
make sure all names point at the same content.
-FIXME.
+.TP
+.BR --remove
+Remove all but one copy of identical files.
+The copy that gets kept is chosen arbitrarily.
.SH EXAMPLE
To find all duplicate packages in Debian and Ubuntu:
.IP