diff options
author | Lars Wirzenius <liw@liw.fi> | 2010-04-18 08:55:25 +1200 |
---|---|---|
committer | Lars Wirzenius <liw@liw.fi> | 2010-04-18 08:55:25 +1200 |
commit | 21e14e7ed7daeb4abf065b200256db765f133eb6 (patch) | |
tree | ffbca1d7e17188203771f8acd6e7390f1fe75538 | |
parent | c67d8ec2f2ca1523c4782287c40cd1a5e5953a4b (diff) | |
download | dupfiles-21e14e7ed7daeb4abf065b200256db765f133eb6.tar.gz |
Add some progress reporting.
Add FIXME comment with idea for optimization. I am not going
to implement it right now, since right now I can afford to wait
for the script (it runs in the background, while I do other
things).
-rwxr-xr-x | dupfiles | 17 |
1 files changed, 16 insertions, 1 deletions
@@ -67,7 +67,16 @@ class DuplicateFileFinder(object):
                                      set([pathname]))
         self.progress.finished()
 
+    # FIXME: This computed the checksum for each file, the full file.
+    # It might be faster to read all files (of the same size) in parallel,
+    # in suitably small chunks, and compute checksums for all of them in
+    # parallel. When checksums diverge, split the files into smaller
+    # groups. If a group becomes just a single file, that file is not
+    # a duplicate and can be dropped completely.
     def duplicates(self):
+        total_files = sum(len(x[2]) for x in self.by_size.itervalues())
+        done_files = 0
+        result = []
         for dev, ino, pathnames in self.by_size.itervalues():
             by_checksum = dict()
             for pathname in pathnames:
@@ -77,7 +86,13 @@ class DuplicateFileFinder(object):
                     by_checksum[checksum].add(pathname)
             for names in by_checksum.itervalues():
                 if len(names) > 1:
-                    yield names
+                    result.append(names)
+            done_files += len(names)
+            self.progress.write('%d/%d (%.1f%%) files done' %
+                                (done_files, total_files,
+                                 100.0 * float(done_files) / total_files))
+        self.progress.finished()
+        return result
 
     def file_checksum(self, pathname):
         return hashlib.md5(file(pathname, 'rb').read()).digest()