summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Lars Wirzenius <liw@liw.fi> 2010-04-25 06:34:49 +1200
committer Lars Wirzenius <liw@liw.fi> 2010-04-25 06:34:49 +1200
commit 626248d5a81c7bdc76655da0aafa02ed3ae64186 (patch)
tree f24238d645ac9f8a7cc684bf625bfde40f97d666
parent f8b8501cba8546ecad422a6a41dca944bb3e94a5 (diff)
download dupfiles-626248d5a81c7bdc76655da0aafa02ed3ae64186.tar.gz
Remove old FIXME, add new one.
-rwxr-xr-x dupfiles 13
1 files changed, 7 insertions, 6 deletions
diff --git a/dupfiles b/dupfiles
index abcd67d..35742f1 100755
--- a/dupfiles
+++ b/dupfiles
@@ -66,12 +66,6 @@ class DuplicateFileFinder(object):
self.by_size[stat.st_size] = [t]
self.progress.finished()
- # FIXME: This computed the checksum for each file, the full file.
- # It might be faster to read all files (of the same size) in parallel,
- # in suitably small chunks, and compute checksums for all of them in
- # parallel. When checksums diverge, split the files into smaller
- # groups. If a group becomes just a single file, that file is not
- # a duplicate and can be dropped completely.
def duplicates(self):
skip = [size for size in self.by_size if len(self.by_size[size]) == 1]
for size in skip:
@@ -98,6 +92,13 @@ class DuplicateFileFinder(object):
of pathnames to files that are identical.
'''
+
+ # FIXME: This assumes it can open every file at the same time.
+ # If there are a lot of files, that's not going to be possible.
+ # This might work: keep a pool of open files, and record how
+ # far you got with each file. Then close and re-open files as
+ # necessary, if the pool is too small. When re-opening, seek
+ # to the remembered position.
result = []
identical_groups = [[(x, file(x)) for x in pathnames]]