From 626248d5a81c7bdc76655da0aafa02ed3ae64186 Mon Sep 17 00:00:00 2001
From: Lars Wirzenius
Date: Sun, 25 Apr 2010 06:34:49 +1200
Subject: Remove old FIXME, add new one.

---
 dupfiles | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'dupfiles')

diff --git a/dupfiles b/dupfiles
index abcd67d..35742f1 100755
--- a/dupfiles
+++ b/dupfiles
@@ -66,12 +66,6 @@ class DuplicateFileFinder(object):
                 self.by_size[stat.st_size] = [t]
         self.progress.finished()
 
-    # FIXME: This computed the checksum for each file, the full file.
-    # It might be faster to read all files (of the same size) in parallel,
-    # in suitably small chunks, and compute checksums for all of them in
-    # parallel. When checksums diverge, split the files into smaller
-    # groups. If a group becomes just a single file, that file is not
-    # a duplicate and can be dropped completely.
     def duplicates(self):
         skip = [size for size in self.by_size if len(self.by_size[size]) == 1]
         for size in skip:
@@ -98,6 +92,13 @@ class DuplicateFileFinder(object):
         of pathnames to files that are identical.
 
         '''
+
+        # FIXME: This assumes it can open every file at the same time.
+        # If there are a lot of files, that's not going to be possible.
+        # This might work: keep a pool of open files, and record how
+        # far you got with each file. Then close and re-open files as
+        # necessary, if the pool is too small. When re-opening, seek
+        # to the remembered position.
         result = []
         identical_groups = [[(x, file(x)) for x in pathnames]]
 
-- 
cgit v1.2.1