From f8b8501cba8546ecad422a6a41dca944bb3e94a5 Mon Sep 17 00:00:00 2001
From: Lars Wirzenius
Date: Sun, 25 Apr 2010 06:33:14 +1200
Subject: Compare files chunk-by-chunk instead of checksums for whole files.

This should be way more effective when there are a lot of files. We now
stop comparing as soon as there is a difference.

We can't handle it if too many files of the same size need to be opened,
but that's a problem for another day.
---
 dupfiles | 75 +++++++++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 22 deletions(-)

diff --git a/dupfiles b/dupfiles
index dd0364a..abcd67d 100755
--- a/dupfiles
+++ b/dupfiles
@@ -87,20 +87,62 @@ class DuplicateFileFinder(object):
             if len(set((dev, ino) for dev, ino, pathname in tuples)) == 1:
                 # All duplicates are hardlinks to the same inode. Skip.
                 continue
-            by_checksum = dict()
-            for dev, ino, pathname in tuples:
-                checksum = self.file_checksum(pathname)
-                if checksum not in by_checksum:
-                    by_checksum[checksum] = set()
-                by_checksum[checksum].add(pathname)
-            done_bytes += size
-            self.duplicates_progress(done_bytes, total_bytes, start_time)
-            for names in by_checksum.itervalues():
-                if len(names) > 1:
-                    result.append(names)
+            result += self.find_duplicates([p for d, i, p in tuples])
         self.progress.finished()
         return result
 
+    def find_duplicates(self, pathnames):
+        '''Find sets of duplicate files.
+
+        Return list of groups of identical files, where each group is a list
+        of pathnames to files that are identical.
+
+        '''
+
+        result = []
+        identical_groups = [[(x, file(x)) for x in pathnames]]
+
+        while identical_groups:
+            new = []
+            for group in identical_groups:
+                done, not_done = self.read_next_chunks(group)
+                if len(done) > 1:
+                    result.append(done)
+                while not_done:
+                    key = not_done[0][2]
+                    temp2 = [t for t in not_done if t[2] == key]
+                    not_done = [t for t in not_done if t[2] != key]
+                    if len(temp2) > 1:
+                        new.append([(pathname, f)
+                                    for pathname, f, data in temp2])
+            identical_groups = new
+
+        return result
+
+    def read_next_chunks(self, group):
+        '''Read next chunk of data in each file.
+
+        group is a list of tuples (pathname, open file). A chunk of data
+        is read from each file.
+
+        Return value is two lists: one with filenames that reached the
+        end of the file, and one with tuples (pathname, open file,
+        chunk).
+
+        '''
+
+        done = []
+        not_done = []
+        chunk_size = 64 * 1024
+        for pathname, f in group:
+            data = f.read(chunk_size)
+            if not data:
+                f.close()
+                done.append(pathname)
+            else:
+                not_done.append((pathname, f, data))
+        return done, not_done
+
     def duplicates_progress(self, done, total, started):
         duration = time.time() - started
         if duration < 1:
@@ -121,17 +163,6 @@ class DuplicateFileFinder(object):
             return '%.1f %s' % (float(size) / float(limit), unit)
         return '0 B'
 
-    def file_checksum(self, pathname):
-        cs = hashlib.md5()
-        f = file(pathname, 'rb')
-        while True:
-            data = f.read(64*1024)
-            if not data:
-                break
-            cs.update(data)
-        f.close()
-        return cs.digest()
-
 
 def make_hardlinks(duplicates):
     canonical = duplicates.pop()
--
cgit v1.2.1
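
What follows is not part of the commit; it is a minimal standalone sketch of
the same chunk-by-chunk idea, for readers who want to try it outside dupfiles.
It is rewritten for Python 3 (the patch itself is Python 2 code, hence file()
and itervalues()), and the name find_duplicate_groups, the CHUNK_SIZE constant,
and the dict keyed on chunk data are illustrative choices of this sketch, not
dupfiles API. Unlike the patch, the sketch also closes files that become unique
part-way through.

import sys

CHUNK_SIZE = 64 * 1024

def find_duplicate_groups(pathnames):
    '''Return lists of pathnames whose contents are byte-for-byte identical.'''
    result = []
    # Start with a single group holding every candidate, opened in binary mode.
    groups = [[(p, open(p, 'rb')) for p in pathnames]]
    while groups:
        next_groups = []
        for group in groups:
            finished = []   # files that hit EOF on this round
            by_chunk = {}   # chunk bytes -> [(pathname, open file), ...]
            for pathname, f in group:
                data = f.read(CHUNK_SIZE)
                if not data:
                    f.close()
                    finished.append(pathname)
                else:
                    by_chunk.setdefault(data, []).append((pathname, f))
            # Files that reached EOF together matched on every earlier chunk,
            # so they are identical.
            if len(finished) > 1:
                result.append(finished)
            # Only files whose latest chunks still match can be duplicates;
            # a singleton is unique, so close it and drop it now.
            for members in by_chunk.values():
                if len(members) > 1:
                    next_groups.append(members)
                else:
                    members[0][1].close()
        groups = next_groups
    return result

if __name__ == '__main__':
    # Like dupfiles, this works best when the arguments already share a size;
    # files of different sizes still separate correctly, just only at EOF.
    for names in find_duplicate_groups(sys.argv[1:]):
        print(' '.join(sorted(names)))

Saved as, say, sketch.py (a hypothetical filename), it can be run as
"python3 sketch.py a.bin b.bin c.bin" and prints one group of identical files
per line. As in the patch, comparison stops for a pair of files as soon as one
chunk differs, which is what makes this cheaper than checksumming every file in
full.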