diff options
author | Lars Wirzenius <liw@liw.fi> | 2010-04-25 06:33:14 +1200 |
---|---|---|
committer | Lars Wirzenius <liw@liw.fi> | 2010-04-25 06:33:14 +1200 |
commit | f8b8501cba8546ecad422a6a41dca944bb3e94a5 (patch) | |
tree | 3a3a5fdd6c45874b585c985212aa0dd66eab0be7 | |
parent | 2fc01f7cad3f0df92aa08b22072ee51f3b1b9db4 (diff) | |
download | dupfiles-f8b8501cba8546ecad422a6a41dca944bb3e94a5.tar.gz |
Compare files chunk-by-chunk instead of checksums for whole files.
This should be way more effective when there are a lot of files.
We now stop comparing as soon as there is a difference.
We can't handle it if too many files of the same size need to be opened,
but that's a problem for another day.
-rwxr-xr-x | dupfiles | 75 |
1 files changed, 53 insertions, 22 deletions
@@ -87,20 +87,62 @@ class DuplicateFileFinder(object): if len(set((dev, ino) for dev, ino, pathname in tuples)) == 1: # All duplicates are hardlinks to the same inode. Skip. continue - by_checksum = dict() - for dev, ino, pathname in tuples: - checksum = self.file_checksum(pathname) - if checksum not in by_checksum: - by_checksum[checksum] = set() - by_checksum[checksum].add(pathname) - done_bytes += size - self.duplicates_progress(done_bytes, total_bytes, start_time) - for names in by_checksum.itervalues(): - if len(names) > 1: - result.append(names) + result += self.find_duplicates([p for d, i, p in tuples]) self.progress.finished() return result + def find_duplicates(self, pathnames): + '''Find sets of duplicate files. + + Return list of groups of identical files, where each group is a list + of pathnames to files that are identical. + + ''' + + result = [] + identical_groups = [[(x, file(x)) for x in pathnames]] + + while identical_groups: + new = [] + for group in identical_groups: + done, not_done = self.read_next_chunks(group) + if len(done) > 1: + result.append(done) + while not_done: + key = not_done[0][2] + temp2 = [t for t in not_done if t[2] == key] + not_done = [t for t in not_done if t[2] != key] + if len(temp2) > 1: + new.append([(pathname, f) + for pathname, f, data in temp2]) + identical_groups = new + + return result + + def read_next_chunks(self, group): + '''Read next chunk of data in each file. + + group is a list of tuples (pathname, open file). A chunk of data + is read from each file. + + Return value is two lists: one with filenames that reached the + end of the file, and one with tuples (pathname, open file, + chunk). + + ''' + + done = [] + not_done = [] + chunk_size = 64 * 1024 + for pathname, f in group: + data = f.read(chunk_size) + if not data: + f.close() + done.append(pathname) + else: + not_done.append((pathname, f, data)) + return done, not_done + def duplicates_progress(self, done, total, started): duration = time.time() - started if duration < 1: @@ -121,17 +163,6 @@ class DuplicateFileFinder(object): return '%.1f %s' % (float(size) / float(limit), unit) return '0 B' - def file_checksum(self, pathname): - cs = hashlib.md5() - f = file(pathname, 'rb') - while True: - data = f.read(64*1024) - if not data: - break - cs.update(data) - f.close() - return cs.digest() - def make_hardlinks(duplicates): canonical = duplicates.pop() |