diff options
-rwxr-xr-x | dupfiles | 75 |
1 files changed, 53 insertions, 22 deletions
@@ -87,20 +87,62 @@ class DuplicateFileFinder(object):
             if len(set((dev, ino) for dev, ino, pathname in tuples)) == 1:
                 # All duplicates are hardlinks to the same inode. Skip.
                 continue
-            by_checksum = dict()
-            for dev, ino, pathname in tuples:
-                checksum = self.file_checksum(pathname)
-                if checksum not in by_checksum:
-                    by_checksum[checksum] = set()
-                by_checksum[checksum].add(pathname)
-            done_bytes += size
-            self.duplicates_progress(done_bytes, total_bytes, start_time)
-            for names in by_checksum.itervalues():
-                if len(names) > 1:
-                    result.append(names)
+            result += self.find_duplicates([p for d, i, p in tuples])
         self.progress.finished()
         return result
 
+    def find_duplicates(self, pathnames):
+        '''Find sets of duplicate files.
+
+        Return list of groups of identical files, where each group is a list
+        of pathnames to files that are identical.
+
+        '''
+
+        result = []
+        identical_groups = [[(x, file(x)) for x in pathnames]]
+
+        while identical_groups:
+            new = []
+            for group in identical_groups:
+                done, not_done = self.read_next_chunks(group)
+                if len(done) > 1:
+                    result.append(done)
+                while not_done:
+                    key = not_done[0][2]
+                    temp2 = [t for t in not_done if t[2] == key]
+                    not_done = [t for t in not_done if t[2] != key]
+                    if len(temp2) > 1:
+                        new.append([(pathname, f)
+                                    for pathname, f, data in temp2])
+            identical_groups = new
+
+        return result
+
+    def read_next_chunks(self, group):
+        '''Read next chunk of data in each file.
+
+        group is a list of tuples (pathname, open file). A chunk of data
+        is read from each file.
+
+        Return value is two lists: one with filenames that reached the
+        end of the file, and one with tuples (pathname, open file,
+        chunk).
+
+        '''
+
+        done = []
+        not_done = []
+        chunk_size = 64 * 1024
+        for pathname, f in group:
+            data = f.read(chunk_size)
+            if not data:
+                f.close()
+                done.append(pathname)
+            else:
+                not_done.append((pathname, f, data))
+        return done, not_done
+
     def duplicates_progress(self, done, total, started):
         duration = time.time() - started
         if duration < 1:
@@ -121,17 +163,6 @@ class DuplicateFileFinder(object):
             return '%.1f %s' % (float(size) / float(limit), unit)
         return '0 B'
 
-    def file_checksum(self, pathname):
-        cs = hashlib.md5()
-        f = file(pathname, 'rb')
-        while True:
-            data = f.read(64*1024)
-            if not data:
-                break
-            cs.update(data)
-        f.close()
-        return cs.digest()
-
 
 def make_hardlinks(duplicates):
     canonical = duplicates.pop()