author     Lars Wirzenius <liw@liw.fi>  2010-04-25 06:33:14 +1200
committer  Lars Wirzenius <liw@liw.fi>  2010-04-25 06:33:14 +1200
commit     f8b8501cba8546ecad422a6a41dca944bb3e94a5 (patch)
tree       3a3a5fdd6c45874b585c985212aa0dd66eab0be7
parent     2fc01f7cad3f0df92aa08b22072ee51f3b1b9db4 (diff)
download   dupfiles-f8b8501cba8546ecad422a6a41dca944bb3e94a5.tar.gz
Compare files chunk by chunk instead of checksumming whole files.
This should be much more efficient when there are a lot of files: we now stop comparing as soon as there is a difference. We can't yet handle the case where too many files of the same size need to be open at once, but that's a problem for another day.
-rwxr-xr-x  dupfiles  75
1 file changed, 53 insertions(+), 22 deletions(-)
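The commit message describes the new strategy: same-size candidate files are opened together and read in parallel, 64 KiB at a time, and a group is split apart as soon as two files produce different chunks, so non-duplicates are abandoned after the first differing chunk instead of being read and checksummed in full. The following is a minimal, self-contained sketch of that idea, not the dupfiles code itself: it is written for Python 3 (the script is Python 2), the name chunked_duplicate_groups is an illustrative assumption, and it assumes the candidate files are already grouped by size, as dupfiles does before comparing.

def chunked_duplicate_groups(pathnames, chunk_size=64 * 1024):
    '''Return lists of pathnames whose file contents are identical.

    Illustrative sketch only: assumes every pathname refers to a file of
    the same size, so identical files reach end-of-file on the same round.
    '''
    groups = [[(p, open(p, 'rb')) for p in pathnames]]
    result = []
    while groups:
        next_groups = []
        for group in groups:
            finished = []   # files that hit end-of-file this round
            by_chunk = {}   # chunk contents -> [(pathname, open file), ...]
            for pathname, f in group:
                data = f.read(chunk_size)
                if not data:
                    f.close()
                    finished.append(pathname)
                else:
                    by_chunk.setdefault(data, []).append((pathname, f))
            if len(finished) > 1:
                # Every chunk matched all the way to the end: duplicates.
                result.append(finished)
            for same in by_chunk.values():
                if len(same) > 1:
                    next_groups.append(same)    # still identical so far
                else:
                    same[0][1].close()          # unique prefix, drop it early
        groups = next_groups
    return result

The trade-off, which the commit message already points out, is one open file descriptor per candidate file in a group, so a very large set of same-size files can run into the open-file limit; that remains a problem for another day.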
diff --git a/dupfiles b/dupfiles
index dd0364a..abcd67d 100755
--- a/dupfiles
+++ b/dupfiles
@@ -87,20 +87,62 @@ class DuplicateFileFinder(object):
            if len(set((dev, ino) for dev, ino, pathname in tuples)) == 1:
                # All duplicates are hardlinks to the same inode. Skip.
                continue
-            by_checksum = dict()
-            for dev, ino, pathname in tuples:
-                checksum = self.file_checksum(pathname)
-                if checksum not in by_checksum:
-                    by_checksum[checksum] = set()
-                by_checksum[checksum].add(pathname)
-                done_bytes += size
-                self.duplicates_progress(done_bytes, total_bytes, start_time)
-            for names in by_checksum.itervalues():
-                if len(names) > 1:
-                    result.append(names)
+            result += self.find_duplicates([p for d, i, p in tuples])
        self.progress.finished()
        return result
+    def find_duplicates(self, pathnames):
+        '''Find sets of duplicate files.
+
+        Return list of groups of identical files, where each group is a list
+        of pathnames to files that are identical.
+
+        '''
+
+        result = []
+        identical_groups = [[(x, file(x)) for x in pathnames]]
+
+        while identical_groups:
+            new = []
+            for group in identical_groups:
+                done, not_done = self.read_next_chunks(group)
+                if len(done) > 1:
+                    result.append(done)
+                while not_done:
+                    key = not_done[0][2]
+                    temp2 = [t for t in not_done if t[2] == key]
+                    not_done = [t for t in not_done if t[2] != key]
+                    if len(temp2) > 1:
+                        new.append([(pathname, f)
+                                    for pathname, f, data in temp2])
+            identical_groups = new
+
+        return result
+
+    def read_next_chunks(self, group):
+        '''Read next chunk of data in each file.
+
+        group is a list of tuples (pathname, open file). A chunk of data
+        is read from each file.
+
+        Return value is two lists: one with filenames that reached the
+        end of the file, and one with tuples (pathname, open file,
+        chunk).
+
+        '''
+
+        done = []
+        not_done = []
+        chunk_size = 64 * 1024
+        for pathname, f in group:
+            data = f.read(chunk_size)
+            if not data:
+                f.close()
+                done.append(pathname)
+            else:
+                not_done.append((pathname, f, data))
+        return done, not_done
+
    def duplicates_progress(self, done, total, started):
        duration = time.time() - started
        if duration < 1:
@@ -121,17 +163,6 @@ class DuplicateFileFinder(object):
                return '%.1f %s' % (float(size) / float(limit), unit)
        return '0 B'
-    def file_checksum(self, pathname):
-        cs = hashlib.md5()
-        f = file(pathname, 'rb')
-        while True:
-            data = f.read(64*1024)
-            if not data:
-                break
-            cs.update(data)
-        f.close()
-        return cs.digest()
-
def make_hardlinks(duplicates):
    canonical = duplicates.pop()