author     Lars Wirzenius <liw@liw.fi>  2010-04-25 06:33:14 +1200
committer  Lars Wirzenius <liw@liw.fi>  2010-04-25 06:33:14 +1200
commit     f8b8501cba8546ecad422a6a41dca944bb3e94a5 (patch)
tree       3a3a5fdd6c45874b585c985212aa0dd66eab0be7
parent     2fc01f7cad3f0df92aa08b22072ee51f3b1b9db4 (diff)
download   dupfiles-f8b8501cba8546ecad422a6a41dca944bb3e94a5.tar.gz
Compare files chunk by chunk instead of checksumming whole files.
This should be much more efficient when there are a lot of files: we now stop comparing as soon as there is a difference. We can't yet handle the case where too many files of the same size need to be open at once, but that's a problem for another day.
-rwxr-xr-x  dupfiles  75
1 file changed, 53 insertions(+), 22 deletions(-)
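The commit message describes the new strategy: same-size candidate files are opened together and read in parallel, 64 KiB at a time, and a group is split apart as soon as two files produce different chunks, so non-duplicates are abandoned after the first differing chunk instead of being read and checksummed in full. The following is a minimal, self-contained sketch of that idea, not the dupfiles code itself: it is written for Python 3 (the script is Python 2), the name chunked_duplicate_groups is an illustrative assumption, and it assumes the candidate files are already grouped by size, as dupfiles does before comparing.

def chunked_duplicate_groups(pathnames, chunk_size=64 * 1024):
    '''Return lists of pathnames whose file contents are identical.

    Illustrative sketch only: assumes every pathname refers to a file of
    the same size, so identical files reach end-of-file on the same round.
    '''
    groups = [[(p, open(p, 'rb')) for p in pathnames]]
    result = []
    while groups:
        next_groups = []
        for group in groups:
            finished = []   # files that hit end-of-file this round
            by_chunk = {}   # chunk contents -> [(pathname, open file), ...]
            for pathname, f in group:
                data = f.read(chunk_size)
                if not data:
                    f.close()
                    finished.append(pathname)
                else:
                    by_chunk.setdefault(data, []).append((pathname, f))
            if len(finished) > 1:
                # Every chunk matched all the way to the end: duplicates.
                result.append(finished)
            for same in by_chunk.values():
                if len(same) > 1:
                    next_groups.append(same)    # still identical so far
                else:
                    same[0][1].close()          # unique prefix, drop it early
        groups = next_groups
    return result

The trade-off, which the commit message already points out, is one open file descriptor per candidate file in a group, so a very large set of same-size files can run into the open-file limit; that remains a problem for another day.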
diff --git a/dupfiles b/dupfiles
index dd0364a..abcd67d 100755
--- a/dupfiles
+++ b/dupfiles
@@ -87,20 +87,62 @@ class DuplicateFileFinder(object):
            if len(set((dev, ino) for dev, ino, pathname in tuples)) == 1:
                # All duplicates are hardlinks to the same inode. Skip.
                continue
-            by_checksum = dict()
-            for dev, ino, pathname in tuples:
-                checksum = self.file_checksum(pathname)
-                if checksum not in by_checksum:
-                    by_checksum[checksum] = set()
-                by_checksum[checksum].add(pathname)
-                done_bytes += size
-                self.duplicates_progress(done_bytes, total_bytes, start_time)
-            for names in by_checksum.itervalues():
-                if len(names) > 1:
-                    result.append(names)
+            result += self.find_duplicates([p for d, i, p in tuples])
        self.progress.finished()
        return result
+    def find_duplicates(self, pathnames):
+        '''Find sets of duplicate files.
+
+        Return list of groups of identical files, where each group is a list
+        of pathnames to files that are identical.
+
+        '''
+
+        result = []
+        identical_groups = [[(x, file(x)) for x in pathnames]]
+
+        while identical_groups:
+            new = []
+            for group in identical_groups:
+                done, not_done = self.read_next_chunks(group)
+                if len(done) > 1:
+                    result.append(done)
+                while not_done:
+                    key = not_done[0][2]
+                    temp2 = [t for t in not_done if t[2] == key]
+                    not_done = [t for t in not_done if t[2] != key]
+                    if len(temp2) > 1:
+                        new.append([(pathname, f)
+                                    for pathname, f, data in temp2])
+            identical_groups = new
+
+        return result
+
+    def read_next_chunks(self, group):
+        '''Read next chunk of data in each file.
+
+        group is a list of tuples (pathname, open file). A chunk of data
+        is read from each file.
+
+        Return value is two lists: one with filenames that reached the
+        end of the file, and one with tuples (pathname, open file,
+        chunk).
+
+        '''
+
+        done = []
+        not_done = []
+        chunk_size = 64 * 1024
+        for pathname, f in group:
+            data = f.read(chunk_size)
+            if not data:
+                f.close()
+                done.append(pathname)
+            else:
+                not_done.append((pathname, f, data))
+        return done, not_done
+
    def duplicates_progress(self, done, total, started):
        duration = time.time() - started
        if duration < 1:
@@ -121,17 +163,6 @@ class DuplicateFileFinder(object):
                return '%.1f %s' % (float(size) / float(limit), unit)
        return '0 B'
-    def file_checksum(self, pathname):
-        cs = hashlib.md5()
-        f = file(pathname, 'rb')
-        while True:
-            data = f.read(64*1024)
-            if not data:
-                break
-            cs.update(data)
-        f.close()
-        return cs.digest()
-
def make_hardlinks(duplicates):
    canonical = duplicates.pop()