From bedcaf5add86438c1fcd715070053e1e5370fdfa Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sun, 18 Apr 2010 08:00:34 +1200 Subject: Initial import. --- dupfiles | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100755 dupfiles (limited to 'dupfiles') diff --git a/dupfiles b/dupfiles new file mode 100755 index 0000000..6c0d2b8 --- /dev/null +++ b/dupfiles @@ -0,0 +1,69 @@ +#!/usr/bin/python +# +# Find duplicate files and do something with them. +# Copyright 2010 Lars Wirzenius +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
import hashlib
import os
import sys


class DuplicateFileFinder(object):
    """Find groups of files with identical contents.

    Files are first bucketed by size (cheap), then same-size files are
    compared by MD5 checksum (expensive).  Hardlinks to the same inode
    are recorded only once, so a file is never reported as a duplicate
    of itself.
    """

    def __init__(self):
        # Maps file size -> (seen_inodes, pathnames) where
        #   seen_inodes: set of (st_dev, st_ino) already recorded
        #   pathnames:   set of one pathname per distinct inode
        self.by_size = dict()

    def collect(self, root):
        """Walk the directory tree at *root* and record every file found.

        May be called repeatedly with different roots to accumulate
        candidates before calling duplicates().
        """
        for dirname, subdirs, filenames in os.walk(root):
            for filename in filenames:
                pathname = os.path.join(dirname, filename)
                stat = os.stat(pathname)
                if stat.st_size not in self.by_size:
                    self.by_size[stat.st_size] = (set(), set())
                seen_inodes, pathnames = self.by_size[stat.st_size]
                # Track every inode seen in this size group, not just the
                # first one, so hardlinks to *any* recorded file are
                # skipped rather than double-counted.
                inode = (stat.st_dev, stat.st_ino)
                if inode not in seen_inodes:
                    seen_inodes.add(inode)
                    pathnames.add(pathname)

    def duplicates(self):
        """Yield sets of pathnames whose file contents are identical.

        Only sets with at least two members are yielded.
        """
        for seen_inodes, pathnames in self.by_size.values():
            if len(pathnames) < 2:
                # A size group with a single file cannot contain
                # duplicates; skip the checksum work entirely.
                continue
            by_checksum = dict()
            for pathname in pathnames:
                checksum = self.file_checksum(pathname)
                by_checksum.setdefault(checksum, set()).add(pathname)
            for names in by_checksum.values():
                if len(names) > 1:
                    yield names

    def file_checksum(self, pathname):
        """Return the MD5 digest (bytes) of the file at *pathname*.

        Reads in fixed-size chunks so arbitrarily large files do not
        have to fit in memory, and closes the file deterministically.
        """
        summer = hashlib.md5()
        with open(pathname, 'rb') as f:
            while True:
                chunk = f.read(64 * 1024)
                if not chunk:
                    break
                summer.update(chunk)
        return summer.digest()


def main():
    """Report duplicate files under the directories named on the command line.

    Each group of duplicates is printed one pathname per line, with a
    blank line separating groups.
    """
    dupfinder = DuplicateFileFinder()
    for dirname in sys.argv[1:]:
        dupfinder.collect(dirname)
    for duplicates in dupfinder.duplicates():
        print('\n'.join(duplicates))
        print()


if __name__ == '__main__':
    main()