author    Lars Wirzenius <liw@liw.fi>    2010-04-18 08:00:34 +1200
committer Lars Wirzenius <liw@liw.fi>    2010-04-18 08:00:34 +1200
commit    bedcaf5add86438c1fcd715070053e1e5370fdfa (patch)
tree      671da14dad0a0f50a03543dcff67526debf3c7d1 /dupfiles
download  dupfiles-bedcaf5add86438c1fcd715070053e1e5370fdfa.tar.gz
Initial import.
Diffstat (limited to 'dupfiles')
-rwxr-xr-x  dupfiles  69
1 file changed, 69 insertions, 0 deletions
diff --git a/dupfiles b/dupfiles
new file mode 100755
index 0000000..6c0d2b8
--- /dev/null
+++ b/dupfiles
@@ -0,0 +1,69 @@
+#!/usr/bin/python
+#
+# Find duplicate files and do something with them.
+# Copyright 2010 Lars Wirzenius
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+import hashlib
+import os
+import sys
+
+
+class DuplicateFileFinder(object):
+
+    def __init__(self):
+        # Maps file size to (st_dev, st_ino, set of pathnames) for the
+        # files of that size seen so far.
+        self.by_size = dict()
+
+    def collect(self, root):
+        for dirname, subdirs, filenames in os.walk(root):
+            pathnames = [os.path.join(dirname, f) for f in filenames]
+            for pathname in pathnames:
+                stat = os.stat(pathname)
+                if stat.st_size in self.by_size:
+                    dev, ino, pathnames = self.by_size[stat.st_size]
+                    # Skip hard links to the first file recorded for this
+                    # size; otherwise remember the new pathname.
+                    if stat.st_dev != dev or stat.st_ino != ino:
+                        pathnames.add(pathname)
+                else:
+                    self.by_size[stat.st_size] = (stat.st_dev, stat.st_ino,
+                                                  set([pathname]))
+
+    def duplicates(self):
+        # Within each size group, checksum the file contents and yield
+        # every set of two or more pathnames with identical checksums.
+        for dev, ino, pathnames in self.by_size.itervalues():
+            by_checksum = dict()
+            for pathname in pathnames:
+                checksum = self.file_checksum(pathname)
+                if checksum not in by_checksum:
+                    by_checksum[checksum] = set()
+                by_checksum[checksum].add(pathname)
+            for names in by_checksum.itervalues():
+                if len(names) > 1:
+                    yield names
+
+    def file_checksum(self, pathname):
+        return hashlib.md5(file(pathname, 'rb').read()).digest()
+
+
+def main():
+    dupfinder = DuplicateFileFinder()
+    for dirname in sys.argv[1:]:
+        dupfinder.collect(dirname)
+    for duplicates in dupfinder.duplicates():
+        print '\n'.join(duplicates)
+        print
+
+
+if __name__ == '__main__':
+    main()
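
The file_checksum method above reads each file into memory in one go before hashing, which is simple but can be costly for very large files. Below is a minimal sketch of a chunked variant; the name file_checksum_chunked is illustrative and not part of this commit.

import hashlib

def file_checksum_chunked(pathname, chunk_size=64 * 1024):
    # Hash the file a chunk at a time so memory use stays bounded
    # regardless of file size; the resulting MD5 digest is identical
    # to hashing the whole contents at once.
    md5 = hashlib.md5()
    with open(pathname, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            md5.update(chunk)
    return md5.digest()

Swapping this in for DuplicateFileFinder.file_checksum would not change which duplicates are reported, only how much memory hashing a single large file costs. Invocation stays the same: the script takes one or more directories as command-line arguments and prints each group of duplicate files as a blank-line-separated block.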