summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Lars Wirzenius <liw@liw.fi> 2010-04-18 08:00:34 +1200
committer Lars Wirzenius <liw@liw.fi> 2010-04-18 08:00:34 +1200
commit bedcaf5add86438c1fcd715070053e1e5370fdfa (patch)
tree 671da14dad0a0f50a03543dcff67526debf3c7d1
download dupfiles-bedcaf5add86438c1fcd715070053e1e5370fdfa.tar.gz
Initial import.
-rwxr-xr-x dupfiles 69
-rw-r--r-- test-data/bar 1
-rw-r--r-- test-data/foo 1
-rw-r--r-- test-data/foobar 1
-rw-r--r-- test-data/hardlink-to-foo 1
5 files changed, 73 insertions, 0 deletions
diff --git a/dupfiles b/dupfiles
new file mode 100755
index 0000000..6c0d2b8
--- /dev/null
+++ b/dupfiles
@@ -0,0 +1,69 @@
+#!/usr/bin/python
+#
+# Find duplicate files and do something with them.
+# Copyright 2010 Lars Wirzenius
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+import hashlib
+import os
+import sys
+
+
class DuplicateFileFinder(object):
    """Find groups of files that have identical contents.

    Directory trees are scanned with collect(), which groups files by
    size.  duplicates() then checksums only those files that share their
    size with at least one other file and yields every set of pathnames
    whose checksums match.

    Several names for the same inode (hard links, or symlinks that resolve
    to a file that was already collected) count as one file: they share
    their storage, so they are not reported as duplicates of each other.
    """

    # Files are hashed in pieces of this many bytes, so that arbitrarily
    # large files never have to fit in memory at once.
    CHUNK_SIZE = 1024 * 1024

    def __init__(self):
        # Maps size in bytes -> (st_dev, st_ino, set of pathnames).  The
        # dev/ino pair identifies the first file seen with that size.
        self.by_size = {}
        # (st_dev, st_ino) of every file collected so far.  Checking against
        # all of them, not just the first file of each size, is what keeps
        # every hard link out of the candidate sets.
        self._seen_inodes = set()

    def collect(self, root):
        """Record every regular file found below the directory ``root``.

        May be called several times, also with overlapping trees; a file
        that was already recorded (under any of its names) is skipped.
        Entries that cannot be examined and entries that are not regular
        files are ignored.
        """
        # Imported here because the file's import block does not provide it.
        from stat import S_ISREG

        for dirname, subdirs, filenames in os.walk(root):
            for filename in filenames:
                pathname = os.path.join(dirname, filename)
                try:
                    info = os.stat(pathname)
                except OSError:
                    # Dangling symlink, or the file vanished or became
                    # inaccessible during the walk: it cannot be compared,
                    # and one such entry should not abort the whole scan.
                    continue
                if not S_ISREG(info.st_mode):
                    # Reading a FIFO, socket or device node could block
                    # forever or never reach end-of-file.
                    continue
                inode = (info.st_dev, info.st_ino)
                if inode in self._seen_inodes:
                    continue
                self._seen_inodes.add(inode)
                if info.st_size in self.by_size:
                    self.by_size[info.st_size][2].add(pathname)
                else:
                    self.by_size[info.st_size] = (
                        info.st_dev, info.st_ino, set([pathname]))

    def duplicates(self):
        """Generate sets of pathnames whose files have identical checksums.

        Each yielded set holds at least two pathnames.  Errors raised while
        reading a file propagate to the caller.
        """
        for dev, ino, pathnames in self.by_size.values():
            if len(pathnames) < 2:
                # A file with a unique size cannot have a duplicate, so
                # there is no point in reading it.
                continue
            by_checksum = {}
            for pathname in pathnames:
                checksum = self.file_checksum(pathname)
                by_checksum.setdefault(checksum, set()).add(pathname)
            for names in by_checksum.values():
                if len(names) > 1:
                    yield names

    def file_checksum(self, pathname):
        """Return the binary MD5 digest of the contents of ``pathname``.

        The digest is only used to compare file contents with each other.
        """
        digest = hashlib.md5()
        with open(pathname, 'rb') as f:
            # read() returns an empty bytes object at end-of-file, which
            # ends the iteration.
            for chunk in iter(lambda: f.read(self.CHUNK_SIZE), b''):
                digest.update(chunk)
        return digest.digest()
+
+
def main():
    """Print the duplicate files found below the directories in sys.argv[1:].

    Each group of identical files is written to standard output with one
    pathname per line, followed by an empty line.
    """
    dupfinder = DuplicateFileFinder()
    for dirname in sys.argv[1:]:
        dupfinder.collect(dirname)
    for duplicates in dupfinder.duplicates():
        # The groups are sets, so sort the names to get stable output.
        # Writing to sys.stdout directly behaves the same on Python 2 and
        # Python 3, unlike the print statement.
        sys.stdout.write('\n'.join(sorted(duplicates)) + '\n\n')


# Only scan when run as a program, so that importing this file has no
# side effects.
if __name__ == '__main__':
    main()
diff --git a/test-data/bar b/test-data/bar
new file mode 100644
index 0000000..257cc56
--- /dev/null
+++ b/test-data/bar
@@ -0,0 +1 @@
+foo
diff --git a/test-data/foo b/test-data/foo
new file mode 100644
index 0000000..257cc56
--- /dev/null
+++ b/test-data/foo
@@ -0,0 +1 @@
+foo
diff --git a/test-data/foobar b/test-data/foobar
new file mode 100644
index 0000000..323fae0
--- /dev/null
+++ b/test-data/foobar
@@ -0,0 +1 @@
+foobar
diff --git a/test-data/hardlink-to-foo b/test-data/hardlink-to-foo
new file mode 100644
index 0000000..257cc56
--- /dev/null
+++ b/test-data/hardlink-to-foo
@@ -0,0 +1 @@
+foo