From bedcaf5add86438c1fcd715070053e1e5370fdfa Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sun, 18 Apr 2010 08:00:34 +1200 Subject: Initial import. --- dupfiles | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100755 dupfiles (limited to 'dupfiles') diff --git a/dupfiles b/dupfiles new file mode 100755 index 0000000..6c0d2b8 --- /dev/null +++ b/dupfiles @@ -0,0 +1,69 @@ +#!/usr/bin/python +# +# Find duplicate files and do something with them. +# Copyright 2010 Lars Wirzenius +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
import hashlib
import os
import sys


class DuplicateFileFinder(object):
    """Find groups of files with identical contents.

    Files are first bucketed by size (cheap), then same-size files are
    compared by MD5 checksum (expensive).  Hardlinks to the same inode
    are recorded only once, so a file is never reported as a duplicate
    of itself.
    """

    def __init__(self):
        # Maps file size -> (seen_inodes, pathnames) where
        #   seen_inodes: set of (st_dev, st_ino) already recorded
        #   pathnames:   set of one pathname per distinct inode
        self.by_size = dict()

    def collect(self, root):
        """Walk the directory tree at *root* and record every file found.

        May be called repeatedly with different roots to accumulate
        candidates before calling duplicates().
        """
        for dirname, subdirs, filenames in os.walk(root):
            for filename in filenames:
                pathname = os.path.join(dirname, filename)
                stat = os.stat(pathname)
                if stat.st_size not in self.by_size:
                    self.by_size[stat.st_size] = (set(), set())
                seen_inodes, pathnames = self.by_size[stat.st_size]
                # Track every inode seen in this size group, not just the
                # first one, so hardlinks to *any* recorded file are
                # skipped rather than double-counted.
                inode = (stat.st_dev, stat.st_ino)
                if inode not in seen_inodes:
                    seen_inodes.add(inode)
                    pathnames.add(pathname)

    def duplicates(self):
        """Yield sets of pathnames whose file contents are identical.

        Only sets with at least two members are yielded.
        """
        for seen_inodes, pathnames in self.by_size.values():
            if len(pathnames) < 2:
                # A size group with a single file cannot contain
                # duplicates; skip the checksum work entirely.
                continue
            by_checksum = dict()
            for pathname in pathnames:
                checksum = self.file_checksum(pathname)
                by_checksum.setdefault(checksum, set()).add(pathname)
            for names in by_checksum.values():
                if len(names) > 1:
                    yield names

    def file_checksum(self, pathname):
        """Return the MD5 digest (bytes) of the file at *pathname*.

        Reads in fixed-size chunks so arbitrarily large files do not
        have to fit in memory, and closes the file deterministically.
        """
        summer = hashlib.md5()
        with open(pathname, 'rb') as f:
            while True:
                chunk = f.read(64 * 1024)
                if not chunk:
                    break
                summer.update(chunk)
        return summer.digest()


def main():
    """Report duplicate files under the directories named on the command line.

    Each group of duplicates is printed one pathname per line, with a
    blank line separating groups.
    """
    dupfinder = DuplicateFileFinder()
    for dirname in sys.argv[1:]:
        dupfinder.collect(dirname)
    for duplicates in dupfinder.duplicates():
        print('\n'.join(duplicates))
        print()


if __name__ == '__main__':
    main()