diff options
author | Lars Wirzenius <liw@liw.fi> | 2010-04-18 08:00:34 +1200 |
---|---|---|
committer | Lars Wirzenius <liw@liw.fi> | 2010-04-18 08:00:34 +1200 |
commit | bedcaf5add86438c1fcd715070053e1e5370fdfa (patch) | |
tree | 671da14dad0a0f50a03543dcff67526debf3c7d1 /dupfiles | |
download | dupfiles-bedcaf5add86438c1fcd715070053e1e5370fdfa.tar.gz |
Initial import.
Diffstat (limited to 'dupfiles')
-rwxr-xr-x | dupfiles | 69 |
1 file changed, 69 insertions, 0 deletions
#!/usr/bin/python
#
# Find duplicate files and do something with them.
# Copyright 2010 Lars Wirzenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import hashlib
import os
import sys


class DuplicateFileFinder(object):

    """Collect files under directory trees and report content duplicates.

    Files are first grouped by size (cheap), and only same-sized files
    are checksummed (expensive) to confirm identical content.
    """

    def __init__(self):
        # Maps file size -> (dev, ino, set of pathnames).  dev/ino of the
        # first file seen at that size are kept so that hard links to that
        # first file are not counted as duplicates of it.
        # NOTE(review): hard links among *later* same-sized files are not
        # detected and would be reported as duplicates — confirm intent.
        self.by_size = dict()

    def collect(self, root):
        """Walk ``root`` and register every regular file by its size."""
        for dirname, subdirs, filenames in os.walk(root):
            for filename in filenames:
                pathname = os.path.join(dirname, filename)
                st = os.stat(pathname)
                if st.st_size in self.by_size:
                    dev, ino, paths = self.by_size[st.st_size]
                    # Skip hard links to the first file of this size.
                    if st.st_dev != dev or st.st_ino != ino:
                        paths.add(pathname)
                else:
                    self.by_size[st.st_size] = (st.st_dev, st.st_ino,
                                                set([pathname]))

    def duplicates(self):
        """Yield sets of pathnames whose file contents are identical.

        Only groups of two or more files are yielded.
        """
        for dev, ino, paths in self.by_size.values():
            by_checksum = dict()
            for pathname in paths:
                checksum = self.file_checksum(pathname)
                by_checksum.setdefault(checksum, set()).add(pathname)
            for names in by_checksum.values():
                if len(names) > 1:
                    yield names

    def file_checksum(self, pathname):
        """Return the MD5 digest of the file's contents.

        Reads in fixed-size chunks so arbitrarily large files do not
        have to fit in memory, and closes the file deterministically.
        (MD5 is fine here: it identifies content, no adversary involved.)
        """
        summer = hashlib.md5()
        with open(pathname, 'rb') as f:
            for chunk in iter(lambda: f.read(64 * 1024), b''):
                summer.update(chunk)
        return summer.digest()


def main():
    """Scan the directories named on the command line and print each
    group of duplicate files, groups separated by a blank line."""
    dupfinder = DuplicateFileFinder()
    for dirname in sys.argv[1:]:
        dupfinder.collect(dirname)
    for duplicates in dupfinder.duplicates():
        print('\n'.join(duplicates))
        print()


if __name__ == '__main__':
    main()