#!/usr/bin/python
#
# Find duplicate files and do something with them.
# Copyright 2010 Lars Wirzenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import errno
import hashlib
import optparse
import os
import random
import stat
import sys
import time


class FileStats(object):

    def __init__(self):
        self.open_count = 0
        self.close_count = 0
        self.hit_count = 0


filestats = FileStats()
filepool = dict()


class File(object):

    def __init__(self, pathname):
        self.pathname = pathname
        self.offset = 0

    def read(self, num_bytes):
        if self.pathname in filepool:
            f = filepool[self.pathname]
            filestats.hit_count += 1
        else:
            try:
                f = file(self.pathname)
            except IOError, e:
                if e.errno != errno.EMFILE:
                    raise
                victim = random.choice(filepool.keys())
                filepool[victim].close()
                del filepool[victim]
                filestats.close_count += 1
                f = file(self.pathname)
            f.seek(self.offset)
            filepool[self.pathname] = f
            filestats.open_count += 1
        data = f.read(num_bytes)
        self.offset += len(data)
        return data

    def close(self):
        if self.pathname in filepool:
            filepool[self.pathname].close()
            del filepool[self.pathname]


class DuplicateFileFinder(object):

    def __init__(self, progress):
        self.by_size = dict()
        self.progress = progress

    def collect(self, root):
        if self.progress:
            sys.stderr.write('Scanning %s\n' % root)
        for dirname, subdirs, filenames in os.walk(root):
            subdirs.sort()
            filenames.sort()
            pathnames = [os.path.join(dirname, f) for f in filenames]
            for pathname in pathnames:
                st = os.lstat(pathname)
                if stat.S_ISREG(st.st_mode):
                    t = (st.st_dev, st.st_ino, pathname)
                    if st.st_size in self.by_size:
                        self.by_size[st.st_size].append(t)
                    else:
                        self.by_size[st.st_size] = [t]

    def duplicates(self):
        if self.progress:
            sys.stderr.write('Looking for groups of files of same size\n')
        skip = [size for size in self.by_size if len(self.by_size[size]) == 1]
        for size in skip:
            del self.by_size[size]
        if self.progress:
            sys.stderr.write('Ignored %d groups of one file each\n' %
                             len(skip))
            sys.stderr.write('There are %d groups of files of same size\n' %
                             len(self.by_size))

        total_bytes = sum(len(tuples) * size
                          for size, tuples in self.by_size.iteritems())

        result = []
        done_bytes = 0
        ith_group = 0
        for size, tuples in sorted(self.by_size.iteritems()):
            ith_group += 1
            if self.progress:
                sys.stderr.write('Group %d/%d (%d files of %d bytes)\n' %
                                 (ith_group, len(self.by_size),
                                  len(tuples), size))
            if len(set((dev, ino) for dev, ino, pathname in tuples)) == 1:
                # All duplicates are hardlinks to the same inode. Skip.
                done_bytes += len(tuples) * size
            else:
                new_dups = self.find_duplicates([p for d, i, p in tuples])
                result += new_dups
                done_bytes += len(tuples) * size
        return result

    def find_duplicates(self, pathnames):
        '''Find sets of duplicate files.

        Return list of groups of identical files, where each group is a
        list of pathnames to files that are identical.

        '''
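        # The groups are refined chunk by chunk: every file in a group has
        # been identical so far.  Reading the next chunk from each file
        # splits the group into sub-groups whose chunks are equal, and only
        # sub-groups with more than one file survive to the next round.
        # Files that reach end-of-file together form a group of duplicates.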
        result = []
        identical_groups = [[(x, File(x)) for x in pathnames]]
        while identical_groups:
            new = []
            for group in identical_groups:
                done, not_done = self.read_next_chunks(group)
                if len(done) > 1:
                    result.append(done)
                while not_done:
                    key = not_done[0][2]
                    temp2 = [t for t in not_done if t[2] == key]
                    not_done = [t for t in not_done if t[2] != key]
                    if len(temp2) > 1:
                        new.append([(pathname, f)
                                    for pathname, f, data in temp2])
            identical_groups = new
        return result

    def read_next_chunks(self, group):
        '''Read next chunk of data in each file.

        group is a list of tuples (pathname, open file). A chunk of data
        is read from each file.

        Return value is two lists: one with filenames that reached the
        end of the file, and one with tuples (pathname, open file, chunk).

        '''
        done = []
        not_done = []
        chunk_size = 64 * 1024
        for pathname, f in group:
            data = f.read(chunk_size)
            if not data:
                f.close()
                done.append(pathname)
            else:
                not_done.append((pathname, f, data))
        return done, not_done


def make_hardlinks(duplicates):
    canonical = duplicates.pop()
    for pathname in duplicates:
        os.remove(pathname)
        os.link(canonical, pathname)


def report(duplicates):
    sys.stdout.write('\n'.join(duplicates))
    sys.stdout.write('\n\n')


def main():
    parser = optparse.OptionParser()
    parser.add_option('--make-hardlinks', action='store_true',
                      help='hardlink duplicate files to each other')
    parser.add_option('--progress', action='store_true',
                      help='report progress')

    opts, args = parser.parse_args()

    dupfinder = DuplicateFileFinder(opts.progress)
    for dirname in sorted(args):
        dupfinder.collect(dirname)

    for duplicates in dupfinder.duplicates():
        if opts.make_hardlinks:
            make_hardlinks(duplicates)
        else:
            report(duplicates)


if __name__ == '__main__':
    main()
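

# Example invocations (a sketch; the script and directory names below are
# hypothetical, not part of this program):
#
#   python dupfiles.py --progress ~/photos /mnt/backup/photos
#   python dupfiles.py --make-hardlinks ~/photos
#
# Without --make-hardlinks, each group of identical files is reported as a
# block of pathnames separated from the next group by a blank line.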