#!/usr/bin/python
#
# Find duplicate files and do something with them.
# Copyright 2010 Lars Wirzenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


import errno
import optparse
import os
import random
import stat
import sys

import ttystatus


class FileStats(object):

    '''Count file open/close/reuse events for the file pool.'''

    def __init__(self):
        self.open_count = 0
        self.close_count = 0
        self.hit_count = 0


# Pool of open file objects, shared by all File instances, so that the
# number of simultaneously open files stays below the operating
# system's per-process limit.
filestats = FileStats()
filepool = dict()


class File(object):

    '''A file that can be read in chunks via a pooled file object.

    The real file object lives in filepool and may be closed and
    reopened behind the scenes; this object remembers the current
    read offset so reading can resume transparently.
    '''

    def __init__(self, pathname):
        self.pathname = pathname
        self.offset = 0

    def read(self, num_bytes):
        if self.pathname in filepool:
            f = filepool[self.pathname]
            filestats.hit_count += 1
        else:
            try:
                f = file(self.pathname)
            except IOError, e:
                if e.errno != errno.EMFILE:
                    raise
                # Too many open files: close a randomly chosen pooled
                # file and retry.
                victim = random.choice(filepool.keys())
                filepool[victim].close()
                del filepool[victim]
                filestats.close_count += 1
                f = file(self.pathname)
            f.seek(self.offset)
            filepool[self.pathname] = f
            filestats.open_count += 1
        data = f.read(num_bytes)
        self.offset += len(data)
        return data

    def close(self):
        if self.pathname in filepool:
            filepool[self.pathname].close()
            del filepool[self.pathname]


class DuplicateFileFinder(object):

    def __init__(self, progress):
        self.by_size = dict()
        self.progress = progress

    def collect(self, root):
        ts = ttystatus.TerminalStatus()
        if self.progress:
            ts.add(ttystatus.Literal('Scanning '))
            ts.add(ttystatus.Pathname('dirname'))

        for dirname, subdirs, filenames in os.walk(root):
            ts['dirname'] = dirname
            subdirs.sort()
            filenames.sort()
            pathnames = [os.path.join(dirname, f) for f in filenames]
            for pathname in pathnames:
                st = os.lstat(pathname)
                if stat.S_ISREG(st.st_mode):
                    # Group regular files by size; only files of equal
                    # size can possibly be identical.
                    t = (st.st_dev, st.st_ino, pathname)
                    if st.st_size in self.by_size:
                        self.by_size[st.st_size].append(t)
                    else:
                        self.by_size[st.st_size] = [t]

        ts.clear()

    def duplicates(self):
        total_bytes = sum(len(tuples) * size
                          for size, tuples in self.by_size.iteritems())

        ts = ttystatus.TerminalStatus(period=0.5)
        ts['done'] = 0
        if self.progress:
            ts.add(ttystatus.Literal('Comparing: '))
            ts.add(ttystatus.ByteSize('done'))
            ts.add(ttystatus.Literal('/'))
            ts.add(ttystatus.ByteSize('total'))
            ts.add(ttystatus.Literal(' ('))
            ts.add(ttystatus.PercentDone('done', 'total'))
            ts.add(ttystatus.Literal('), group '))
            ts.add(ttystatus.Counter('size'))
            ts.add(ttystatus.Literal('/'))
            ts.add(ttystatus.Literal(str(len(self.by_size))))
            ts.add(ttystatus.Literal(' ('))
            ts.add(ttystatus.ByteSize('size'))
            ts.add(ttystatus.Literal(')'))

        result = []
        for size, tuples in sorted(self.by_size.iteritems()):
            if len(set((dev, ino) for dev, ino, pathname in tuples)) > 1:
                # Not all the files are hardlinks to the same inode.
                # (This also excludes groups with just one file.)
                result += self.find_duplicates([p for d, i, p in tuples])
            ts['size'] = size
            ts['done'] += len(tuples) * size
            ts['total'] = total_bytes

        if self.progress:
            ts.finish()
        return result

    def find_duplicates(self, pathnames):
        '''Find sets of duplicate files.

        Return a list of groups of identical files, where each group
        is a list of pathnames to files that are identical.

        '''
        result = []
        identical_groups = [[(x, File(x)) for x in pathnames]]

        while identical_groups:
            new = []
            for group in identical_groups:
                done, not_done = self.read_next_chunks(group)
                if len(done) > 1:
                    # Several files reached end of file together after
                    # reading identical data all the way: duplicates.
                    result.append(done)
                # Partition the remaining files by the chunk just read;
                # each partition with more than one member may still be
                # a group of duplicates.
                while not_done:
                    key = not_done[0][2]
                    temp2 = [t for t in not_done if t[2] == key]
                    not_done = [t for t in not_done if t[2] != key]
                    if len(temp2) > 1:
                        new.append([(pathname, f)
                                    for pathname, f, data in temp2])
                    else:
                        # A unique chunk means a unique file; close it
                        # to release its pooled file handle.
                        for pathname, f, data in temp2:
                            f.close()
            identical_groups = new

        return result

    def read_next_chunks(self, group):
        '''Read the next chunk of data in each file.

        group is a list of tuples (pathname, open file). A chunk of
        data is read from each file.

        Return value is two lists: one with pathnames of files that
        reached the end of the file, and one with tuples (pathname,
        open file, chunk).

        '''
        done = []
        not_done = []
        chunk_size = 64 * 1024
        for pathname, f in group:
            data = f.read(chunk_size)
            if not data:
                f.close()
                done.append(pathname)
            else:
                not_done.append((pathname, f, data))
        return done, not_done


def make_hardlinks(duplicates):
    canonical = duplicates.pop()
    for pathname in duplicates:
        os.remove(pathname)
        os.link(canonical, pathname)


def report(duplicates):
    sys.stdout.write('\n'.join(duplicates))
    sys.stdout.write('\n\n')


def main():
    parser = optparse.OptionParser()
    parser.add_option('--make-hardlinks', action='store_true',
                      help='hardlink duplicate files to each other')
    parser.add_option('--progress', action='store_true',
                      help='report progress')

    opts, args = parser.parse_args()

    dupfinder = DuplicateFileFinder(opts.progress)
    for dirname in sorted(args):
        dupfinder.collect(dirname)

    for duplicates in dupfinder.duplicates():
        if opts.make_hardlinks:
            make_hardlinks(duplicates)
        else:
            report(duplicates)


if __name__ == '__main__':
    main()
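
# Example invocations (a sketch; the script filename "dupfiles.py" and
# the directory arguments are hypothetical, the options are the ones
# defined in main() above):
#
#   python dupfiles.py --progress ~/photos /backup/photos
#   python dupfiles.py --make-hardlinks /srv/mirror
#
# Without --make-hardlinks, each group of identical files is written to
# stdout as newline-separated pathnames, with a blank line after each
# group. With it, all files in a group are replaced by hardlinks to a
# single inode.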