#!/usr/bin/python
#
# Find duplicate files and do something with them.
# Copyright 2010 Lars Wirzenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import cliapp
import errno
import hashlib
import optparse
import os
import random
import stat
import sys
import time
import ttystatus


version = '5.3'


class FileStats(object):

    def __init__(self):
        self.open_count = 0
        self.close_count = 0
        self.hit_count = 0


filestats = FileStats()
filepool = dict()


class File(object):

    def __init__(self, pathname):
        self.pathname = pathname
        self.offset = 0

    def read(self, num_bytes):
        if self.pathname in filepool:
            fd = filepool[self.pathname]
            filestats.hit_count += 1
        else:
            try:
                fd = os.open(self.pathname, os.O_RDONLY)
            except OSError, e:
                if e.errno != errno.EMFILE:
                    raise
                victim = random.choice(filepool.keys())
                os.close(filepool[victim])
                del filepool[victim]
                filestats.close_count += 1
                fd = os.open(self.pathname, os.O_RDONLY)
            os.lseek(fd, self.offset, os.SEEK_SET)
            filepool[self.pathname] = fd
            filestats.open_count += 1
        data = os.read(fd, num_bytes)
        self.offset += len(data)
        return data

    def close(self):
        if self.pathname in filepool:
            os.close(filepool[self.pathname])
            del filepool[self.pathname]
            filestats.close_count += 1


class DuplicateFileFinder(object):

    def __init__(self, progress):
        self.by_size = dict()
        self.progress = progress

    def collect(self, root):
        ts = ttystatus.TerminalStatus()
        if self.progress:
            ts.add(ttystatus.Literal('Scanning '))
            ts.add(ttystatus.Pathname('dirname'))

        for dirname, subdirs, filenames in os.walk(root):
            ts['dirname'] = dirname
            subdirs.sort()
            filenames.sort()
            pathnames = [os.path.join(dirname, f) for f in filenames]
            for pathname in pathnames:
                st = os.lstat(pathname)
                if stat.S_ISREG(st.st_mode):
                    t = (st.st_dev, st.st_ino, pathname)
                    if st.st_size in self.by_size:
                        self.by_size[st.st_size].append(t)
                    else:
                        self.by_size[st.st_size] = [t]
        ts.clear()

    def duplicates(self):
        total_bytes = sum(len(tuples) * size
                          for size, tuples in self.by_size.iteritems())
        ts = ttystatus.TerminalStatus(period=0.5)
        ts['done'] = 0
        if self.progress:
            ts.add(ttystatus.Literal('Comparing: '))
            ts.add(ttystatus.ByteSize('done'))
            ts.add(ttystatus.Literal('/'))
            ts.add(ttystatus.ByteSize('total'))
            ts.add(ttystatus.Literal(' ('))
            ts.add(ttystatus.PercentDone('done', 'total'))
            ts.add(ttystatus.Literal('), group '))
            ts.add(ttystatus.Counter('size'))
            ts.add(ttystatus.Literal('/'))
            ts.add(ttystatus.Literal(str(len(self.by_size))))
            ts.add(ttystatus.Literal(' ('))
            ts.add(ttystatus.ByteSize('size'))
            ts.add(ttystatus.Literal(')'))

        result = []
        ith = 0
        for size, tuples in sorted(self.by_size.iteritems()):
            ith += 1
            if len(set((dev, ino) for dev, ino, pathname in tuples)) > 1:
                # Not all files are hardlinks to the same inode.
                # (This also excludes groups with just one file.)
                result += self.find_duplicates([p for d, i, p in tuples])
            ts['size'] = size
            ts['done'] += len(tuples) * size
            ts['total'] = total_bytes
        if self.progress:
            ts.finish()
        return result

    def find_duplicates(self, pathnames):
        '''Find sets of duplicate files.

        Return list of groups of identical files, where each group is
        a list of pathnames to files that are identical.

        '''

        result = []
        identical_groups = [[(x, File(x)) for x in pathnames]]

        while identical_groups:
            new = []
            for group in identical_groups:
                done, not_done = self.read_next_chunks(group)
                if len(done) > 1:
                    result.append(done)
                while not_done:
                    key = not_done[0][2]
                    temp2 = [t for t in not_done if t[2] == key]
                    not_done = [t for t in not_done if t[2] != key]
                    if len(temp2) > 1:
                        new.append([(pathname, f)
                                    for pathname, f, data in temp2])
            identical_groups = new

        return result

    def read_next_chunks(self, group):
        '''Read next chunk of data in each file.

        group is a list of tuples (pathname, open file). A chunk of data
        is read from each file.

        Return value is two lists: one with filenames that reached the
        end of the file, and one with tuples (pathname, open file, chunk).

        '''

        done = []
        not_done = []
        chunk_size = 4 * 1024
        for pathname, f in group:
            data = f.read(chunk_size)
            if not data:
                f.close()
                done.append(pathname)
            else:
                not_done.append((pathname, f, data))
        return done, not_done


class Dupfiles(cliapp.Application):

    def add_settings(self):
        self.settings.boolean(['make-hardlinks'],
                              'hardlink duplicate files to each other')
        self.settings.boolean(['progress'], 'report progress')
        self.settings.boolean(['remove'],
                              'remove all but one copy of identical files')

    def process_args(self, args):
        dupfinder = DuplicateFileFinder(self.settings['progress'])
        for dirname in sorted(args):
            dupfinder.collect(dirname)

        for duplicates in dupfinder.duplicates():
            if self.settings['make-hardlinks']:
                self.make_hardlinks(duplicates)
            elif self.settings['remove']:
                self.remove_all_but_one(duplicates)
            else:
                self.report(duplicates)

    def get_meta(self, name):
        st = os.lstat(name)
        return st.st_uid, st.st_gid, st.st_mode

    def make_hardlinks(self, duplicates):
        dups = [(name, self.get_meta(name)) for name in duplicates]
        while dups:
            canonical, meta = dups[0]
            dups = dups[1:]
            same = [x for x, y in dups if y == meta]
            dups = [(x, y) for x, y in dups if y != meta]
            for pathname in same:
                os.remove(pathname)
                os.link(canonical, pathname)

    def remove_all_but_one(self, duplicates):
        keep = duplicates.pop()
        for pathname in duplicates:
            os.remove(pathname)

    def report(self, duplicates):
        sys.stdout.write('\n'.join(duplicates))
        sys.stdout.write('\n\n')


if __name__ == '__main__':
    Dupfiles(version=version).run()
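# ---------------------------------------------------------------------------
# Usage sketch, assuming cliapp turns the boolean settings declared in
# add_settings() into long command line options (--make-hardlinks,
# --progress, --remove); the paths below are illustrative only:
#
#     ./dupfiles --progress ~/photos          # print groups of identical files
#     ./dupfiles --make-hardlinks ~/photos    # replace duplicates with hardlinks
#     ./dupfiles --remove ~/photos ~/backup   # keep one copy, delete the rest
#
# The finder can also be driven directly from Python; the module name
# 'dupfiles' is an assumption for illustration:
#
#     from dupfiles import DuplicateFileFinder
#     finder = DuplicateFileFinder(progress=False)
#     finder.collect('/some/directory')
#     for group in finder.duplicates():
#         print '\n'.join(group)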