#!/usr/bin/python
#
# Find duplicate files and do something with them.
# Copyright 2010 Lars Wirzenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


import cliapp
import errno
import hashlib
import optparse
import os
import random
import stat
import sys
import time
import ttystatus

import dupfileslib


class FileStats(object):

    def __init__(self):
        self.open_count = 0
        self.close_count = 0
        self.hit_count = 0

filestats = FileStats()

# Pool of open file descriptors, keyed by pathname. Files are re-opened
# on demand when the pool has had to evict their descriptor.
filepool = dict()


class File(object):

    '''A file that can be read in chunks, sharing the descriptor pool.'''

    def __init__(self, pathname):
        self.pathname = pathname
        self.offset = 0

    def read(self, num_bytes):
        if self.pathname in filepool:
            fd = filepool[self.pathname]
            filestats.hit_count += 1
        else:
            try:
                fd = os.open(self.pathname, os.O_RDONLY)
            except OSError, e:
                if e.errno != errno.EMFILE:
                    raise
                # Out of file descriptors: evict a randomly chosen
                # pooled descriptor and retry the open.
                victim = random.choice(filepool.keys())
                os.close(filepool[victim])
                del filepool[victim]
                filestats.close_count += 1
                fd = os.open(self.pathname, os.O_RDONLY)
            # This file may have been evicted and re-opened earlier,
            # so restore the position we had reached.
            os.lseek(fd, self.offset, os.SEEK_SET)
            filepool[self.pathname] = fd
            filestats.open_count += 1
        data = os.read(fd, num_bytes)
        self.offset += len(data)
        return data

    def close(self):
        if self.pathname in filepool:
            os.close(filepool[self.pathname])
            del filepool[self.pathname]
            filestats.close_count += 1


class DuplicateFileFinder(object):

    def __init__(self, progress, min_size, max_size):
        self.by_size = dict()
        self.progress = progress
        self.min_size = min_size
        self.max_size = max_size

    def size_ok(self, size):
        return (self.min_size <= size <= self.max_size or
                (size >= self.min_size and self.max_size == -1))

    def collect(self, root):
        ts = ttystatus.TerminalStatus()
        if self.progress:
            ts.format('Scanning: %Counter(filename) found')
        ts.notify('Directory %s' % root)

        for dirname, subdirs, filenames in os.walk(root):
            subdirs.sort()
            filenames.sort()
            pathnames = [os.path.join(dirname, f) for f in filenames]
            for pathname in pathnames:
                ts['filename'] = pathname
                st = os.lstat(pathname)
                if stat.S_ISREG(st.st_mode) and self.size_ok(st.st_size):
                    t = (st.st_dev, st.st_ino, pathname)
                    if st.st_size in self.by_size:
                        self.by_size[st.st_size].append(t)
                    else:
                        self.by_size[st.st_size] = [t]

        ts.finish()

    def duplicates(self):
        total_bytes = sum(len(tuples) * size
                          for size, tuples in self.by_size.iteritems())

        ts = ttystatus.TerminalStatus(period=0.5)
        ts['done'] = 0
        # Set the total up front so the widgets below can render a
        # meaningful percentage from the start.
        ts['total'] = total_bytes
        if self.progress:
            ts.add(ttystatus.Literal('Comparing: '))
            ts.add(ttystatus.ByteSize('done'))
            ts.add(ttystatus.Literal('/'))
            ts.add(ttystatus.ByteSize('total'))
            ts.add(ttystatus.Literal(' ('))
            ts.add(ttystatus.PercentDone('done', 'total'))
            ts.add(ttystatus.Literal('), group '))
            ts.add(ttystatus.Counter('size'))
            ts.add(ttystatus.Literal('/'))
            ts.add(ttystatus.Literal(str(len(self.by_size))))
            ts.add(ttystatus.Literal(' ('))
            ts.add(ttystatus.ByteSize('size'))
            ts.add(ttystatus.Literal(')'))

        result = []
        for size, tuples in sorted(self.by_size.iteritems()):
            if len(set((dev, ino) for dev, ino, pathname in tuples)) > 1:
                # All files are not hardlinks to the same inode.
                # (This also excludes groups with just one file.)
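                # Worked example (illustrative, not from the original
                # code): for tuples [(1, 10, 'a'), (1, 10, 'b'),
                # (1, 11, 'c')], the set of (dev, ino) pairs is
                # {(1, 10), (1, 11)}; it has more than one member, so
                # the group is handed to find_duplicates below.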
                result += self.find_duplicates([p for d, i, p in tuples])
            ts['size'] = size
            ts['done'] += len(tuples) * size
        if self.progress:
            ts.finish()
        return result

    def find_duplicates(self, pathnames):
        '''Find sets of duplicate files.

        Return list of groups of identical files, where each group is
        a list of pathnames to files that are identical.

        '''

        result = []
        identical_groups = [[(x, File(x)) for x in pathnames]]
        while identical_groups:
            new = []
            for group in identical_groups:
                done, not_done = self.read_next_chunks(group)
                if len(done) > 1:
                    # Several files reached end of file together, so
                    # they were identical all the way through.
                    result.append(done)
                # Partition the remaining files by the chunk just
                # read: only sub-groups with more than one member can
                # still contain duplicates.
                while not_done:
                    key = not_done[0][2]
                    temp2 = [t for t in not_done if t[2] == key]
                    not_done = [t for t in not_done if t[2] != key]
                    if len(temp2) > 1:
                        new.append([(pathname, f)
                                    for pathname, f, data in temp2])
            identical_groups = new
        return result

    def read_next_chunks(self, group):
        '''Read the next chunk of data in each file.

        group is a list of tuples (pathname, open file). A chunk of
        data is read from each file.

        Return value is two lists: one with pathnames of files that
        reached the end of the file, and one with tuples
        (pathname, open file, chunk).

        '''

        done = []
        not_done = []
        chunk_size = 4 * 1024
        for pathname, f in group:
            data = f.read(chunk_size)
            if not data:
                f.close()
                done.append(pathname)
            else:
                not_done.append((pathname, f, data))
        return done, not_done


class Dupfiles(cliapp.Application):

    def add_settings(self):
        self.settings.boolean(['make-hardlinks'],
                              'hardlink duplicate files to each other')
        self.settings.boolean(['progress'], 'report progress')
        self.settings.boolean(['remove'],
                              'remove all but one copy of identical files')
        self.settings.boolean(['no-act', 'dry-run', 'pretend'],
                              'do not really remove, just report what would '
                              'be removed')
        self.settings.bytesize(['min-size'],
                               'compare files at least SIZE in size',
                               metavar='SIZE')
        self.settings.bytesize(['max-size'],
                               'compare files at most SIZE in size '
                               '(-1 for infinity)',
                               metavar='SIZE',
                               default=-1)

    def process_args(self, args):
        dupfinder = DuplicateFileFinder(self.settings['progress'],
                                        self.settings['min-size'],
                                        self.settings['max-size'])
        for dirname in args:
            dupfinder.collect(dirname)
        dup_sets = dupfinder.duplicates()

        self.ts = ttystatus.TerminalStatus(period=0.1)
        if self.settings['remove']:
            num_files = sum(len(dups) for dups in dup_sets)
            self.ts['num_files'] = num_files
            self.ts.format(
                'removing %Counter(filename)/%Integer(num_files)')

        for duplicates in dup_sets:
            if self.settings['make-hardlinks']:
                self.make_hardlinks(duplicates)
            elif self.settings['remove']:
                self.remove_all_but_one(duplicates, args[-1])
            else:
                self.report(duplicates)

    def get_meta(self, name):
        st = os.lstat(name)
        return st.st_uid, st.st_gid, st.st_mode

    def make_hardlinks(self, duplicates):
        # Only files with the same owner, group, and permissions are
        # hardlinked, since hardlinks share their metadata.
        dups = [(name, self.get_meta(name)) for name in duplicates]
        while dups:
            canonical, meta = dups[0]
            dups = dups[1:]
            same = [x for x, y in dups if y == meta]
            dups = [(x, y) for x, y in dups if y != meta]
            for pathname in same:
                if self.settings['no-act']:
                    self.output.write('hardlink %s\n' % pathname)
                else:
                    os.remove(pathname)
                    os.link(canonical, pathname)

    def remove_all_but_one(self, duplicates, last_dir):
        # Prefer to keep a copy that lives under the last directory
        # named on the command line, if there is one.
        if not last_dir.endswith(os.sep):
            last_dir += os.sep
        for pathname in duplicates:
            if pathname.startswith(last_dir):
                keep = pathname
                duplicates.remove(keep)
                break
        else:
            keep = duplicates.pop()

        for pathname in duplicates:
            self.ts['filename'] = pathname
            if self.settings['no-act']:
                self.output.write('rm %s\n' % pathname)
            else:
                os.remove(pathname)
        self.ts.finish()

    def report(self, duplicates):
        sys.stdout.write('\n'.join(duplicates))
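        # End the last pathname's line and add a blank line so that
        # consecutive groups are separated in the output.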
        sys.stdout.write('\n\n')


if __name__ == '__main__':
    Dupfiles(version=dupfileslib.__version__).run()
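
# Example invocations (illustrative only; these assume the script is
# installed as "dupfiles" -- the option names come from add_settings
# above):
#
#   dupfiles DIR                    report groups of identical files
#   dupfiles --progress DIR         the same, with progress output
#   dupfiles --make-hardlinks DIR   replace duplicates with hardlinks
#   dupfiles --remove --no-act DIR1 DIR2
#                                   show what would be removed; a copy
#                                   under the last directory (DIR2) is
#                                   kept if one exists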