#!/usr/bin/python
#
# Find duplicate files and do something with them.
# Copyright 2010 Lars Wirzenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import errno
import hashlib
import optparse
import os
import random
import stat
import sys
import time


class FileStats(object):

    def __init__(self):
        self.open_count = 0
        self.close_count = 0
        self.hit_count = 0


filestats = FileStats()
filepool = dict()


class File(object):

    def __init__(self, pathname):
        self.pathname = pathname
        self.offset = 0

    def read(self, num_bytes):
        if self.pathname in filepool:
            f = filepool[self.pathname]
            filestats.hit_count += 1
        else:
            try:
                f = file(self.pathname)
            except IOError, e:
                if e.errno != errno.EMFILE:
                    raise
                victim = random.choice(filepool.keys())
                filepool[victim].close()
                del filepool[victim]
                filestats.close_count += 1
                f = file(self.pathname)
            f.seek(self.offset)
            filepool[self.pathname] = f
            filestats.open_count += 1
        data = f.read(num_bytes)
        self.offset += len(data)
        return data

    def close(self):
        if self.pathname in filepool:
            filepool[self.pathname].close()
            del filepool[self.pathname]


class DuplicateFileFinder(object):

    def __init__(self, progress):
        self.by_size = dict()
        self.progress = progress

    def collect(self, root):
        if self.progress:
            sys.stderr.write('Scanning %s\n' % root)
        for dirname, subdirs, filenames in os.walk(root):
            subdirs.sort()
            filenames.sort()
            pathnames = [os.path.join(dirname, f) for f in filenames]
            for pathname in pathnames:
                st = os.lstat(pathname)
                if stat.S_ISREG(st.st_mode):
                    t = (st.st_dev, st.st_ino, pathname)
                    if st.st_size in self.by_size:
                        self.by_size[st.st_size].append(t)
                    else:
                        self.by_size[st.st_size] = [t]

    def duplicates(self):
        if self.progress:
            sys.stderr.write('Looking for groups of files of same size\n')
        skip = [size for size in self.by_size if len(self.by_size[size]) == 1]
        for size in skip:
            del self.by_size[size]
        if self.progress:
            sys.stderr.write('Ignored %d groups of one file each\n' %
                             len(skip))
            sys.stderr.write('There are %d groups of files of same size\n' %
                             len(self.by_size))

        total_bytes = sum(len(tuples) * size
                          for size, tuples in self.by_size.iteritems())

        result = []
        done_bytes = 0
        ith_group = 0
        for size, tuples in sorted(self.by_size.iteritems()):
            ith_group += 1
            if self.progress:
                sys.stderr.write('Group %d/%d (%d files of %d bytes)\n' %
                                 (ith_group, len(self.by_size),
                                  len(tuples), size))
            if len(set((dev, ino) for dev, ino, pathname in tuples)) == 1:
                # All duplicates are hardlinks to the same inode. Skip.
                done_bytes += len(tuples) * size
            else:
                new_dups = self.find_duplicates([p for d, i, p in tuples])
                result += new_dups
                done_bytes += len(tuples) * size
        return result

    def find_duplicates(self, pathnames):
        '''Find sets of duplicate files.

        Return list of groups of identical files, where each group is a
        list of pathnames to files that are identical.

        '''
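        # The groups are refined chunk by chunk: every file in a group has
        # been identical so far.  Reading the next chunk from each file
        # splits the group into sub-groups whose chunks are equal, and only
        # sub-groups with more than one file survive to the next round.
        # Files that reach end-of-file together form a group of duplicates.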
        result = []
        identical_groups = [[(x, File(x)) for x in pathnames]]
        while identical_groups:
            new = []
            for group in identical_groups:
                done, not_done = self.read_next_chunks(group)
                if len(done) > 1:
                    result.append(done)
                while not_done:
                    key = not_done[0][2]
                    temp2 = [t for t in not_done if t[2] == key]
                    not_done = [t for t in not_done if t[2] != key]
                    if len(temp2) > 1:
                        new.append([(pathname, f)
                                    for pathname, f, data in temp2])
            identical_groups = new
        return result

    def read_next_chunks(self, group):
        '''Read next chunk of data in each file.

        group is a list of tuples (pathname, open file). A chunk of data
        is read from each file.

        Return value is two lists: one with filenames that reached the
        end of the file, and one with tuples (pathname, open file, chunk).

        '''
        done = []
        not_done = []
        chunk_size = 64 * 1024
        for pathname, f in group:
            data = f.read(chunk_size)
            if not data:
                f.close()
                done.append(pathname)
            else:
                not_done.append((pathname, f, data))
        return done, not_done


def make_hardlinks(duplicates):
    canonical = duplicates.pop()
    for pathname in duplicates:
        os.remove(pathname)
        os.link(canonical, pathname)


def report(duplicates):
    sys.stdout.write('\n'.join(duplicates))
    sys.stdout.write('\n\n')


def main():
    parser = optparse.OptionParser()
    parser.add_option('--make-hardlinks', action='store_true',
                      help='hardlink duplicate files to each other')
    parser.add_option('--progress', action='store_true',
                      help='report progress')

    opts, args = parser.parse_args()

    dupfinder = DuplicateFileFinder(opts.progress)
    for dirname in sorted(args):
        dupfinder.collect(dirname)

    for duplicates in dupfinder.duplicates():
        if opts.make_hardlinks:
            make_hardlinks(duplicates)
        else:
            report(duplicates)


if __name__ == '__main__':
    main()
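

# Example invocations (a sketch; the script and directory names below are
# hypothetical, not part of this program):
#
#   python dupfiles.py --progress ~/photos /mnt/backup/photos
#   python dupfiles.py --make-hardlinks ~/photos
#
# Without --make-hardlinks, each group of identical files is reported as a
# block of pathnames separated from the next group by a blank line.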