#!/usr/bin/python
#
# Find duplicate files and do something with them.
# Copyright 2010 Lars Wirzenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


import errno
import optparse
import os
import random
import stat
import sys

import ttystatus


class FileStats(object):

    '''Count file open/close/reuse events for the file pool.'''

    def __init__(self):
        self.open_count = 0
        self.close_count = 0
        self.hit_count = 0


# Pool of open file objects, shared by all File instances, so that the
# number of simultaneously open files stays below the operating
# system's per-process limit.
filestats = FileStats()
filepool = dict()


class File(object):

    '''A file that can be read in chunks via a pooled file object.

    The real file object lives in filepool and may be closed and
    reopened behind the scenes; this object remembers the current
    read offset so reading can resume transparently.
    '''

    def __init__(self, pathname):
        self.pathname = pathname
        self.offset = 0

    def read(self, num_bytes):
        if self.pathname in filepool:
            f = filepool[self.pathname]
            filestats.hit_count += 1
        else:
            try:
                f = file(self.pathname)
            except IOError, e:
                if e.errno != errno.EMFILE:
                    raise
                # Too many open files: close a randomly chosen pooled
                # file and retry.
                victim = random.choice(filepool.keys())
                filepool[victim].close()
                del filepool[victim]
                filestats.close_count += 1
                f = file(self.pathname)
            f.seek(self.offset)
            filepool[self.pathname] = f
            filestats.open_count += 1
        data = f.read(num_bytes)
        self.offset += len(data)
        return data

    def close(self):
        if self.pathname in filepool:
            filepool[self.pathname].close()
            del filepool[self.pathname]


class DuplicateFileFinder(object):

    def __init__(self, progress):
        self.by_size = dict()
        self.progress = progress

    def collect(self, root):
        ts = ttystatus.TerminalStatus()
        if self.progress:
            ts.add(ttystatus.Literal('Scanning '))
            ts.add(ttystatus.Pathname('dirname'))

        for dirname, subdirs, filenames in os.walk(root):
            ts['dirname'] = dirname
            subdirs.sort()
            filenames.sort()
            pathnames = [os.path.join(dirname, f) for f in filenames]
            for pathname in pathnames:
                st = os.lstat(pathname)
                if stat.S_ISREG(st.st_mode):
                    # Group regular files by size; only files of equal
                    # size can possibly be identical.
                    t = (st.st_dev, st.st_ino, pathname)
                    if st.st_size in self.by_size:
                        self.by_size[st.st_size].append(t)
                    else:
                        self.by_size[st.st_size] = [t]

        ts.clear()

    def duplicates(self):
        total_bytes = sum(len(tuples) * size
                          for size, tuples in self.by_size.iteritems())

        ts = ttystatus.TerminalStatus(period=0.5)
        ts['done'] = 0
        if self.progress:
            ts.add(ttystatus.Literal('Comparing: '))
            ts.add(ttystatus.ByteSize('done'))
            ts.add(ttystatus.Literal('/'))
            ts.add(ttystatus.ByteSize('total'))
            ts.add(ttystatus.Literal(' ('))
            ts.add(ttystatus.PercentDone('done', 'total'))
            ts.add(ttystatus.Literal('), group '))
            ts.add(ttystatus.Counter('size'))
            ts.add(ttystatus.Literal('/'))
            ts.add(ttystatus.Literal(str(len(self.by_size))))
            ts.add(ttystatus.Literal(' ('))
            ts.add(ttystatus.ByteSize('size'))
            ts.add(ttystatus.Literal(')'))

        result = []
        for size, tuples in sorted(self.by_size.iteritems()):
            if len(set((dev, ino) for dev, ino, pathname in tuples)) > 1:
                # Not all the files are hardlinks to the same inode.
                # (This also excludes groups with just one file.)
                result += self.find_duplicates([p for d, i, p in tuples])
            ts['size'] = size
            ts['done'] += len(tuples) * size
            ts['total'] = total_bytes

        if self.progress:
            ts.finish()
        return result

    def find_duplicates(self, pathnames):
        '''Find sets of duplicate files.

        Return a list of groups of identical files, where each group
        is a list of pathnames to files that are identical.

        '''
        result = []
        identical_groups = [[(x, File(x)) for x in pathnames]]

        while identical_groups:
            new = []
            for group in identical_groups:
                done, not_done = self.read_next_chunks(group)
                if len(done) > 1:
                    # Several files reached end of file together after
                    # reading identical data all the way: duplicates.
                    result.append(done)
                # Partition the remaining files by the chunk just read;
                # each partition with more than one member may still be
                # a group of duplicates.
                while not_done:
                    key = not_done[0][2]
                    temp2 = [t for t in not_done if t[2] == key]
                    not_done = [t for t in not_done if t[2] != key]
                    if len(temp2) > 1:
                        new.append([(pathname, f)
                                    for pathname, f, data in temp2])
                    else:
                        # A unique chunk means a unique file; close it
                        # to release its pooled file handle.
                        for pathname, f, data in temp2:
                            f.close()
            identical_groups = new

        return result

    def read_next_chunks(self, group):
        '''Read the next chunk of data in each file.

        group is a list of tuples (pathname, open file). A chunk of
        data is read from each file.

        Return value is two lists: one with pathnames of files that
        reached the end of the file, and one with tuples (pathname,
        open file, chunk).

        '''
        done = []
        not_done = []
        chunk_size = 64 * 1024
        for pathname, f in group:
            data = f.read(chunk_size)
            if not data:
                f.close()
                done.append(pathname)
            else:
                not_done.append((pathname, f, data))
        return done, not_done


def make_hardlinks(duplicates):
    canonical = duplicates.pop()
    for pathname in duplicates:
        os.remove(pathname)
        os.link(canonical, pathname)


def report(duplicates):
    sys.stdout.write('\n'.join(duplicates))
    sys.stdout.write('\n\n')


def main():
    parser = optparse.OptionParser()
    parser.add_option('--make-hardlinks', action='store_true',
                      help='hardlink duplicate files to each other')
    parser.add_option('--progress', action='store_true',
                      help='report progress')

    opts, args = parser.parse_args()

    dupfinder = DuplicateFileFinder(opts.progress)
    for dirname in sorted(args):
        dupfinder.collect(dirname)

    for duplicates in dupfinder.duplicates():
        if opts.make_hardlinks:
            make_hardlinks(duplicates)
        else:
            report(duplicates)


if __name__ == '__main__':
    main()
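
# Example invocations (a sketch; the script filename "dupfiles.py" and
# the directory arguments are hypothetical, the options are the ones
# defined in main() above):
#
#   python dupfiles.py --progress ~/photos /backup/photos
#   python dupfiles.py --make-hardlinks /srv/mirror
#
# Without --make-hardlinks, each group of identical files is written to
# stdout as newline-separated pathnames, with a blank line after each
# group. With it, all files in a group are replaced by hardlinks to a
# single inode.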