#!/usr/bin/python
#
# Find duplicate files and do something with them.
# Copyright 2010 Lars Wirzenius
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import cliapp
import errno
import hashlib
import optparse
import os
import random
import stat
import sys
import time
import ttystatus


version = '5.3'


class FileStats(object):

    def __init__(self):
        self.open_count = 0
        self.close_count = 0
        self.hit_count = 0


filestats = FileStats()
filepool = dict()


class File(object):

    def __init__(self, pathname):
        self.pathname = pathname
        self.offset = 0

    def read(self, num_bytes):
        if self.pathname in filepool:
            fd = filepool[self.pathname]
            filestats.hit_count += 1
        else:
            try:
                fd = os.open(self.pathname, os.O_RDONLY)
            except OSError, e:
                if e.errno != errno.EMFILE:
                    raise
                victim = random.choice(filepool.keys())
                os.close(filepool[victim])
                del filepool[victim]
                filestats.close_count += 1
                fd = os.open(self.pathname, os.O_RDONLY)
            os.lseek(fd, self.offset, os.SEEK_SET)
            filepool[self.pathname] = fd
            filestats.open_count += 1
        data = os.read(fd, num_bytes)
        self.offset += len(data)
        return data

    def close(self):
        if self.pathname in filepool:
            os.close(filepool[self.pathname])
            del filepool[self.pathname]
            filestats.close_count += 1


class DuplicateFileFinder(object):

    def __init__(self, progress):
        self.by_size = dict()
        self.progress = progress

    def collect(self, root):
        ts = ttystatus.TerminalStatus()
        if self.progress:
            ts.add(ttystatus.Literal('Scanning '))
            ts.add(ttystatus.Pathname('dirname'))

        for dirname, subdirs, filenames in os.walk(root):
            ts['dirname'] = dirname
            subdirs.sort()
            filenames.sort()
            pathnames = [os.path.join(dirname, f) for f in filenames]
            for pathname in pathnames:
                st = os.lstat(pathname)
                if stat.S_ISREG(st.st_mode):
                    t = (st.st_dev, st.st_ino, pathname)
                    if st.st_size in self.by_size:
                        self.by_size[st.st_size].append(t)
                    else:
                        self.by_size[st.st_size] = [t]
        ts.clear()

    def duplicates(self):
        total_bytes = sum(len(tuples) * size
                          for size, tuples in self.by_size.iteritems())
        ts = ttystatus.TerminalStatus(period=0.5)
        ts['done'] = 0
        if self.progress:
            ts.add(ttystatus.Literal('Comparing: '))
            ts.add(ttystatus.ByteSize('done'))
            ts.add(ttystatus.Literal('/'))
            ts.add(ttystatus.ByteSize('total'))
            ts.add(ttystatus.Literal(' ('))
            ts.add(ttystatus.PercentDone('done', 'total'))
            ts.add(ttystatus.Literal('), group '))
            ts.add(ttystatus.Counter('size'))
            ts.add(ttystatus.Literal('/'))
            ts.add(ttystatus.Literal(str(len(self.by_size))))
            ts.add(ttystatus.Literal(' ('))
            ts.add(ttystatus.ByteSize('size'))
            ts.add(ttystatus.Literal(')'))

        result = []
        ith = 0
        for size, tuples in sorted(self.by_size.iteritems()):
            ith += 1
            if len(set((dev, ino) for dev, ino, pathname in tuples)) > 1:
                # Not all files are hardlinks to the same inode.
                # (This also excludes groups with just one file.)
                result += self.find_duplicates([p for d, i, p in tuples])
            ts['size'] = size
            ts['done'] += len(tuples) * size
            ts['total'] = total_bytes
        if self.progress:
            ts.finish()
        return result

    def find_duplicates(self, pathnames):
        '''Find sets of duplicate files.

        Return list of groups of identical files, where each group is
        a list of pathnames to files that are identical.

        '''

        result = []
        identical_groups = [[(x, File(x)) for x in pathnames]]

        while identical_groups:
            new = []
            for group in identical_groups:
                done, not_done = self.read_next_chunks(group)
                if len(done) > 1:
                    result.append(done)
                while not_done:
                    key = not_done[0][2]
                    temp2 = [t for t in not_done if t[2] == key]
                    not_done = [t for t in not_done if t[2] != key]
                    if len(temp2) > 1:
                        new.append([(pathname, f)
                                    for pathname, f, data in temp2])
            identical_groups = new

        return result

    def read_next_chunks(self, group):
        '''Read next chunk of data in each file.

        group is a list of tuples (pathname, open file). A chunk of data
        is read from each file.

        Return value is two lists: one with filenames that reached the
        end of the file, and one with tuples (pathname, open file, chunk).

        '''

        done = []
        not_done = []
        chunk_size = 4 * 1024
        for pathname, f in group:
            data = f.read(chunk_size)
            if not data:
                f.close()
                done.append(pathname)
            else:
                not_done.append((pathname, f, data))
        return done, not_done


class Dupfiles(cliapp.Application):

    def add_settings(self):
        self.settings.boolean(['make-hardlinks'],
                              'hardlink duplicate files to each other')
        self.settings.boolean(['progress'], 'report progress')
        self.settings.boolean(['remove'],
                              'remove all but one copy of identical files')

    def process_args(self, args):
        dupfinder = DuplicateFileFinder(self.settings['progress'])
        for dirname in sorted(args):
            dupfinder.collect(dirname)

        for duplicates in dupfinder.duplicates():
            if self.settings['make-hardlinks']:
                self.make_hardlinks(duplicates)
            elif self.settings['remove']:
                self.remove_all_but_one(duplicates)
            else:
                self.report(duplicates)

    def get_meta(self, name):
        st = os.lstat(name)
        return st.st_uid, st.st_gid, st.st_mode

    def make_hardlinks(self, duplicates):
        dups = [(name, self.get_meta(name)) for name in duplicates]
        while dups:
            canonical, meta = dups[0]
            dups = dups[1:]
            same = [x for x, y in dups if y == meta]
            dups = [(x, y) for x, y in dups if y != meta]
            for pathname in same:
                os.remove(pathname)
                os.link(canonical, pathname)

    def remove_all_but_one(self, duplicates):
        keep = duplicates.pop()
        for pathname in duplicates:
            os.remove(pathname)

    def report(self, duplicates):
        sys.stdout.write('\n'.join(duplicates))
        sys.stdout.write('\n\n')


if __name__ == '__main__':
    Dupfiles(version=version).run()
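# ---------------------------------------------------------------------------
# Usage sketch, assuming cliapp turns the boolean settings declared in
# add_settings() into long command line options (--make-hardlinks,
# --progress, --remove); the paths below are illustrative only:
#
#     ./dupfiles --progress ~/photos          # print groups of identical files
#     ./dupfiles --make-hardlinks ~/photos    # replace duplicates with hardlinks
#     ./dupfiles --remove ~/photos ~/backup   # keep one copy, delete the rest
#
# The finder can also be driven directly from Python; the module name
# 'dupfiles' is an assumption for illustration:
#
#     from dupfiles import DuplicateFileFinder
#     finder = DuplicateFileFinder(progress=False)
#     finder.collect('/some/directory')
#     for group in finder.duplicates():
#         print '\n'.join(group)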