From 25f74b1cd85ea624e6dd588b3ba19f4738d5c2d4 Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sun, 18 Apr 2010 09:56:53 +1200 Subject: Bugfix: store dev/ino for each pathname, not just for first file with a given size. --- dupfiles | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/dupfiles b/dupfiles index 5474be9..be36f0c 100755 --- a/dupfiles +++ b/dupfiles @@ -59,13 +59,15 @@ class DuplicateFileFinder(object): pathnames = [os.path.join(dirname, f) for f in filenames] for pathname in pathnames: stat = os.stat(pathname) + t = (stat.st_dev, stat.st_ino, pathname) if stat.st_size in self.by_size: - dev, ino, pathnames = self.by_size[stat.st_size] - if stat.st_dev != dev or stat.st_ino != ino: - pathnames.add(pathname) + for dev, ino, pathname in self.by_size[stat.st_size]: + if stat.st_dev == dev and stat.st_ino == ino: + break + else: + self.by_size[stat.st_size].append(t) else: - self.by_size[stat.st_size] = (stat.st_dev, stat.st_ino, - set([pathname])) + self.by_size[stat.st_size] = [t] self.progress.finished() # FIXME: This computed the checksum for each file, the full file. @@ -75,12 +77,12 @@ class DuplicateFileFinder(object): # groups. If a group becomes just a single file, that file is not # a duplicate and can be dropped completely. def duplicates(self): - total_files = sum(len(x[2]) for x in self.by_size.itervalues()) + total_files = sum(len(x) for x in self.by_size.itervalues()) done_files = 0 result = [] - for dev, ino, pathnames in self.by_size.itervalues(): + for tuples in self.by_size.itervalues(): by_checksum = dict() - for pathname in pathnames: + for dev, ino, pathname in tuples: checksum = self.file_checksum(pathname) if checksum not in by_checksum: by_checksum[checksum] = set() @@ -124,6 +126,7 @@ def main(): dupfinder = DuplicateFileFinder(progress) for dirname in sorted(args): dupfinder.collect(dirname) + for duplicates in dupfinder.duplicates(): if opts.make_hardlinks: make_hardlinks(duplicates) -- cgit v1.2.1