From 92dc0a5dc50ea0c10a8b6120af60f94b196408d3 Mon Sep 17 00:00:00 2001
From: Lars Wirzenius
Date: Sun, 18 Apr 2010 14:26:32 +1200
Subject: Do not read the whole file into memory when doing a checksum.

---
 dupfiles | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/dupfiles b/dupfiles
index 0f9dd48..d95237c 100755
--- a/dupfiles
+++ b/dupfiles
@@ -115,7 +115,15 @@ class DuplicateFileFinder(object):
         return '0 B'
 
     def file_checksum(self, pathname):
-        return hashlib.md5(file(pathname, 'rb').read()).digest()
+        cs = hashlib.md5()
+        f = file(pathname, 'rb')
+        while True:
+            data = f.read(64*1024)
+            if not data:
+                break
+            cs.update(data)
+        f.close()
+        return cs.digest()
 
 
 def make_hardlinks(duplicates):
-- 
cgit v1.2.1