Normalize inode and device numbers. This is necessary so that
if a directory tree is moved to a new disk, the manifest doesn't change.
author: Lars Wirzenius <liw@liw.fi> 2010-01-01 18:53:55 +0200
committer: Lars Wirzenius <liw@liw.fi> 2010-01-01 18:53:55 +0200
commit: 29183aa2d6b48a93ee380d37f10773ac51308305 (patch)
tree: 915214f575092b0159fe8566c01c7367d8d32ffe
parent: c1a0664e2fe5210293ae3fb13151ce1f32855339 (diff)
download: summain-29183aa2d6b48a93ee380d37f10773ac51308305.tar.gz
2 files changed, 108 insertions, 6 deletions
diff --git a/summainlib.py b/summainlib.py
index 29ae21e..31b42a7 100644
--- a/summainlib.py
+++ b/summainlib.py
@@ -23,6 +23,57 @@ import time
 import urllib
 
 
+class NumberNormalizer(object):
+
+    '''Normalize inode and device numbers.
+    
+    When we make two manifests of the same directory tree, but the
+    tree may have been moved to another disk, the inode and device
+    numbers may be different. This should not be a cause for concern,
+    however. What is important is that if two names were hardlinked
+    to the same file before, they still are, and if they weren't,
+    they still aren't.
+    
+    To achieve this, we normalize the inode and device numbers.
+    The input files are fed to the normalizer in a deterministic
+    sequence, and the sequence defines the numbers we use. Thus,
+    if the input files have inode numbers [42, 13, 105], we produce
+    [1, 2, 3]. If one of the input numbers is repeated, that number
+    is re-used.
+    
+    This is not a perfect solution. If the second manifest has a
+    new file, it will throw off the entire remaining sequence, causing
+    a big diff. But we'll live with that.
+    
+    '''
+    
+    def __init__(self):
+        self.reset()
+
+    def get(self, input_number, numbers, next):
+        if input_number in numbers:
+            return numbers[input_number], next
+        else:
+            numbers[input_number] = next
+            return numbers[input_number], next + 1
+    
+    def get_ino(self, ino):
+        output, self.next_ino = self.get(ino, self.ino_numbers, self.next_ino)
+        return output
+    
+    def get_dev(self, dev):
+        output, self.next_dev = self.get(dev, self.dev_numbers, self.next_dev)
+        return output
+
+    def reset(self):
+        '''This is used by unit tests.'''
+        self.ino_numbers = dict()
+        self.next_ino = 1
+
+        self.dev_numbers = dict()
+        self.next_dev = 1
+
+
 class FilesystemObject(object):
 
     '''An object in the file system.
@@ -34,7 +85,7 @@ class FilesystemObject(object):
     
     '''
     
-    def __init__(self, filename, stat_result=None, sha1=None,
+    def __init__(self, filename, normalizer, stat_result=None, sha1=None,
                  open_file=None, readlink=None):
         stat_result = stat_result or os.lstat(filename)
         sha1 = sha1 or hashlib.sha1()
@@ -45,8 +96,8 @@ class FilesystemObject(object):
         self['Name'] = filename
         self['Mtime'] = self.format_time(stat_result.st_mtime)
         self['Mode'] = '%o' % stat_result.st_mode
-        self['Ino'] = '%d' % stat_result.st_ino
-        self['Dev'] = '%d' % stat_result.st_dev
+        self['Ino'] = '%d' % normalizer.get_ino(stat_result.st_ino)
+        self['Dev'] = '%d' % normalizer.get_dev(stat_result.st_dev)
         self['Nlink'] = '%d' % stat_result.st_nlink
         self['Size'] = '%d' % stat_result.st_size
         self['Uid'] = '%d' % stat_result.st_uid
diff --git a/summainlib_tests.py b/summainlib_tests.py
index 8d07dca..8068b53 100644
--- a/summainlib_tests.py
+++ b/summainlib_tests.py
@@ -76,8 +76,11 @@ class FilesystemObjectTests(unittest.TestCase):
                                  st_uid=0,
                                  st_gid=0)
 
+        self.nn = summainlib.NumberNormalizer()
+
     def new(self, name):
-        return summainlib.FilesystemObject(name, stat_result=self.st,
+        return summainlib.FilesystemObject(name, self.nn, 
+                                           stat_result=self.st,
                                            sha1=FakeSha1(),
                                            open_file=FakeOpenFile(),
                                            readlink=FakeReadlink(self))
@@ -96,10 +99,12 @@ class FilesystemObjectTests(unittest.TestCase):
         self.assertEqual(self.new('foo')['Mode'], '100644')
 
     def test_formats_inode_number_correctly(self):
-        self.assertEqual(self.new('foo')['Ino'], '12765')
+        # Note: normalization makes the result be 1.
+        self.assertEqual(self.new('foo')['Ino'], '1')
 
     def test_formats_device_number_correctly(self):
-        self.assertEqual(self.new('foo')['Dev'], '42')
+        # Note: normalization makes the result be 1.
+        self.assertEqual(self.new('foo')['Dev'], '1')
 
     def test_formats_link_count_correctly(self):
         self.assertEqual(self.new('foo')['Nlink'], '2')
@@ -133,3 +138,49 @@ class FilesystemObjectTests(unittest.TestCase):
     def test_formats_target_correctly_for_regular_file(self):
         self.assertEqual(self.new('foo')['Target'], '')
 
+
+class FilesystemObjectNormalizedNumbersTests(unittest.TestCase):
+
+    def setUp(self):
+        self.ino = 0
+        self.dev = 0
+        self.nn = summainlib.NumberNormalizer()
+        
+    def reset(self):
+        self.dev += 1
+        self.nn.reset()
+
+    def new(self, name):
+        st = FakeStatResult(st_ino=self.ino, st_dev=self.dev, st_mtime=0,
+                           st_mode=stat.S_IFREG|0, st_nlink=1, st_size=0,
+                           st_uid=0, st_gid=0)
+        self.ino += 1
+        return summainlib.FilesystemObject(name, self.nn, stat_result=st,
+                                           sha1=FakeSha1(),
+                                           open_file=FakeOpenFile(),
+                                           readlink=FakeReadlink(self))
+
+    def test_inode_numbers_are_repeatable(self):
+        a1 = self.new('foo')
+        a2 = self.new('bar')
+        self.reset()
+        b1 = self.new('foo')
+        b2 = self.new('bar')
+        self.assertEqual(a1['Dev'], b1['Dev'])
+        self.assertEqual(a1['Ino'], b1['Ino'])
+        self.assertEqual(a2['Dev'], b2['Dev'])
+        self.assertEqual(a2['Ino'], b2['Ino'])
+        
+        
+class NumberNormalizerTests(unittest.TestCase):
+
+    def setUp(self):
+        self.nn = summainlib.NumberNormalizer()
+        
+    def test_returns_1_2_3_regardless_of_input_numbers(self):
+        self.assertEqual([self.nn.get_ino(i) for i in [10, 11, 12]],
+                         [1, 2, 3])
+                         
+    def test_returns_1_1_1_when_input_number_is_repeated(self):
+        self.assertEqual([self.nn.get_ino(i) for i in [10, 10, 10]],
+                         [1, 1, 1])
author	Lars Wirzenius <liw@liw.fi>	2010-01-01 18:53:55 +0200
committer	Lars Wirzenius <liw@liw.fi>	2010-01-01 18:53:55 +0200
commit	29183aa2d6b48a93ee380d37f10773ac51308305 (patch)
tree	915214f575092b0159fe8566c01c7367d8d32ffe
parent	c1a0664e2fe5210293ae3fb13151ce1f32855339 (diff)
download	summain-29183aa2d6b48a93ee380d37f10773ac51308305.tar.gz