Rewrite read() to fix it, simplify and pessimise it

The previous code was broken: it assumed that most chunks (all but the last one in a file) were of the same size, and that's not a valid assumption. It is almost always true, but the user may change the chunk size setting for any backup run, and then the assumption is no longer true. The new code is much simpler. It is also, however, much slower, since it has to read through everything. We can optimise this, later, but caching the chunk sizes, and even later by making it possible to find the length of a chunk without downloading it from the repository.
author: Lars Wirzenius <liw@liw.fi> 2014-02-23 18:14:07 +0000
committer: Lars Wirzenius <liw@liw.fi> 2014-02-23 18:14:07 +0000
commit: 6031e77e693bb02851737e3641076e559e8692d7 (patch)
tree: c6b63ad0f9a99e1c531fa51854cdfa92c775d3ad /obnamlib/plugins/fuse_plugin.py
parent: abfb13ca95e36c15f9b5041aba3139ddc6d39650 (diff)
download: obnam-6031e77e693bb02851737e3641076e559e8692d7.tar.gz
1 files changed, 32 insertions, 63 deletions
diff --git a/obnamlib/plugins/fuse_plugin.py b/obnamlib/plugins/fuse_plugin.py
index 38ff85fb..469ae23c 100644
--- a/obnamlib/plugins/fuse_plugin.py
+++ b/obnamlib/plugins/fuse_plugin.py
@@ -85,10 +85,6 @@ class ObnamFuseFile(object):
         tracing.trace('mode=%r', mode)
 
         self.path = path
-        self.chunkids = None
-        self.chunksize = None
-        self.lastdata = None
-        self.lastblock = None
 
         if flags & self.write_flags:
             raise IOError(errno.EROFS, 'Read only filesystem')
@@ -137,73 +133,47 @@ class ObnamFuseFile(object):
 
         gen, repopath = self.fs.get_gen_path(self.path)
 
-        # if stored inside B-tree
+        # The file's data content may be stored in the per-client B-tree.
+        # If so, we retrieve the data from there.
         contents = repo.get_file_data(gen, repopath)
         if contents is not None:
             return contents[offset:offset+length]
 
-        # stored in chunks
-        if not self.chunkids:
-            self.chunkids = repo.get_file_chunks(gen, repopath)
-
-        if len(self.chunkids) == 1:
-            if not self.lastdata:
-                self.lastdata = repo.get_chunk(self.chunkids[0])
-            return self.lastdata[offset:offset+length]
-
-        chunkdata = None
-        if not self.chunksize:
-            # take the cached value as the first guess for chunksize
-            self.chunksize = self.fs.sizecache.get(gen, self.fs.chunksize)
-            blocknum = offset/self.chunksize
-            blockoffs = offset - blocknum*self.chunksize
-
-            # read a chunk if guessed blocknum and chunksize make sense
-            if blocknum < len(self.chunkids):
-                chunkdata = repo.get_chunk(self.chunkids[blocknum])
-            else:
-                chunkdata = ''
-
-            # check if chunkdata is of expected length
-            validate = min(self.chunksize, self.metadata.st_size - blocknum*self.chunksize)
-            if validate != len(chunkdata):
-                if blocknum < len(self.chunkids)-1:
-                    # the length of all but last chunks is chunksize
-                    self.chunksize = len(chunkdata)
-                else:
-                    # guessing failed, get the length of the first chunk
-                    self.chunksize = len(repo.get_chunk(self.chunkids[0]))
-                chunkdata = None
-
-            # save correct chunksize
-            self.fs.sizecache[gen] = self.chunksize
-
-        if not chunkdata:
-            blocknum = offset/self.chunksize
-            blockoffs = offset - blocknum*self.chunksize
-            if self.lastblock == blocknum:
-                chunkdata = self.lastdata
-            else:
-                chunkdata = repo.get_chunk(self.chunkids[blocknum])
-
+        # Otherwise, the file has a list of chunks, and we need to
+        # find the right ones and return data from them. Note that we
+        # can't compute a seek: there is no guarantee all the chunks
+        # are of the same size. The user may have changed the chunk
+        # size setting between each backup run. Thus, we have to
+        # iterate over the list of chunk ids for the file, until we
+        # find the right place.
+        #
+        # This is, obviously, not good for performance.
+        #
+        # Note that previous code here did the wrong thing by assuming
+        # the chunk size was fixed, except for the last chunk for any
+        # file.
+
+        chunkids = repo.get_file_chunks(gen, repopath)
         output = []
-        while True:
-            output.append(chunkdata[blockoffs:blockoffs+length])
-            readlength = len(chunkdata) - blockoffs
-            if length > readlength and blocknum < len(self.chunkids)-1:
-                length -= readlength
-                blocknum += 1
-                blockoffs = 0
-                chunkdata = repo.get_chunk(self.chunkids[blocknum])
-            else:
-                self.lastblock = blocknum
-                self.lastdata = chunkdata
-                break
+        output_length = 0
+        chunk_pos_in_file = 0
+        for chunkid in chunkids:
+            contents = repo.get_chunk(chunkid)
+            if chunk_pos_in_file + len(contents) >= offset:
+                start = offset - chunk_pos_in_file
+                n = length - output_length
+                data = contents[start : n]
+                output.append(data)
+                output_length += len(data)
+                assert output_length <= length
+                if output_length == length:
+                    break
+            chunk_pos_in_file += len(contents)
+
         return ''.join(output)
 
     def release(self, flags):
         tracing.trace('flags=%r', flags)
-        self.lastdata = None
         return 0
 
     def fsync(self, isfsyncfile):
@@ -334,7 +304,6 @@ class ObnamFuse(fuse.Fuse):
         ObnamFuseFile.fs = self
         self.file_class = ObnamFuseFile
         self.metadatacache = {}
-        self.chunksize = self.obnam.app.settings['chunk-size']
         self.sizecache = {}
         self.rootlist = None
         self.rootstat = None
author	Lars Wirzenius <liw@liw.fi>	2014-02-23 18:14:07 +0000
committer	Lars Wirzenius <liw@liw.fi>	2014-02-23 18:14:07 +0000
commit	6031e77e693bb02851737e3641076e559e8692d7 (patch)
tree	c6b63ad0f9a99e1c531fa51854cdfa92c775d3ad /obnamlib/plugins/fuse_plugin.py
parent	abfb13ca95e36c15f9b5041aba3139ddc6d39650 (diff)
download	obnam-6031e77e693bb02851737e3641076e559e8692d7.tar.gz