diff options
author | Lars Wirzenius <liw@liw.fi> | 2015-11-17 17:47:51 +0000 |
---|---|---|
committer | Lars Wirzenius <liw@liw.fi> | 2015-11-17 20:38:04 +0000 |
commit | 00d06aece5ab29ecc231805184dbe63a66004e4d (patch) | |
tree | 7302721fcb5d520fd20ee6e46ab9ad0315b92fb8 | |
parent | 16ece88991f1c36986741604389c383ebc5cc129 (diff) | |
download | obnam-00d06aece5ab29ecc231805184dbe63a66004e4d.tar.gz |
Simplify find_chunk_ids_by_content
There's really no reason to treat the empty case specially.
-rw-r--r-- | obnamlib/fmt_ga/indexes.py | 106 |
1 files changed, 73 insertions, 33 deletions
diff --git a/obnamlib/fmt_ga/indexes.py b/obnamlib/fmt_ga/indexes.py index 2f204645..ef78b5b3 100644 --- a/obnamlib/fmt_ga/indexes.py +++ b/obnamlib/fmt_ga/indexes.py @@ -51,20 +51,6 @@ class GAChunkIndexes(object): filename = self._get_filename() self._fs.overwrite_file(filename, blob) - def _get_filename(self): - return os.path.join(self.get_dirname(), 'data.dat') - - def prepare_chunk_for_indexes(self, chunk_content): - return hashlib.sha512(chunk_content).hexdigest() - - def put_chunk_into_indexes(self, chunk_id, token, client_id): - self._load_data() - self._data['index'].append({ - 'chunk-id': chunk_id, - 'sha512': token, - 'client-id': client_id, - }) - def _load_data(self): if not self._data_is_loaded: filename = self._get_filename() @@ -74,20 +60,46 @@ class GAChunkIndexes(object): assert self._data is not None else: self._data = { - 'index': [], + 'by_chunk_id': { + }, + 'by_checksum': { + 'sha512': {}, + }, + 'used_by': { + }, } self._data_is_loaded = True + def _get_filename(self): + return os.path.join(self.get_dirname(), 'data.dat') + + def prepare_chunk_for_indexes(self, chunk_content): + return hashlib.sha512(chunk_content).hexdigest() + + def put_chunk_into_indexes(self, chunk_id, token, client_id): + self._load_data() + + by_chunk_id = self._data['by_chunk_id'] + by_chunk_id[chunk_id] = token + + by_checksum = self._data['by_checksum']['sha512'] + chunk_ids = by_checksum.get(token, []) + if chunk_id not in chunk_ids: + chunk_ids.append(chunk_id) + by_checksum[token] = chunk_ids + + used_by = self._data['used_by'] + client_ids = used_by.get(chunk_id, []) + if client_id not in client_ids: + client_ids.append(client_id) + used_by[chunk_id] = client_ids + def find_chunk_ids_by_content(self, chunk_content): self._load_data() - if 'index' in self._data: - token = self.prepare_chunk_for_indexes(chunk_content) - result = [ - record['chunk-id'] - for record in self._data['index'] - if record['sha512'] == token] - else: - result = [] + + token = self.prepare_chunk_for_indexes(chunk_content) + by_checksum = self._data['by_checksum']['sha512'] + result = by_checksum.get(token, []) if not result: raise obnamlib.RepositoryChunkContentNotInIndexes() @@ -95,19 +107,47 @@ class GAChunkIndexes(object): def remove_chunk_from_indexes(self, chunk_id, client_id): self._load_data() - self._data['index'] = self._filter_out( - self._data['index'], - lambda x: - x['chunk-id'] == chunk_id and x['client-id'] == client_id) - - def _filter_out(self, records, pred): - return [record for record in records if not pred(record)] + if not self._remove_used_by(chunk_id, client_id): + token = self._remove_chunk_by_id(chunk_id) + self._remove_chunk_by_checksum(chunk_id, token) def remove_chunk_from_indexes_for_all_clients(self, chunk_id): self._load_data() - self._data['index'] = self._filter_out( - self._data['index'], - lambda x: x['chunk-id'] == chunk_id) + token = self._remove_chunk_by_id(chunk_id) + self._remove_chunk_by_checksum(chunk_id, token) + self._remove_all_used_by(chunk_id) + + def _remove_used_by(self, chunk_id, client_id): + still_used = False + used_by = self._data['used_by'] + client_ids = used_by.get(chunk_id, []) + if client_id in client_ids: + client_ids.remove(client_id) + if client_ids: + still_used = True + else: + del used_by[chunk_id] + return still_used + + def _remove_chunk_by_id(self, chunk_id): + by_chunk_id = self._data['by_chunk_id'] + token = by_chunk_id.get(chunk_id, None) + if token is not None: + del by_chunk_id[chunk_id] + return token + + def _remove_chunk_by_checksum(self, chunk_id, token): + by_checksum = self._data['by_checksum']['sha512'] + chunk_ids = by_checksum.get(token, []) + if chunk_id in chunk_ids: + chunk_ids.remove(chunk_id) + if not chunk_ids: + del by_checksum[token] + + def _remove_all_used_by(self, chunk_id): + used_by = self._data['used_by'] + if chunk_id in used_by: + del used_by[chunk_id] def validate_chunk_content(self, chunk_id): return None |