From bfaa23c896df72b8b34ad70c723d05a206a56c19 Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sun, 11 Jun 2017 18:27:14 +0300 Subject: Fix: Make "obnam forget" for GA delete unused chunks --- obnamlib/backup_progress.py | 7 ++-- obnamlib/backup_progress_tests.py | 2 +- obnamlib/fmt_ga/chunk_store.py | 21 ++++++++++- obnamlib/fmt_ga/cowtree.py | 29 +++++++++++++++- obnamlib/fmt_ga/cowtree_tests.py | 29 +++++++++++++++- obnamlib/fmt_ga/indexes.py | 69 +++++++++++++++++++++++++++++++++---- obnamlib/fmt_ga/leaf.py | 6 +++- obnamlib/fmt_ga/leaf_store.py | 15 +++++++- obnamlib/fmt_ga/leaf_store_tests.py | 8 ++++- obnamlib/fmt_ga/leaf_tests.py | 8 ++++- obnamlib/plugins/forget_plugin.py | 1 - pylint.conf | 1 + test-ga-forget | 34 ++++++++++++++++++ 13 files changed, 212 insertions(+), 18 deletions(-) create mode 100755 test-ga-forget diff --git a/obnamlib/backup_progress.py b/obnamlib/backup_progress.py index 026dd604..4927c49f 100644 --- a/obnamlib/backup_progress.py +++ b/obnamlib/backup_progress.py @@ -1,4 +1,4 @@ -# Copyright (C) 2009-2015 Lars Wirzenius +# Copyright (C) 2009-2017 Lars Wirzenius # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -116,7 +116,7 @@ class BackupProgress(object): def report_stats(self, output, fs, quiet, report=None): # pragma: no cover if report is None: report = self.compute_report(fs) - + duration_string = obnamlib.humanise_duration(report['duration']) chunk_amount, chunk_unit = obnamlib.humanise_size( @@ -134,7 +134,8 @@ class BackupProgress(object): overhead_bytes = max(0, overhead_bytes) overhead_amount, overhead_unit = obnamlib.humanise_size(overhead_bytes) if report['uploaded-total-bytes'] > 0: - overhead_percent = 100.0 * overhead_bytes / report['uploaded-total-bytes'] + overhead_percent = ( + 100.0 * overhead_bytes / report['uploaded-total-bytes']) else: overhead_percent = 0.0 diff --git a/obnamlib/backup_progress_tests.py b/obnamlib/backup_progress_tests.py index c3d2dc01..7aba84b1 100644 --- a/obnamlib/backup_progress_tests.py +++ b/obnamlib/backup_progress_tests.py @@ -103,7 +103,7 @@ class DummyTerminalStatus(object): def error(self, msg): pass - + def __setitem__(self, key, value): pass diff --git a/obnamlib/fmt_ga/chunk_store.py b/obnamlib/fmt_ga/chunk_store.py index 9f916276..da1ed4bd 100644 --- a/obnamlib/fmt_ga/chunk_store.py +++ b/obnamlib/fmt_ga/chunk_store.py @@ -1,4 +1,4 @@ -# Copyright 2015 Lars Wirzenius +# Copyright 2015,2017 Lars Wirzenius # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -65,6 +65,25 @@ class GAChunkStore(object): filename=None) return content + def get_bag_id(self, chunk_id): + bag_id, _ = obnamlib.parse_object_id(chunk_id) + return bag_id + + def get_chunks_in_bag(self, bag_id): + try: + bag = self._bag_store.get_bag(bag_id) + except EnvironmentError: + pass + else: + for i in range(len(bag)): + yield obnamlib.make_object_id(bag_id, i) + + def remove_bag(self, bag_id): + try: + self._bag_store.remove_bag(bag_id) + except EnvironmentError: + pass + def has_chunk(self, chunk_id): # This is ugly, 'cause it requires reading in the whole bag. # We could easily check if the bag exists, but not whether it diff --git a/obnamlib/fmt_ga/cowtree.py b/obnamlib/fmt_ga/cowtree.py index f1f74385..abfa27b7 100644 --- a/obnamlib/fmt_ga/cowtree.py +++ b/obnamlib/fmt_ga/cowtree.py @@ -1,4 +1,4 @@ -# Copyright 2016 Lars Wirzenius +# Copyright 2016-2017 Lars Wirzenius # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -69,7 +69,12 @@ class CowTree(object): self._leaf_list.update_leaf(leaf_id, keys[0], keys[-1]) if len(leaf) > self._max_keys_per_leaf: self._leaf_list.drop_leaf(leaf_id) + self._store.remove_leaf(leaf_id) self._split_leaf(leaf) + else: + self._leaf_list.drop_leaf(leaf_id) + self._make_split_leaf(leaf, list(sorted(leaf.keys()))) + self._store.remove_leaf(leaf_id) def _split_leaf(self, leaf): sorted_keys = list(sorted(leaf.keys())) @@ -85,6 +90,24 @@ class CowTree(object): new_id = self._store.put_leaf(new) self._leaf_list.insert_leaf(sorted_keys[0], sorted_keys[-1], new_id) + def remove(self, key): + leaf_id = self._leaf_list.find_leaf_for_key(key) + if leaf_id is not None: + self._leaf_list.drop_leaf(leaf_id) + leaf = self._store.get_leaf(leaf_id) + leaf.remove(key) + self._store.remove_leaf(leaf_id) + if len(leaf) > 0: + self._make_split_leaf(leaf, list(sorted(leaf.keys()))) + + def keys(self): + leaf_ids = list(self._leaf_list.leaf_ids()) + if leaf_ids: + for leaf_id in leaf_ids: + leaf = self._store.get_leaf(leaf_id) + for key in leaf.keys(): + yield key + def commit(self): fake_leaf = obnamlib.CowLeaf() fake_leaf.insert('leaf_list', self._leaf_list.as_dict()) @@ -106,6 +129,10 @@ class _LeafList(object): def from_dict(self, some_dict): self._leaf_list = some_dict + def leaf_ids(self): + for leaf_info in self._leaf_list: + yield leaf_info['id'] + def find_leaf_for_key(self, key): # If there are no leaves, we can't pick one for key. if not self._leaf_list: diff --git a/obnamlib/fmt_ga/cowtree_tests.py b/obnamlib/fmt_ga/cowtree_tests.py index b34ce2f0..1928a607 100644 --- a/obnamlib/fmt_ga/cowtree_tests.py +++ b/obnamlib/fmt_ga/cowtree_tests.py @@ -1,4 +1,4 @@ -# Copyright 2016 Lars Wirzenius +# Copyright 2016-2017 Lars Wirzenius # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -28,6 +28,9 @@ class CowTreeTests(unittest.TestCase): self.cow = obnamlib.CowTree() self.cow.set_leaf_store(self.ls) + def test_has_no_keys_initially(self): + self.assertEqual(list(self.cow.keys()), []) + def test_lookup_returns_none_if_key_is_missing(self): self.assertEqual(self.cow.lookup(42), None) @@ -36,6 +39,24 @@ class CowTreeTests(unittest.TestCase): value = 'barvalue' self.cow.insert(key, value) self.assertEqual(self.cow.lookup(key), value) + self.assertEqual(list(self.cow.keys()), [key]) + + def test_removes_only_key_in_blob(self): + key = 'fookey' + value = 'barvalue' + self.cow.insert(key, value) + self.cow.remove(key) + self.assertEqual(list(self.cow.keys()), []) + + def test_removes_one_of_the_keys_in_blob(self): + key = 'fookey' + value = 'barvalue' + key2 = 'fookey2' + value2 = 'barvalue2' + self.cow.insert(key, value) + self.cow.insert(key2, value2) + self.cow.remove(key) + self.assertEqual(list(self.cow.keys()), [key2]) def test_inserts_many_keys(self): N = 10 @@ -64,3 +85,9 @@ class CowTreeTests(unittest.TestCase): cow2.set_leaf_store(self.ls) cow2.set_list_node(list_id) self.assertEqual(cow2.lookup(key), value) + + def test_iterates_over_leaf_keys(self): + key = 'fookey' + value = 'barvalue' + self.cow.insert(key, value) + self.assertEqual(list(self.cow.keys()), [key]) diff --git a/obnamlib/fmt_ga/indexes.py b/obnamlib/fmt_ga/indexes.py index b57b3877..7660b214 100644 --- a/obnamlib/fmt_ga/indexes.py +++ b/obnamlib/fmt_ga/indexes.py @@ -1,4 +1,4 @@ -# Copyright 2015-2016 Lars Wirzenius +# Copyright 2015-2017 Lars Wirzenius # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -16,6 +16,7 @@ # =*= License: GPL-3+ =*= +import logging import os import obnamlib @@ -126,6 +127,11 @@ class GAChunkIndexes(object): def put_chunk_into_indexes(self, chunk_id, token, client_id): self._load_data() + client_ids = self._used_by_tree.lookup(chunk_id) + logging.debug( + 'xxx before adding to used-by: chunk %r is used by %r', + chunk_id, client_ids) + self._by_chunk_id_tree.insert(chunk_id, token) chunk_ids = self._by_checksum_tree.lookup(token) @@ -135,12 +141,14 @@ class GAChunkIndexes(object): chunk_ids.append(chunk_id) self._by_checksum_tree.insert(token, chunk_ids) - client_ids = self._used_by_tree.lookup(chunk_id) if client_ids is None: client_ids = [client_id] elif client_id not in client_ids: client_ids.append(client_id) self._used_by_tree.insert(chunk_id, client_ids) + logging.debug( + 'xxx after adding to used-by: chunk %r is used by %r', + chunk_id, client_ids) def find_chunk_ids_by_token(self, token): self._load_data() @@ -164,6 +172,9 @@ class GAChunkIndexes(object): def _remove_used_by(self, chunk_id, client_id): still_used = False client_ids = self._used_by_tree.lookup(chunk_id) + logging.debug( + 'xxx before removing used_by: chunk %r is used by %r', + chunk_id, client_ids) if client_ids is not None and client_id in client_ids: client_ids.remove(client_id) self._used_by_tree.insert(chunk_id, client_ids) @@ -173,13 +184,15 @@ class GAChunkIndexes(object): # We leave an empty list, and use that in # remove_unused_chunks to indicate an unused chunk. pass + logging.debug( + 'xxx after removing used_by: chunk %r is used by %r', + chunk_id, client_ids) return still_used def _remove_chunk_by_id(self, chunk_id): token = self._by_chunk_id_tree.lookup(chunk_id) if token is not None: - # FIXME: Should we have CowTree.delete(key)? - self._by_chunk_id_tree.insert(chunk_id, None) + self._by_chunk_id_tree.remove(chunk_id) return token def _remove_chunk_by_checksum(self, chunk_id, token): @@ -192,8 +205,52 @@ class GAChunkIndexes(object): self._used_by_tree.insert(chunk_id, None) def remove_unused_chunks(self, chunk_store): - # FIXME: This requires having a way to list keys in a CowTree. - pass + unused_chunks = self.get_unused_chunks() + self.drop_unused_chunks(unused_chunks) + + maybe_unused_bags = self.get_bags_containing_chunks( + chunk_store, unused_chunks) + for bag_id in maybe_unused_bags: + chunk_ids = chunk_store.get_chunks_in_bag(bag_id) + logging.debug( + 'remove_unused_chunk: can bag %r be removed?', bag_id) + for chunk_id in chunk_ids: + logging.debug( + 'remove_unused_chunk: chunk %r, used: %r', + chunk_id, self.is_chunk_used_by_anyone(chunk_id)) + if not self.any_chunk_is_used_by_someone(chunk_ids): + logging.debug('remove_unused_chunk: remove bag %r', bag_id) + chunk_store.remove_bag(bag_id) + else: + logging.debug( + 'remove_unused_chunk: bag %r cannot be removed', bag_id) + + def get_unused_chunks(self): + return [ + chunk_id + for chunk_id in self._used_by_tree.keys() + if not self.is_chunk_used_by_anyone(chunk_id) + ] + + def is_chunk_used_by_anyone(self, chunk_id): + return self._used_by_tree.lookup(chunk_id) not in (None, []) + + def drop_unused_chunks(self, chunk_ids): + # By this time we know the chunk is only in the used_by tree. + for chunk_id in chunk_ids: + self._used_by_tree.remove(chunk_id) + + def get_bags_containing_chunks(self, chunk_store, chunk_ids): + return set( + chunk_store.get_bag_id(chunk_id) + for chunk_id in chunk_ids + ) + + def any_chunk_is_used_by_someone(self, chunk_ids): + return any( + self.is_chunk_used_by_anyone(chunk_id) + for chunk_id in chunk_ids + ) def validate_chunk_content(self, chunk_id): return None diff --git a/obnamlib/fmt_ga/leaf.py b/obnamlib/fmt_ga/leaf.py index 6522a6a9..7f293cc3 100644 --- a/obnamlib/fmt_ga/leaf.py +++ b/obnamlib/fmt_ga/leaf.py @@ -1,4 +1,4 @@ -# Copyright 2016 Lars Wirzenius +# Copyright 2016-2017 Lars Wirzenius # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -36,6 +36,10 @@ class CowLeaf(object): def insert(self, key, value): self._dict[key] = value + def remove(self, key): + if key in self._dict: + del self._dict[key] + def as_dict(self): return copy.deepcopy(self._dict) diff --git a/obnamlib/fmt_ga/leaf_store.py b/obnamlib/fmt_ga/leaf_store.py index 5d488e96..56b08faa 100644 --- a/obnamlib/fmt_ga/leaf_store.py +++ b/obnamlib/fmt_ga/leaf_store.py @@ -1,4 +1,4 @@ -# Copyright 2016 Lars Wirzenius +# Copyright 2016-2017 Lars Wirzenius # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -27,6 +27,9 @@ class LeafStoreInterface(object): # pragma: no cover def get_leaf(self, leaf_id): raise NotImplementedError() + def remove_leaf(self, leaf_id): + raise NotImplementedError() + def flush(self): raise NotImplementedError() @@ -45,6 +48,10 @@ class InMemoryLeafStore(LeafStoreInterface): def get_leaf(self, leaf_id): return self._leaves.get(leaf_id, None) + def remove_leaf(self, leaf_id): + if leaf_id in self._leaves: + del self._leaves[leaf_id] + def flush(self): pass @@ -65,5 +72,11 @@ class LeafStore(LeafStoreInterface): # pragma: no cover leaf.from_dict(self._blob_store.get_blob(leaf_id)) return leaf + def remove_leaf(self, leaf_id): + # FIXME: This is a bit ugly, since we need to break the + # bag/blob store abstraction. + bag_id, _ = obnamlib.parse_object_id(leaf_id) + self._blob_store._bag_store.remove_bag(bag_id) + def flush(self): self._blob_store.flush() diff --git a/obnamlib/fmt_ga/leaf_store_tests.py b/obnamlib/fmt_ga/leaf_store_tests.py index 9d1a8f71..049d03e4 100644 --- a/obnamlib/fmt_ga/leaf_store_tests.py +++ b/obnamlib/fmt_ga/leaf_store_tests.py @@ -1,4 +1,4 @@ -# Copyright 2016 Lars Wirzenius +# Copyright 2016-2017 Lars Wirzenius # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -40,6 +40,12 @@ class LeafStoreTests(object): def test_returns_None_if_leaf_is_missing(self): self.assertEqual(self.ls.get_leaf(42), None) + def test_removes_leaf(self): + leaf = {'foo': 'bar'} + leaf_id = self.ls.put_leaf(leaf) + self.ls.remove_leaf(leaf_id) + self.assertEqual(self.ls.get_leaf(leaf_id), None) + def test_has_flush(self): self.assertEqual(self.ls.flush(), None) diff --git a/obnamlib/fmt_ga/leaf_tests.py b/obnamlib/fmt_ga/leaf_tests.py index 6c5f2903..30324dd2 100644 --- a/obnamlib/fmt_ga/leaf_tests.py +++ b/obnamlib/fmt_ga/leaf_tests.py @@ -1,4 +1,4 @@ -# Copyright 2016 Lars Wirzenius +# Copyright 2016-2017 Lars Wirzenius # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -52,6 +52,12 @@ class CowLeafTests(unittest.TestCase): leaf.insert('foo', 'bar') self.assertEqual(leaf.keys(), ['foo']) + def test_removes_key(self): + leaf = obnamlib.CowLeaf() + leaf.insert('foo', 'bar') + leaf.remove('foo') + self.assertEqual(leaf.keys(), []) + def test_dict_round_trip(self): leaf = obnamlib.CowLeaf() leaf.insert('foo', 'bar') diff --git a/obnamlib/plugins/forget_plugin.py b/obnamlib/plugins/forget_plugin.py index d5c44530..dea759ba 100644 --- a/obnamlib/plugins/forget_plugin.py +++ b/obnamlib/plugins/forget_plugin.py @@ -103,7 +103,6 @@ class ForgetPlugin(obnamlib.ObnamPlugin): # Commit or unlock everything. self.repo.commit_client(client_name) self.repo.commit_chunk_indexes() - self.repo.remove_unused_chunks() self.repo.unlock_everything() self.app.dump_memory_profile('after committing') diff --git a/pylint.conf b/pylint.conf index 146a84ee..e21137e9 100644 --- a/pylint.conf +++ b/pylint.conf @@ -15,6 +15,7 @@ disable= missing-docstring, no-self-use, protected-access, + redefined-builtin, redefined-variable-type, star-args, too-few-public-methods, diff --git a/test-ga-forget b/test-ga-forget new file mode 100755 index 00000000..9bee8531 --- /dev/null +++ b/test-ga-forget @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright 2017 Lars Wirzenius + +set -eu + +obnam() +{ + env | grep OBNAM_PROFILE + ./obnam --no-default-config \ + --repository t.repo \ + --repository-format green-albatross-20160813 \ + --root t.data \ + --log t.log --log-level debug \ + "$@" +} + +rm -rf t.data t.repo t.log t.*.prof +genbackupdata --create 100M t.data +OBNAM_PROFILE=t.backup.prof obnam backup +genid="$(obnam genids)" +OBNAM_PROFILE=t.forget.prof obnam forget "$genid" + +echo +size="$(du -sm t.repo | awk '{print $1}')" +echo "Repository size: $size" +echo -n "Generations: " +obnam genids | tr '\n' ' ' +echo + +if [ "$size" -gt 1 ] +then + echo "FORGET DIDN'T REMOVE DATA" 1>&2 + exit 1 +fi -- cgit v1.2.1