diff options
author | Lars Wirzenius <liw@liw.fi> | 2021-02-06 16:42:44 +0000 |
---|---|---|
committer | Lars Wirzenius <liw@liw.fi> | 2021-02-06 16:42:44 +0000 |
commit | aa75f0d5709fb4062900db5ab5b6e2598b6af667 (patch) | |
tree | 861aa3b61e16e2a486c61eabf1bf2e27a223679e | |
parent | 375d05a8492c391b229b65a8752c4151ffd5bb20 (diff) | |
parent | 61aca01941d3bdf324a207f2b53dbf7128169142 (diff) | |
download | obnam2-aa75f0d5709fb4062900db5ab5b6e2598b6af667.tar.gz |
Merge branch 'chunk-size' into 'main'
Make chunk size configurable
Closes #44
See merge request larswirzenius/obnam!88
-rw-r--r-- | obnam.md | 27 | ||||
-rw-r--r-- | src/backup_run.rs | 4 | ||||
-rw-r--r-- | src/bin/obnam-server.rs | 2 | ||||
-rw-r--r-- | src/bin/obnam.rs | 8 | ||||
-rw-r--r-- | src/client.rs | 38 | ||||
-rw-r--r-- | src/cmd/backup.rs | 10 | ||||
-rw-r--r-- | src/index.rs | 15 | ||||
-rw-r--r-- | src/indexedstore.rs | 28 | ||||
-rw-r--r-- | subplot/data.py | 15 | ||||
-rw-r--r-- | subplot/data.yaml | 4 | ||||
-rw-r--r-- | subplot/server.py | 21 | ||||
-rw-r--r-- | subplot/server.yaml | 3 |
12 files changed, 144 insertions, 31 deletions
@@ -771,6 +771,10 @@ The server has the following API for managing chunks: * `GET /chunks?sha256=xyzzy` – find chunks on the server whose metadata indicates their contents has a given SHA256 checksum * `GET /chunks?generation=true` – find generation chunks +* `GET /chunks?data=true` – find chunks with file data + - this is meant for testing only + - it excludes generation chunks, and chunks used to store the + generation's SQLite file HTTP status codes are used to indicate if a request succeeded or not, using the customary meanings. @@ -858,6 +862,7 @@ when I POST data.dat to /chunks, with chunk-meta: {"sha256":"abc"} then HTTP status code is 201 and content-type is application/json and the JSON body has a field chunk_id, henceforth ID +and server has 1 file chunks ~~~ We must be able to retrieve it. @@ -1145,6 +1150,28 @@ given a manifest of the directory live restored in rest in rest.yaml then files live.yaml and rest.yaml match ~~~ +## Set chunk size + +This scenario verifies that the user can set the chunk size in the +configuration file. The chunk size only affects the chunks of live +data. 
+ +~~~scenario +given an installed obnam +given a running chunk server +given a client config based on tiny-chunk-size.yaml +given a file live/data.dat containing "abc" +when I run obnam --config tiny-chunk-size.yaml backup +then server has 3 file chunks +~~~ + +~~~{#tiny-chunk-size.yaml .file .yaml .numberLines} +verify_tls_cert: false +root: live +chunk_size: 1 +~~~ + + ## Backup or not for the right reason The decision of whether to back up a file or keep the version in the diff --git a/src/backup_run.rs b/src/backup_run.rs index 05d5988..fce9a73 100644 --- a/src/backup_run.rs +++ b/src/backup_run.rs @@ -30,14 +30,14 @@ pub enum BackupError { pub type BackupResult<T> = Result<T, BackupError>; impl BackupRun { - pub fn new(config: &ClientConfig, buffer_size: usize) -> BackupResult<Self> { + pub fn new(config: &ClientConfig) -> BackupResult<Self> { let client = BackupClient::new(config)?; let policy = BackupPolicy::new(); let progress = BackupProgress::new(); Ok(Self { client, policy, - buffer_size, + buffer_size: config.chunk_size, progress, }) } diff --git a/src/bin/obnam-server.rs b/src/bin/obnam-server.rs index 19f2e99..0e9d4e6 100644 --- a/src/bin/obnam-server.rs +++ b/src/bin/obnam-server.rs @@ -188,6 +188,8 @@ pub async fn search_chunks( } if key == "generation" && value == "true" { store.find_generations().expect("SQL lookup failed") + } else if key == "data" && value == "true" { + store.find_file_chunks().expect("SQL lookup failed") } else if key == "sha256" { store.find_by_sha256(value).expect("SQL lookup failed") } else { diff --git a/src/bin/obnam.rs b/src/bin/obnam.rs index 9c5d3f4..8778a73 100644 --- a/src/bin/obnam.rs +++ b/src/bin/obnam.rs @@ -6,8 +6,6 @@ use obnam::cmd::{backup, get_chunk, list, list_files, restore, show_config, show use std::path::{Path, PathBuf}; use structopt::StructOpt; -const BUFFER_SIZE: usize = 1024 * 1024; - fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); let config_file = match opt.config { @@ -15,15 +13,13 
@@ fn main() -> anyhow::Result<()> { Some(ref path) => path.to_path_buf(), }; let config = ClientConfig::read_config(&config_file)?; - if let Some(ref log) = config.log { - setup_logging(&log)?; - } + setup_logging(&config.log)?; info!("client starts"); debug!("{:?}", opt); let result = match opt.cmd { - Command::Backup => backup(&config, BUFFER_SIZE), + Command::Backup => backup(&config), Command::List => list(&config), Command::ShowGeneration { gen_id } => show_generation(&config, &gen_id), Command::ListFiles { gen_id } => list_files(&config, &gen_id), diff --git a/src/client.rs b/src/client.rs index 7cd6df7..7a4ce21 100644 --- a/src/client.rs +++ b/src/client.rs @@ -17,16 +17,35 @@ use std::fs::File; use std::io::prelude::*; use std::path::{Path, PathBuf}; +const DEFAULT_CHUNK_SIZE: usize = 1024 * 1024; +const DEVNULL: &str = "/dev/null"; + +#[derive(Debug, Serialize, Deserialize, Clone)] +struct TentativeClientConfig { + server_url: String, + verify_tls_cert: Option<bool>, + chunk_size: Option<usize>, + root: PathBuf, + log: Option<PathBuf>, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct ClientConfig { pub server_url: String, pub verify_tls_cert: bool, + pub chunk_size: usize, pub root: PathBuf, - pub log: Option<PathBuf>, + pub log: PathBuf, } #[derive(Debug, thiserror::Error)] pub enum ClientConfigError { + #[error("server_url is empty")] + ServerUrlIsEmpty, + + #[error("backup root is unset or empty")] + NoBackupRoot, + #[error("server URL doesn't use https: {0}")] NotHttps(String), @@ -43,15 +62,30 @@ impl ClientConfig { pub fn read_config(filename: &Path) -> ClientConfigResult<Self> { trace!("read_config: filename={:?}", filename); let config = std::fs::read_to_string(filename)?; - let config: ClientConfig = serde_yaml::from_str(&config)?; + let tentative: TentativeClientConfig = serde_yaml::from_str(&config)?; + + let config = ClientConfig { + server_url: tentative.server_url, + root: tentative.root, + verify_tls_cert: 
tentative.verify_tls_cert.or(Some(false)).unwrap(), + chunk_size: tentative.chunk_size.or(Some(DEFAULT_CHUNK_SIZE)).unwrap(), + log: tentative.log.or(Some(PathBuf::from(DEVNULL))).unwrap(), + }; + config.check()?; Ok(config) } fn check(&self) -> Result<(), ClientConfigError> { + if self.server_url.is_empty() { + return Err(ClientConfigError::ServerUrlIsEmpty); + } if !self.server_url.starts_with("https://") { return Err(ClientConfigError::NotHttps(self.server_url.to_string())); } + if self.root.to_string_lossy().is_empty() { + return Err(ClientConfigError::NoBackupRoot); + } Ok(()) } } diff --git a/src/cmd/backup.rs b/src/cmd/backup.rs index a43a622..fd1d876 100644 --- a/src/cmd/backup.rs +++ b/src/cmd/backup.rs @@ -7,10 +7,12 @@ use log::info; use std::time::SystemTime; use tempfile::NamedTempFile; -pub fn backup(config: &ClientConfig, buffer_size: usize) -> Result<(), ObnamError> { +const SQLITE_CHUNK_SIZE: usize = 1024 * 1024; + +pub fn backup(config: &ClientConfig) -> Result<(), ObnamError> { let runtime = SystemTime::now(); - let run = BackupRun::new(config, buffer_size)?; + let run = BackupRun::new(config)?; // Create a named temporary file. We don't need the open file // handle, so we discard that. @@ -52,7 +54,9 @@ pub fn backup(config: &ClientConfig, buffer_size: usize) -> Result<(), ObnamErro // Upload the SQLite file, i.e., the named temporary file, which // still exists, since we persisted it above. 
- let gen_id = run.client().upload_generation(&newname, buffer_size)?; + let gen_id = run + .client() + .upload_generation(&newname, SQLITE_CHUNK_SIZE)?; println!("status: OK"); println!("duration: {}", runtime.elapsed()?.as_secs()); println!("file-count: {}", file_count); diff --git a/src/index.rs b/src/index.rs index f7300da..9386e73 100644 --- a/src/index.rs +++ b/src/index.rs @@ -75,6 +75,10 @@ impl Index { pub fn find_generations(&self) -> IndexResult<Vec<ChunkId>> { sql::find_generations(&self.conn) } + + pub fn all_chunks(&self) -> IndexResult<Vec<ChunkId>> { + sql::find_chunk_ids(&self.conn) + } } #[cfg(test)] @@ -243,6 +247,17 @@ mod sql { Ok(ids) } + pub fn find_chunk_ids(conn: &Connection) -> IndexResult<Vec<ChunkId>> { + let mut stmt = conn.prepare("SELECT id FROM chunks WHERE generation IS 0")?; + let iter = stmt.query_map(params![], |row| row_to_id(row))?; + let mut ids = vec![]; + for x in iter { + let x = x?; + ids.push(x); + } + Ok(ids) + } + fn row_to_meta(row: &Row) -> rusqlite::Result<ChunkMeta> { let sha256: String = row.get(row.column_index("sha256")?)?; let generation: i32 = row.get(row.column_index("generation")?)?; diff --git a/src/indexedstore.rs b/src/indexedstore.rs index 3f347dd..f2d1831 100644 --- a/src/indexedstore.rs +++ b/src/indexedstore.rs @@ -1,8 +1,9 @@ -use crate::chunk::DataChunk; +use crate::chunk::{DataChunk, GenerationChunk, GenerationChunkError}; use crate::chunkid::ChunkId; use crate::chunkmeta::ChunkMeta; use crate::index::{Index, IndexError}; use crate::store::{Store, StoreError}; +use std::collections::HashSet; use std::path::Path; /// A store for chunks and their metadata. @@ -21,6 +22,9 @@ pub enum IndexedError { #[error(transparent)] IndexError(#[from] IndexError), + #[error(transparent)] + GenerationChunkError(#[from] GenerationChunkError), + /// An error from Store. #[error(transparent)] SqlError(#[from] StoreError), @@ -64,6 +68,28 @@ impl IndexedStore { Ok(self.index.find_generations()?) 
} + pub fn find_file_chunks(&self) -> IndexedResult<Vec<ChunkId>> { + let gen_ids = self.find_generations()?; + + let mut sql_chunks: HashSet<ChunkId> = HashSet::new(); + for id in gen_ids { + let gen_chunk = self.store.load(&id)?; + let gen = GenerationChunk::from_data_chunk(&gen_chunk)?; + for sqlite_chunk_id in gen.chunk_ids() { + sql_chunks.insert(sqlite_chunk_id.clone()); + } + } + + let all_chunk_ids = self.index.all_chunks()?; + let file_chunks = all_chunk_ids + .iter() + .filter(|id| !sql_chunks.contains(id)) + .map(|id| id.clone()) + .collect(); + + Ok(file_chunks) + } + pub fn remove(&mut self, id: &ChunkId) -> IndexedResult<()> { self.index.remove_meta(id)?; self.store.delete(id)?; diff --git a/subplot/data.py b/subplot/data.py index 2a54037..f3faf2b 100644 --- a/subplot/data.py +++ b/subplot/data.py @@ -5,14 +5,17 @@ import random import yaml -def create_file_with_random_data(ctx, filename=None): - N = 128 - data = "".join(chr(random.randint(0, 255)) for i in range(N)).encode("UTF-8") +def create_file_with_given_data(ctx, filename=None, data=None): + logging.debug(f"creating file {filename} with {data!r}") dirname = os.path.dirname(filename) or "." 
- logging.debug(f"create_file_with_random_data: dirname={dirname}") os.makedirs(dirname, exist_ok=True) - with open(filename, "wb") as f: - f.write(data) + open(filename, "wb").write(data.encode("UTF-8")) + + +def create_file_with_random_data(ctx, filename=None): + N = 128 + data = "".join(chr(random.randint(0, 255)) for i in range(N)) + create_file_with_given_data(ctx, filename=filename, data=data) def create_nonutf8_filename(ctx, dirname=None): diff --git a/subplot/data.yaml b/subplot/data.yaml index 6d384b8..9538daa 100644 --- a/subplot/data.yaml +++ b/subplot/data.yaml @@ -1,6 +1,4 @@ -- given: > - a file (?P<filename>\\S+) containing "(?P<data>.*)" - regex: true +- given: a file {filename} containing "{data:text}" function: create_file_with_given_data - given: "a file {filename} containing some random data" diff --git a/subplot/server.py b/subplot/server.py index 289e181..df594f7 100644 --- a/subplot/server.py +++ b/subplot/server.py @@ -5,8 +5,6 @@ import random import re import requests import shutil -import socket -import time import urllib3 import yaml @@ -35,7 +33,9 @@ def start_chunk_server(ctx): "address": f"localhost:{port}", } - server_binary = os.path.abspath(os.path.join(srcdir, "target", "debug", "obnam-server")) + server_binary = os.path.abspath( + os.path.join(srcdir, "target", "debug", "obnam-server") + ) filename = "config.yaml" yaml.safe_dump(config, stream=open(filename, "w")) @@ -44,11 +44,7 @@ def start_chunk_server(ctx): ctx["server_url"] = f"https://{config['address']}" daemon_start_on_port( - ctx, - name="obnam-server", - path=server_binary, - args=filename, - port=port, + ctx, name="obnam-server", path=server_binary, args=filename, port=port ) @@ -138,6 +134,15 @@ def json_body_matches(ctx, wanted=None): assert_eq(body.get(key, "not.there"), wanted[key]) +def server_has_n_file_chunks(ctx, n=None): + assert_eq = globals()["assert_eq"] + n = int(n) + url = f"{ctx['server_url']}/chunks?data=true" + _request(ctx, requests.get, url) + 
num_chunks = len(ctx["http.json"]) + assert_eq(n, num_chunks) + + # Make an HTTP request. def _request(ctx, method, url, headers=None, data=None): r = method(url, headers=headers, data=data, verify=False) diff --git a/subplot/server.yaml b/subplot/server.yaml index 68f8f0c..60f8a44 100644 --- a/subplot/server.yaml +++ b/subplot/server.yaml @@ -43,3 +43,6 @@ - then: "the body matches file {filename}" function: body_matches_file + +- then: "server has {n:int} file chunks" + function: server_has_n_file_chunks |