From 534ad38e3aae8354bd6ede94fe7b1b7259a598ec Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sat, 6 Feb 2021 18:03:49 +0200 Subject: feat: method for listing ids of chunks from file data This will be useful soon, to enable us to check how many chunks from file data there is in the repository, to check that an upcoming chunk size setting works. Also add an API call for returning the ids. Note that all of this is meant for testing only. It may be best to disable it in production builds, eventually. --- src/bin/obnam-server.rs | 2 ++ src/index.rs | 15 +++++++++++++++ src/indexedstore.rs | 28 +++++++++++++++++++++++++++- 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/src/bin/obnam-server.rs b/src/bin/obnam-server.rs index 19f2e99..0e9d4e6 100644 --- a/src/bin/obnam-server.rs +++ b/src/bin/obnam-server.rs @@ -188,6 +188,8 @@ pub async fn search_chunks( } if key == "generation" && value == "true" { store.find_generations().expect("SQL lookup failed") + } else if key == "data" && value == "true" { + store.find_file_chunks().expect("SQL lookup failed") } else if key == "sha256" { store.find_by_sha256(value).expect("SQL lookup failed") } else { diff --git a/src/index.rs b/src/index.rs index f7300da..9386e73 100644 --- a/src/index.rs +++ b/src/index.rs @@ -75,6 +75,10 @@ impl Index { pub fn find_generations(&self) -> IndexResult> { sql::find_generations(&self.conn) } + + pub fn all_chunks(&self) -> IndexResult> { + sql::find_chunk_ids(&self.conn) + } } #[cfg(test)] @@ -243,6 +247,17 @@ mod sql { Ok(ids) } + pub fn find_chunk_ids(conn: &Connection) -> IndexResult> { + let mut stmt = conn.prepare("SELECT id FROM chunks WHERE generation IS 0")?; + let iter = stmt.query_map(params![], |row| row_to_id(row))?; + let mut ids = vec![]; + for x in iter { + let x = x?; + ids.push(x); + } + Ok(ids) + } + fn row_to_meta(row: &Row) -> rusqlite::Result { let sha256: String = row.get(row.column_index("sha256")?)?; let generation: i32 = row.get(row.column_index("generation")?)?; diff --git a/src/indexedstore.rs b/src/indexedstore.rs index 3f347dd..f2d1831 100644 --- a/src/indexedstore.rs +++ b/src/indexedstore.rs @@ -1,8 +1,9 @@ -use crate::chunk::DataChunk; +use crate::chunk::{DataChunk, GenerationChunk, GenerationChunkError}; use crate::chunkid::ChunkId; use crate::chunkmeta::ChunkMeta; use crate::index::{Index, IndexError}; use crate::store::{Store, StoreError}; +use std::collections::HashSet; use std::path::Path; /// A store for chunks and their metadata. @@ -21,6 +22,9 @@ pub enum IndexedError { #[error(transparent)] IndexError(#[from] IndexError), + #[error(transparent)] + GenerationChunkError(#[from] GenerationChunkError), + /// An error from Store. #[error(transparent)] SqlError(#[from] StoreError), @@ -64,6 +68,28 @@ impl IndexedStore { Ok(self.index.find_generations()?) } + pub fn find_file_chunks(&self) -> IndexedResult> { + let gen_ids = self.find_generations()?; + + let mut sql_chunks: HashSet = HashSet::new(); + for id in gen_ids { + let gen_chunk = self.store.load(&id)?; + let gen = GenerationChunk::from_data_chunk(&gen_chunk)?; + for sqlite_chunk_id in gen.chunk_ids() { + sql_chunks.insert(sqlite_chunk_id.clone()); + } + } + + let all_chunk_ids = self.index.all_chunks()?; + let file_chunks = all_chunk_ids + .iter() + .filter(|id| !sql_chunks.contains(id)) + .map(|id| id.clone()) + .collect(); + + Ok(file_chunks) + } + pub fn remove(&mut self, id: &ChunkId) -> IndexedResult<()> { self.index.remove_meta(id)?; self.store.delete(id)?; -- cgit v1.2.1 From f1d1636beeddd56635f248bd0eb2b5841c65f562 Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sat, 6 Feb 2021 18:03:49 +0200 Subject: feat: make client config fields be optional, have defaults We don't want to require the user to have to specify every possible setting in client configuration files. Having reasonable defaults when possible is a better way. --- src/client.rs | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/src/client.rs b/src/client.rs index 7cd6df7..7a4ce21 100644 --- a/src/client.rs +++ b/src/client.rs @@ -17,16 +17,35 @@ use std::fs::File; use std::io::prelude::*; use std::path::{Path, PathBuf}; +const DEFAULT_CHUNK_SIZE: usize = 1024 * 1024; +const DEVNULL: &str = "/dev/null"; + +#[derive(Debug, Serialize, Deserialize, Clone)] +struct TentativeClientConfig { + server_url: String, + verify_tls_cert: Option, + chunk_size: Option, + root: PathBuf, + log: Option, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct ClientConfig { pub server_url: String, pub verify_tls_cert: bool, + pub chunk_size: usize, pub root: PathBuf, - pub log: Option, + pub log: PathBuf, } #[derive(Debug, thiserror::Error)] pub enum ClientConfigError { + #[error("server_url is empty")] + ServerUrlIsEmpty, + + #[error("backup root is unset or empty")] + NoBackupRoot, + #[error("server URL doesn't use https: {0}")] NotHttps(String), @@ -43,15 +62,30 @@ impl ClientConfig { pub fn read_config(filename: &Path) -> ClientConfigResult { trace!("read_config: filename={:?}", filename); let config = std::fs::read_to_string(filename)?; - let config: ClientConfig = serde_yaml::from_str(&config)?; + let tentative: TentativeClientConfig = serde_yaml::from_str(&config)?; + + let config = ClientConfig { + server_url: tentative.server_url, + root: tentative.root, + verify_tls_cert: tentative.verify_tls_cert.or(Some(false)).unwrap(), + chunk_size: tentative.chunk_size.or(Some(DEFAULT_CHUNK_SIZE)).unwrap(), + log: tentative.log.or(Some(PathBuf::from(DEVNULL))).unwrap(), + }; + config.check()?; Ok(config) } fn check(&self) -> Result<(), ClientConfigError> { + if self.server_url.is_empty() { + return Err(ClientConfigError::ServerUrlIsEmpty); + } if !self.server_url.starts_with("https://") { return Err(ClientConfigError::NotHttps(self.server_url.to_string())); } + if self.root.to_string_lossy().is_empty() { + return Err(ClientConfigError::NoBackupRoot); + } Ok(()) } } -- cgit v1.2.1 From ad98db921aa3d710ad7c448c6a1b818f4359d73a Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sat, 6 Feb 2021 18:03:49 +0200 Subject: feat: use the chunk size setting from the client configuration Use the chunk_size setting for file data. For the SQLite file, use a hard-coded size instead. --- src/backup_run.rs | 4 ++-- src/bin/obnam.rs | 8 ++------ src/cmd/backup.rs | 10 +++++++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/backup_run.rs b/src/backup_run.rs index 05d5988..fce9a73 100644 --- a/src/backup_run.rs +++ b/src/backup_run.rs @@ -30,14 +30,14 @@ pub enum BackupError { pub type BackupResult = Result; impl BackupRun { - pub fn new(config: &ClientConfig, buffer_size: usize) -> BackupResult { + pub fn new(config: &ClientConfig) -> BackupResult { let client = BackupClient::new(config)?; let policy = BackupPolicy::new(); let progress = BackupProgress::new(); Ok(Self { client, policy, - buffer_size, + buffer_size: config.chunk_size, progress, }) } diff --git a/src/bin/obnam.rs b/src/bin/obnam.rs index 9c5d3f4..8778a73 100644 --- a/src/bin/obnam.rs +++ b/src/bin/obnam.rs @@ -6,8 +6,6 @@ use obnam::cmd::{backup, get_chunk, list, list_files, restore, show_config, show use std::path::{Path, PathBuf}; use structopt::StructOpt; -const BUFFER_SIZE: usize = 1024 * 1024; - fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); let config_file = match opt.config { @@ -15,15 +13,13 @@ fn main() -> anyhow::Result<()> { Some(ref path) => path.to_path_buf(), }; let config = ClientConfig::read_config(&config_file)?; - if let Some(ref log) = config.log { - setup_logging(&log)?; - } + setup_logging(&config.log)?; info!("client starts"); debug!("{:?}", opt); let result = match opt.cmd { - Command::Backup => backup(&config, BUFFER_SIZE), + Command::Backup => backup(&config), Command::List => list(&config), Command::ShowGeneration { gen_id } => show_generation(&config, &gen_id), Command::ListFiles { gen_id } => list_files(&config, &gen_id), diff --git a/src/cmd/backup.rs b/src/cmd/backup.rs index a43a622..fd1d876 100644 --- a/src/cmd/backup.rs +++ b/src/cmd/backup.rs @@ -7,10 +7,12 @@ use log::info; use std::time::SystemTime; use tempfile::NamedTempFile; -pub fn backup(config: &ClientConfig, buffer_size: usize) -> Result<(), ObnamError> { +const SQLITE_CHUNK_SIZE: usize = 1024 * 1024; + +pub fn backup(config: &ClientConfig) -> Result<(), ObnamError> { let runtime = SystemTime::now(); - let run = BackupRun::new(config, buffer_size)?; + let run = BackupRun::new(config)?; // Create a named temporary file. We don't meed the open file // handle, so we discard that. @@ -52,7 +54,9 @@ pub fn backup(config: &ClientConfig, buffer_size: usize) -> Result<(), ObnamErro // Upload the SQLite file, i.e., the named temporary file, which // still exists, since we persisted it above. - let gen_id = run.client().upload_generation(&newname, buffer_size)?; + let gen_id = run + .client() + .upload_generation(&newname, SQLITE_CHUNK_SIZE)?; println!("status: OK"); println!("duration: {}", runtime.elapsed()?.as_secs()); println!("file-count: {}", file_count); -- cgit v1.2.1 From 61aca01941d3bdf324a207f2b53dbf7128169142 Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sat, 6 Feb 2021 18:03:49 +0200 Subject: test: add scenario for checking chunk-size --- obnam.md | 27 +++++++++++++++++++++++++++ subplot/data.py | 15 +++++++++------ subplot/data.yaml | 4 +--- subplot/server.py | 21 +++++++++++++-------- subplot/server.yaml | 3 +++ 5 files changed, 53 insertions(+), 17 deletions(-) diff --git a/obnam.md b/obnam.md index b41a118..34535a5 100644 --- a/obnam.md +++ b/obnam.md @@ -771,6 +771,10 @@ The server has the following API for managing chunks: * `GET /chunks?sha256=xyzzy` – find chunks on the server whose metadata indicates their contents has a given SHA256 checksum * `GET /chunks?generation=true` – find generation chunks +* `GET /chunks?data=True` – find chunks with file data + - this is meant for testing only + - it excludes generation chunks, and chunks used to store the + generation's SQLite file HTTP status codes are used to indicate if a request succeeded or not, using the customary meanings. @@ -858,6 +862,7 @@ when I POST data.dat to /chunks, with chunk-meta: {"sha256":"abc"} then HTTP status code is 201 and content-type is application/json and the JSON body has a field chunk_id, henceforth ID +and server has 1 file chunks ~~~ We must be able to retrieve it. @@ -1145,6 +1150,28 @@ given a manifest of the directory live restored in rest in rest.yaml then files live.yaml and rest.yaml match ~~~ +## Set chunk size + +This scenario verifies that the user can set the chunk size in the +configuration file. The chunk size only affects the chunks of live +data. + +~~~scenario +given an installed obnam +given a running chunk server +given a client config based on tiny-chunk-size.yaml +given a file live/data.dat containing "abc" +when I run obnam --config tiny-chunk-size.yaml backup +then server has 3 file chunks +~~~ + +~~~{#tiny-chunk-size.yaml .file .yaml .numberLines} +verify_tls_cert: false +root: live +chunk_size: 1 +~~~ + + ## Backup or not for the right reason The decision of whether to back up a file or keep the version in the diff --git a/subplot/data.py b/subplot/data.py index 2a54037..f3faf2b 100644 --- a/subplot/data.py +++ b/subplot/data.py @@ -5,14 +5,17 @@ import random import yaml -def create_file_with_random_data(ctx, filename=None): - N = 128 - data = "".join(chr(random.randint(0, 255)) for i in range(N)).encode("UTF-8") +def create_file_with_given_data(ctx, filename=None, data=None): + logging.debug(f"creating file {filename} with {data!r}") dirname = os.path.dirname(filename) or "." - logging.debug(f"create_file_with_random_data: dirname={dirname}") os.makedirs(dirname, exist_ok=True) - with open(filename, "wb") as f: - f.write(data) + open(filename, "wb").write(data.encode("UTF-8")) + + +def create_file_with_random_data(ctx, filename=None): + N = 128 + data = "".join(chr(random.randint(0, 255)) for i in range(N)) + create_file_with_given_data(ctx, filename=filename, data=data) def create_nonutf8_filename(ctx, dirname=None): diff --git a/subplot/data.yaml b/subplot/data.yaml index 6d384b8..9538daa 100644 --- a/subplot/data.yaml +++ b/subplot/data.yaml @@ -1,6 +1,4 @@ -- given: > - a file (?P\\S+) containing "(?P.*)" - regex: true +- given: a file {filename} containing "{data:text}" function: create_file_with_given_data - given: "a file {filename} containing some random data" diff --git a/subplot/server.py b/subplot/server.py index 289e181..df594f7 100644 --- a/subplot/server.py +++ b/subplot/server.py @@ -5,8 +5,6 @@ import random import re import requests import shutil -import socket -import time import urllib3 import yaml @@ -35,7 +33,9 @@ def start_chunk_server(ctx): "address": f"localhost:{port}", } - server_binary = os.path.abspath(os.path.join(srcdir, "target", "debug", "obnam-server")) + server_binary = os.path.abspath( + os.path.join(srcdir, "target", "debug", "obnam-server") + ) filename = "config.yaml" yaml.safe_dump(config, stream=open(filename, "w")) @@ -44,11 +44,7 @@ def start_chunk_server(ctx): ctx["server_url"] = f"https://{config['address']}" daemon_start_on_port( - ctx, - name="obnam-server", - path=server_binary, - args=filename, - port=port, + ctx, name="obnam-server", path=server_binary, args=filename, port=port ) @@ -138,6 +134,15 @@ def json_body_matches(ctx, wanted=None): assert_eq(body.get(key, "not.there"), wanted[key]) +def server_has_n_file_chunks(ctx, n=None): + assert_eq = globals()["assert_eq"] + n = int(n) + url = f"{ctx['server_url']}/chunks?data=true" + _request(ctx, requests.get, url) + num_chunks = len(ctx["http.json"]) + assert_eq(n, num_chunks) + + # Make an HTTP request. def _request(ctx, method, url, headers=None, data=None): r = method(url, headers=headers, data=data, verify=False) diff --git a/subplot/server.yaml b/subplot/server.yaml index 68f8f0c..60f8a44 100644 --- a/subplot/server.yaml +++ b/subplot/server.yaml @@ -43,3 +43,6 @@ - then: "the body matches file {filename}" function: body_matches_file + +- then: "server has {n:int} file chunks" + function: server_has_n_file_chunks -- cgit v1.2.1