From 9f6ff22ff9a1b0a5a28d037846b1cecee4f2945c Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sat, 9 Apr 2022 08:15:03 +0300 Subject: refactor: add a Literal variant to Checksum We've had fake checksums that are just arbitrary literal strings, and this change makes this more explicit. It's a preliminary change for later support for additional checksum algorithms. Sponsored-by: author --- src/checksummer.rs | 16 ++++++++++++---- src/chunk.rs | 2 +- src/client.rs | 4 +++- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/checksummer.rs b/src/checksummer.rs index 50bce04..6d7303b 100644 --- a/src/checksummer.rs +++ b/src/checksummer.rs @@ -11,11 +11,19 @@ use std::fmt; /// A checksum of some data. #[derive(Debug, Clone)] pub enum Checksum { + /// An arbitrary, literal string. + Literal(String), + /// A SHA256 checksum. Sha256(String), } impl Checksum { + /// Construct a literal string. + pub fn literal(s: &str) -> Self { + Self::Literal(s.to_string()) + } + /// Compute a SHA256 checksum for a block of data. pub fn sha256(data: &[u8]) -> Self { let mut hasher = Sha256::new(); @@ -33,9 +41,9 @@ impl Checksum { impl fmt::Display for Checksum { /// Format a checksum for display. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let hash = match self { - Self::Sha256(hash) => hash, - }; - write!(f, "{}", hash) + match self { + Self::Literal(s) => write!(f, "{}", s), + Self::Sha256(hash) => write!(f, "{}", hash), + } } } diff --git a/src/chunk.rs b/src/chunk.rs index 27a3ab9..df16a98 100644 --- a/src/chunk.rs +++ b/src/chunk.rs @@ -185,7 +185,7 @@ impl ClientTrust { pub fn to_data_chunk(&self) -> Result { let json: String = serde_json::to_string(self).map_err(ClientTrustError::JsonGenerate)?; let bytes = json.as_bytes().to_vec(); - let checksum = Checksum::sha256_from_str_unchecked("client-trust"); + let checksum = Checksum::literal("client-trust"); let meta = ChunkMeta::new(&checksum); Ok(DataChunk::new(bytes, meta)) } diff --git a/src/client.rs b/src/client.rs index c5d66c1..563921d 100644 --- a/src/client.rs +++ b/src/client.rs @@ -1,5 +1,6 @@ //! Client to the Obnam server HTTP API. +use crate::checksummer::Checksum; use crate::chunk::{ ClientTrust, ClientTrustError, DataChunk, GenerationChunk, GenerationChunkError, }; @@ -195,7 +196,8 @@ impl BackupClient { } async fn find_client_trusts(&self) -> Result, ClientError> { - let body = match self.get("", &[("label", "client-trust")]).await { + let label = format!("{}", Checksum::literal("client-trust")); + let body = match self.get("", &[("label", &label)]).await { Ok((_, body)) => body, Err(err) => return Err(err), }; -- cgit v1.2.1 From d9b72ffa5485f3c253da22f09ff0a7090de7aa37 Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sat, 9 Apr 2022 11:27:14 +0300 Subject: refactor: rename Checksum to Label Label is a clearer and more accurate name for the type now that it is not just a checksum. Also, serialize a Label in tests, rather than using string literals. This is more correct, and we'll be changing serialization later. Sponsored-by: author --- src/checksummer.rs | 49 ------------------------------------------------- src/chunk.rs | 6 +++--- src/chunker.rs | 4 ++-- src/chunkid.rs | 6 +++--- src/chunkmeta.rs | 20 ++++++++++---------- src/cipher.rs | 6 +++--- src/client.rs | 4 ++-- src/index.rs | 18 +++++++++--------- src/label.rs | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 2 +- src/server.rs | 4 ++-- 11 files changed, 84 insertions(+), 84 deletions(-) delete mode 100644 src/checksummer.rs create mode 100644 src/label.rs diff --git a/src/checksummer.rs b/src/checksummer.rs deleted file mode 100644 index 6d7303b..0000000 --- a/src/checksummer.rs +++ /dev/null @@ -1,49 +0,0 @@ -//! Compute checksums of data. -//! -//! De-duplication of backed up data in Obnam relies on cryptographic -//! checksums. They are implemented in this module. Note that Obnam -//! does not aim to make these algorithms configurable, so only a very -//! small number of carefully chosen algorithms are supported here. - -use sha2::{Digest, Sha256}; -use std::fmt; - -/// A checksum of some data. -#[derive(Debug, Clone)] -pub enum Checksum { - /// An arbitrary, literal string. - Literal(String), - - /// A SHA256 checksum. - Sha256(String), -} - -impl Checksum { - /// Construct a literal string. - pub fn literal(s: &str) -> Self { - Self::Literal(s.to_string()) - } - - /// Compute a SHA256 checksum for a block of data. - pub fn sha256(data: &[u8]) -> Self { - let mut hasher = Sha256::new(); - hasher.update(data); - let hash = hasher.finalize(); - Self::Sha256(format!("{:x}", hash)) - } - - /// Create a `Checksum` from a known, previously computed hash. - pub fn sha256_from_str_unchecked(hash: &str) -> Self { - Self::Sha256(hash.to_string()) - } -} - -impl fmt::Display for Checksum { - /// Format a checksum for display. - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Literal(s) => write!(f, "{}", s), - Self::Sha256(hash) => write!(f, "{}", hash), - } - } -} diff --git a/src/chunk.rs b/src/chunk.rs index df16a98..4f604b9 100644 --- a/src/chunk.rs +++ b/src/chunk.rs @@ -1,8 +1,8 @@ //! Chunks of data. -use crate::checksummer::Checksum; use crate::chunkid::ChunkId; use crate::chunkmeta::ChunkMeta; +use crate::label::Label; use serde::{Deserialize, Serialize}; use std::default::Default; @@ -97,7 +97,7 @@ impl GenerationChunk { let json: String = serde_json::to_string(self).map_err(GenerationChunkError::JsonGenerate)?; let bytes = json.as_bytes().to_vec(); - let checksum = Checksum::sha256(&bytes); + let checksum = Label::sha256(&bytes); let meta = ChunkMeta::new(&checksum); Ok(DataChunk::new(bytes, meta)) } @@ -185,7 +185,7 @@ impl ClientTrust { pub fn to_data_chunk(&self) -> Result { let json: String = serde_json::to_string(self).map_err(ClientTrustError::JsonGenerate)?; let bytes = json.as_bytes().to_vec(); - let checksum = Checksum::literal("client-trust"); + let checksum = Label::literal("client-trust"); let meta = ChunkMeta::new(&checksum); Ok(DataChunk::new(bytes, meta)) } diff --git a/src/chunker.rs b/src/chunker.rs index 7954621..2394230 100644 --- a/src/chunker.rs +++ b/src/chunker.rs @@ -1,8 +1,8 @@ //! Split file data into chunks. -use crate::checksummer::Checksum; use crate::chunk::DataChunk; use crate::chunkmeta::ChunkMeta; +use crate::label::Label; use std::io::prelude::*; use std::path::{Path, PathBuf}; @@ -54,7 +54,7 @@ impl FileChunks { } let buffer = &self.buf.as_slice()[..used]; - let hash = Checksum::sha256(buffer); + let hash = Label::sha256(buffer); let meta = ChunkMeta::new(&hash); let chunk = DataChunk::new(buffer.to_vec(), meta); Ok(Some(chunk)) diff --git a/src/chunkid.rs b/src/chunkid.rs index 3534627..50fc3d3 100644 --- a/src/chunkid.rs +++ b/src/chunkid.rs @@ -3,7 +3,7 @@ //! Chunk identifiers are chosen by the server. Each chunk has a //! unique identifier, which isn't based on the contents of the chunk. -use crate::checksummer::Checksum; +use crate::label::Label; use rusqlite::types::ToSqlOutput; use rusqlite::ToSql; use serde::{Deserialize, Serialize}; @@ -53,8 +53,8 @@ impl ChunkId { } /// Return the SHA256 checksum of the identifier. - pub fn sha256(&self) -> Checksum { - Checksum::sha256(self.id.as_bytes()) + pub fn sha256(&self) -> Label { + Label::sha256(self.id.as_bytes()) } } diff --git a/src/chunkmeta.rs b/src/chunkmeta.rs index 33c1070..1f591c6 100644 --- a/src/chunkmeta.rs +++ b/src/chunkmeta.rs @@ -1,6 +1,6 @@ //! Metadata about a chunk. -use crate::checksummer::Checksum; +use crate::label::Label; use serde::{Deserialize, Serialize}; use std::default::Default; use std::str::FromStr; @@ -37,9 +37,9 @@ impl ChunkMeta { /// Create a new data chunk. /// /// Data chunks are not for generations. - pub fn new(checksum: &Checksum) -> Self { + pub fn new(label: &Label) -> Self { ChunkMeta { - label: checksum.to_string(), + label: label.to_string(), } } @@ -79,20 +79,20 @@ impl FromStr for ChunkMeta { #[cfg(test)] mod test { - use super::{Checksum, ChunkMeta}; + use super::{ChunkMeta, Label}; #[test] fn new_creates_data_chunk() { - let sum = Checksum::sha256_from_str_unchecked("abcdef"); + let sum = Label::sha256(b"abcdef"); let meta = ChunkMeta::new(&sum); - assert_eq!(meta.label(), "abcdef"); + assert_eq!(meta.label(), &format!("{}", sum)); } #[test] fn new_generation_creates_generation_chunk() { - let sum = Checksum::sha256_from_str_unchecked("abcdef"); + let sum = Label::sha256(b"abcdef"); let meta = ChunkMeta::new(&sum); - assert_eq!(meta.label(), "abcdef"); + assert_eq!(meta.label(), &format!("{}", sum)); } #[test] @@ -113,7 +113,7 @@ mod test { #[test] fn generation_json_roundtrip() { - let sum = Checksum::sha256_from_str_unchecked("abcdef"); + let sum = Label::sha256(b"abcdef"); let meta = ChunkMeta::new(&sum); let json = serde_json::to_string(&meta).unwrap(); let meta2 = serde_json::from_str(&json).unwrap(); @@ -122,7 +122,7 @@ mod test { #[test] fn data_json_roundtrip() { - let sum = Checksum::sha256_from_str_unchecked("abcdef"); + let sum = Label::sha256(b"abcdef"); let meta = ChunkMeta::new(&sum); let json = meta.to_json_vec(); let meta2 = serde_json::from_slice(&json).unwrap(); diff --git a/src/cipher.rs b/src/cipher.rs index ee7fb8f..7bd2e84 100644 --- a/src/cipher.rs +++ b/src/cipher.rs @@ -191,15 +191,15 @@ impl Nonce { #[cfg(test)] mod test { - use crate::checksummer::Checksum; use crate::chunk::DataChunk; use crate::chunkmeta::ChunkMeta; use crate::cipher::{CipherEngine, CipherError, CHUNK_V1, NONCE_SIZE}; + use crate::label::Label; use crate::passwords::Passwords; #[test] fn metadata_as_aad() { - let sum = Checksum::sha256_from_str_unchecked("dummy-checksum"); + let sum = Label::sha256(b"dummy data"); let meta = ChunkMeta::new(&sum); let meta_as_aad = meta.to_json_vec(); let chunk = DataChunk::new("hello".as_bytes().to_vec(), meta); @@ -212,7 +212,7 @@ mod test { #[test] fn round_trip() { - let sum = Checksum::sha256_from_str_unchecked("dummy-checksum"); + let sum = Label::sha256(b"dummy data"); let meta = ChunkMeta::new(&sum); let chunk = DataChunk::new("hello".as_bytes().to_vec(), meta); let pass = Passwords::new("secret"); diff --git a/src/client.rs b/src/client.rs index 563921d..d8bf262 100644 --- a/src/client.rs +++ b/src/client.rs @@ -1,6 +1,5 @@ //! Client to the Obnam server HTTP API. -use crate::checksummer::Checksum; use crate::chunk::{ ClientTrust, ClientTrustError, DataChunk, GenerationChunk, GenerationChunkError, }; @@ -10,6 +9,7 @@ use crate::cipher::{CipherEngine, CipherError}; use crate::config::{ClientConfig, ClientConfigError}; use crate::generation::{FinishedGeneration, GenId, LocalGeneration, LocalGenerationError}; use crate::genlist::GenerationList; +use crate::label::Label; use log::{debug, error, info}; use reqwest::header::HeaderMap; @@ -196,7 +196,7 @@ impl BackupClient { } async fn find_client_trusts(&self) -> Result, ClientError> { - let label = format!("{}", Checksum::literal("client-trust")); + let label = format!("{}", Label::literal("client-trust")); let body = match self.get("", &[("label", &label)]).await { Ok((_, body)) => body, Err(err) => return Err(err), diff --git a/src/index.rs b/src/index.rs index 11f3480..5310a44 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,8 +1,8 @@ //! An on-disk index of chunks for the server. -use crate::checksummer::Checksum; use crate::chunkid::ChunkId; use crate::chunkmeta::ChunkMeta; +use crate::label::Label; use rusqlite::Connection; use std::path::Path; @@ -74,7 +74,7 @@ impl Index { #[cfg(test)] mod test { - use crate::checksummer::Checksum; + use super::Label; use super::{ChunkId, ChunkMeta, Index}; use std::path::Path; @@ -87,20 +87,20 @@ mod test { #[test] fn remembers_inserted() { let id: ChunkId = "id001".parse().unwrap(); - let sum = Checksum::sha256_from_str_unchecked("abc"); + let sum = Label::sha256(b"abc"); let meta = ChunkMeta::new(&sum); let dir = tempdir().unwrap(); let mut idx = new_index(dir.path()); idx.insert_meta(id.clone(), meta.clone()).unwrap(); assert_eq!(idx.get_meta(&id).unwrap(), meta); - let ids = idx.find_by_label("abc").unwrap(); + let ids = idx.find_by_label(&format!("{}", sum)).unwrap(); assert_eq!(ids, vec![id]); } #[test] fn does_not_find_uninserted() { let id: ChunkId = "id001".parse().unwrap(); - let sum = Checksum::sha256_from_str_unchecked("abc"); + let sum = Label::sha256(b"abc"); let meta = ChunkMeta::new(&sum); let dir = tempdir().unwrap(); let mut idx = new_index(dir.path()); @@ -111,19 +111,19 @@ mod test { #[test] fn removes_inserted() { let id: ChunkId = "id001".parse().unwrap(); - let sum = Checksum::sha256_from_str_unchecked("abc"); + let sum = Label::sha256(b"abc"); let meta = ChunkMeta::new(&sum); let dir = tempdir().unwrap(); let mut idx = new_index(dir.path()); idx.insert_meta(id.clone(), meta).unwrap(); idx.remove_meta(&id).unwrap(); - let ids: Vec = idx.find_by_label("abc").unwrap(); + let ids: Vec = idx.find_by_label(&format!("{}", sum)).unwrap(); assert_eq!(ids, vec![]); } } mod sql { - use super::{Checksum, IndexError}; + use super::{IndexError, Label}; use crate::chunkid::ChunkId; use crate::chunkmeta::ChunkMeta; use log::error; @@ -216,7 +216,7 @@ mod sql { fn row_to_meta(row: &Row) -> rusqlite::Result { let hash: String = row.get("label")?; - let sha256 = Checksum::sha256_from_str_unchecked(&hash); + let sha256 = Label::sha256_from_str_unchecked(&hash); Ok(ChunkMeta::new(&sha256)) } diff --git a/src/label.rs b/src/label.rs new file mode 100644 index 0000000..7ee55d1 --- /dev/null +++ b/src/label.rs @@ -0,0 +1,49 @@ +//! A chunk label. +//! +//! De-duplication of backed up data in Obnam relies on cryptographic +//! checksums. They are implemented in this module. Note that Obnam +//! does not aim to make these algorithms configurable, so only a very +//! small number of carefully chosen algorithms are supported here. + +use sha2::{Digest, Sha256}; +use std::fmt; + +/// A checksum of some data. +#[derive(Debug, Clone)] +pub enum Label { + /// An arbitrary, literal string. + Literal(String), + + /// A SHA256 checksum. + Sha256(String), +} + +impl Label { + /// Construct a literal string. + pub fn literal(s: &str) -> Self { + Self::Literal(s.to_string()) + } + + /// Compute a SHA256 checksum for a block of data. + pub fn sha256(data: &[u8]) -> Self { + let mut hasher = Sha256::new(); + hasher.update(data); + let hash = hasher.finalize(); + Self::Sha256(format!("{:x}", hash)) + } + + /// Create a `Checksum` from a known, previously computed hash. + pub fn sha256_from_str_unchecked(hash: &str) -> Self { + Self::Sha256(hash.to_string()) + } +} + +impl fmt::Display for Label { + /// Format a checksum for display. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Literal(s) => write!(f, "{}", s), + Self::Sha256(hash) => write!(f, "{}", hash), + } + } +} diff --git a/src/lib.rs b/src/lib.rs index a0a53c7..fbbea15 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,7 +9,6 @@ pub mod accumulated_time; pub mod backup_progress; pub mod backup_reason; pub mod backup_run; -pub mod checksummer; pub mod chunk; pub mod chunker; pub mod chunkid; @@ -29,6 +28,7 @@ pub mod genlist; pub mod genmeta; pub mod index; pub mod indexedstore; +pub mod label; pub mod passwords; pub mod performance; pub mod policy; diff --git a/src/server.rs b/src/server.rs index 31a03fc..6b688d6 100644 --- a/src/server.rs +++ b/src/server.rs @@ -141,7 +141,7 @@ impl SearchHits { #[cfg(test)] mod test_search_hits { use super::{ChunkMeta, SearchHits}; - use crate::checksummer::Checksum; + use crate::label::Label; #[test] fn no_search_hits() { @@ -152,7 +152,7 @@ mod test_search_hits { #[test] fn one_search_hit() { let id = "abc".parse().unwrap(); - let sum = Checksum::sha256_from_str_unchecked("123"); + let sum = Label::sha256(b"123"); let meta = ChunkMeta::new(&sum); let mut hits = SearchHits::default(); hits.insert(id, meta); -- cgit v1.2.1 From 82ff782fe85c84c10f1f18c9bd5c2b017bc2f240 Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sat, 9 Apr 2022 09:40:59 +0300 Subject: feat! change how chunk labels are serialized Serialized labels now start with a type prefix: a character that says what type of label it is. This isn't strictly required: we _can_ just decide to always use a single type of checksum for all chunks in one backup, for one client, or in the whole repository. However, if it's ever possible to have more than one type, it helps debugging if every checksum, when serialized, is explicit about its type. Change things to use the new serialize method instead of the Display trait for Label. We're primarily serializing labels so they can be stored in a database, and used in URLs, only secondarily showing them to users. Sponsored-by: author --- obnam.md | 20 +++++++++---------- src/chunkmeta.rs | 6 +++--- src/client.rs | 2 +- src/index.rs | 6 +++--- src/label.rs | 59 +++++++++++++++++++++++++++++++++++++++++++++----------- 5 files changed, 65 insertions(+), 28 deletions(-) diff --git a/obnam.md b/obnam.md index a02aacb..381802c 100644 --- a/obnam.md +++ b/obnam.md @@ -1092,7 +1092,7 @@ storage of backed up data. ~~~scenario given a working Obnam system and a file data.dat containing some random data -when I POST data.dat to /v1/chunks, with chunk-meta: {"label":"abc"} +when I POST data.dat to /v1/chunks, with chunk-meta: {"label":"0abc"} then HTTP status code is 201 and content-type is application/json and the JSON body has a field chunk_id, henceforth ID @@ -1105,17 +1105,17 @@ We must be able to retrieve it. when I GET /v1/chunks/ then HTTP status code is 200 and content-type is application/octet-stream -and chunk-meta is {"label":"abc"} +and chunk-meta is {"label":"0abc"} and the body matches file data.dat ~~~ We must also be able to find it based on metadata. ~~~scenario -when I GET /v1/chunks?label=abc +when I GET /v1/chunks?label=0abc then HTTP status code is 200 and content-type is application/json -and the JSON body matches {"":{"label":"abc"}} +and the JSON body matches {"":{"label":"0abc"}} ~~~ Finally, we must be able to delete it. After that, we must not be able @@ -1128,7 +1128,7 @@ then HTTP status code is 200 when I GET /v1/chunks/ then HTTP status code is 404 -when I GET /v1/chunks?label=abc +when I GET /v1/chunks?label=0abc then HTTP status code is 200 and content-type is application/json and the JSON body matches {} @@ -1151,7 +1151,7 @@ We must get an empty result if searching for chunks that don't exist. ~~~scenario given a working Obnam system -when I GET /v1/chunks?label=abc +when I GET /v1/chunks?label=0abc then HTTP status code is 200 and content-type is application/json and the JSON body matches {} @@ -1178,7 +1178,7 @@ First, create a chunk. ~~~scenario given a working Obnam system and a file data.dat containing some random data -when I POST data.dat to /v1/chunks, with chunk-meta: {"label":"abc"} +when I POST data.dat to /v1/chunks, with chunk-meta: {"label":"0abc"} then HTTP status code is 201 and content-type is application/json and the JSON body has a field chunk_id, henceforth ID @@ -1194,10 +1194,10 @@ given a running chunk server Can we still find it by its metadata? ~~~scenario -when I GET /v1/chunks?label=abc +when I GET /v1/chunks?label=0abc then HTTP status code is 200 and content-type is application/json -and the JSON body matches {"":{"label":"abc"}} +and the JSON body matches {"":{"label":"0abc"}} ~~~ Can we still retrieve it by its identifier? @@ -1206,7 +1206,7 @@ Can we still retrieve it by its identifier? when I GET /v1/chunks/ then HTTP status code is 200 and content-type is application/octet-stream -and chunk-meta is {"label":"abc"} +and chunk-meta is {"label":"0abc"} and the body matches file data.dat ~~~ diff --git a/src/chunkmeta.rs b/src/chunkmeta.rs index 1f591c6..fe7ef4c 100644 --- a/src/chunkmeta.rs +++ b/src/chunkmeta.rs @@ -39,7 +39,7 @@ impl ChunkMeta { /// Data chunks are not for generations. pub fn new(label: &Label) -> Self { ChunkMeta { - label: label.to_string(), + label: label.serialize(), } } @@ -85,14 +85,14 @@ mod test { fn new_creates_data_chunk() { let sum = Label::sha256(b"abcdef"); let meta = ChunkMeta::new(&sum); - assert_eq!(meta.label(), &format!("{}", sum)); + assert_eq!(meta.label(), sum.serialize()); } #[test] fn new_generation_creates_generation_chunk() { let sum = Label::sha256(b"abcdef"); let meta = ChunkMeta::new(&sum); - assert_eq!(meta.label(), &format!("{}", sum)); + assert_eq!(meta.label(), sum.serialize()); } #[test] diff --git a/src/client.rs b/src/client.rs index d8bf262..bed5f1e 100644 --- a/src/client.rs +++ b/src/client.rs @@ -196,7 +196,7 @@ impl BackupClient { } async fn find_client_trusts(&self) -> Result, ClientError> { - let label = format!("{}", Label::literal("client-trust")); + let label = Label::literal("client-trust").serialize(); let body = match self.get("", &[("label", &label)]).await { Ok((_, body)) => body, Err(err) => return Err(err), diff --git a/src/index.rs b/src/index.rs index 5310a44..52da2f2 100644 --- a/src/index.rs +++ b/src/index.rs @@ -93,7 +93,7 @@ mod test { let mut idx = new_index(dir.path()); idx.insert_meta(id.clone(), meta.clone()).unwrap(); assert_eq!(idx.get_meta(&id).unwrap(), meta); - let ids = idx.find_by_label(&format!("{}", sum)).unwrap(); + let ids = idx.find_by_label(&sum.serialize()).unwrap(); assert_eq!(ids, vec![id]); } @@ -117,7 +117,7 @@ mod test { let mut idx = new_index(dir.path()); idx.insert_meta(id.clone(), meta).unwrap(); idx.remove_meta(&id).unwrap(); - let ids: Vec = idx.find_by_label(&format!("{}", sum)).unwrap(); + let ids: Vec = idx.find_by_label(&sum.serialize()).unwrap(); assert_eq!(ids, vec![]); } } @@ -216,7 +216,7 @@ mod sql { fn row_to_meta(row: &Row) -> rusqlite::Result { let hash: String = row.get("label")?; - let sha256 = Label::sha256_from_str_unchecked(&hash); + let sha256 = Label::deserialize(&hash).expect("deserialize checksum from database"); Ok(ChunkMeta::new(&sha256)) } diff --git a/src/label.rs b/src/label.rs index 7ee55d1..64be341 100644 --- a/src/label.rs +++ b/src/label.rs @@ -6,7 +6,9 @@ //! small number of carefully chosen algorithms are supported here. use sha2::{Digest, Sha256}; -use std::fmt; + +const LITERAL: char = '0'; +const SHA256: char = '1'; /// A checksum of some data. #[derive(Debug, Clone)] @@ -32,18 +34,53 @@ impl Label { Self::Sha256(format!("{:x}", hash)) } - /// Create a `Checksum` from a known, previously computed hash. - pub fn sha256_from_str_unchecked(hash: &str) -> Self { - Self::Sha256(hash.to_string()) + /// Serialize a label into a string representation. + pub fn serialize(&self) -> String { + match self { + Self::Literal(s) => format!("{}{}", LITERAL, s), + Self::Sha256(hash) => format!("{}{}", SHA256, hash), + } } -} -impl fmt::Display for Label { - /// Format a checksum for display. - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Literal(s) => write!(f, "{}", s), - Self::Sha256(hash) => write!(f, "{}", hash), + /// De-serialize a label from its string representation. + pub fn deserialize(s: &str) -> Result { + if s.starts_with(LITERAL) { + Ok(Self::Literal(s[1..].to_string())) + } else if s.starts_with(SHA256) { + Ok(Self::Sha256(s[1..].to_string())) + } else { + Err(LabelError::UnknownType(s.to_string())) } } } + +/// Possible errors from dealing with chunk labels. +#[derive(Debug, thiserror::Error)] +pub enum LabelError { + /// Serialized label didn't start with a known type prefix. + #[error("Unknown label: {0:?}")] + UnknownType(String), +} + +#[cfg(test)] +mod test { + use super::Label; + + #[test] + fn roundtrip_literal() { + let label = Label::literal("dummy data"); + let serialized = label.serialize(); + let de = Label::deserialize(&serialized).unwrap(); + let seri2 = de.serialize(); + assert_eq!(serialized, seri2); + } + + #[test] + fn roundtrip_sha256() { + let label = Label::sha256(b"dummy data"); + let serialized = label.serialize(); + let de = Label::deserialize(&serialized).unwrap(); + let seri2 = de.serialize(); + assert_eq!(serialized, seri2); + } +} -- cgit v1.2.1 From 18c0f4afab29e17c050208234becbfb5e2973746 Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Sat, 9 Apr 2022 12:00:29 +0300 Subject: feat: use one checksum for all chunks in a backup When making a backup, use the same checksum for any chunks it re-uses or creates. This is for performance: if we allowed two checksums to be used, we would have to compute the checksum for a chunk twice, and potentially look up both on the server. This is just a lot of work. Instead, we use only one. The trade-off here is that when (not if) the user wants to switch to a new checksum type, they'll have to do a full backup, uploading all their data to the server, even when it's already there, just with a different checksum. Hopefully this will be rare. Full backups always use the built-in, hardcoded default checksum, and incremental backups use whatever the previous backup used. The default is still SHA256, but this commit add code to support BLAKE2 if we decide to switch that as a default. It's also easy to add support for others, now. BLAKE2 was added to verify that Obnam can actually handle the checksum changing (manual test: not in the test suite). I don't think users need to be offered even the option of choosing a checksum algorithm to use. When one cares about both security and performance, choosing a checksum requires specialist, expert knowledge. Obnam developers should choose the default. Giving users a knob they can twiddle just makes it that much harder to configure and use Obnam. If the choice Obnam developers have made is shown to be sub-optimal, it seems better to change the default for everyone, rather than hope that every user changes their configuration to gain the benefit. Experience has shown that people mostly don't change the default configuration, and that they are especially bad at choosing well when security is a concern. (Obnam is free software. Expert users can choose their checksum by changing the source code. I'm not fundamentally limiting anyone's freedom or choice here.) Users can switch to a new default algorithm by triggering a full backup with the new "obnam backup --full". Sponsored-by: author --- Cargo.lock | 10 ++++++++++ Cargo.toml | 1 + obnam.md | 4 +++- src/backup_run.rs | 22 +++++++++++++++++++--- src/chunker.rs | 16 +++++++++++++--- src/cmd/backup.rs | 52 +++++++++++++++++++++++++++++++--------------------- src/dbgen.rs | 40 ++++++++++++++++++++++++++++++++-------- src/error.rs | 5 +++++ src/generation.rs | 17 ++++++++++++----- src/genmeta.rs | 5 +++++ src/label.rs | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 11 files changed, 184 insertions(+), 42 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9dafc07..7715bf3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -113,6 +113,15 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "blake2" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9cf849ee05b2ee5fba5e36f97ff8ec2533916700fc0758d40d92136a42f3388" +dependencies = [ + "digest 0.10.3", +] + [[package]] name = "block-buffer" version = "0.9.0" @@ -977,6 +986,7 @@ version = "0.7.1" dependencies = [ "aes-gcm", "anyhow", + "blake2", "bytesize", "chrono", "directories-next", diff --git a/Cargo.toml b/Cargo.toml index 616a34a..36f1682 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ rust-version = "1.56.0" [dependencies] aes-gcm = "0.9" anyhow = "1" +blake2 = "0.10.4" bytesize = "1" chrono = "0.4" directories-next = "2" diff --git a/obnam.md b/obnam.md index 381802c..a6f9b4c 100644 --- a/obnam.md +++ b/obnam.md @@ -1901,7 +1901,9 @@ then stdout, as JSON, has all the values in file geninfo.json "major": 0, "minor": 0 }, - "extras": {} + "extras": { + "checksum_kind": "sha256" + } } ~~~ diff --git a/src/backup_run.rs b/src/backup_run.rs index 29e82fc..2418871 100644 --- a/src/backup_run.rs +++ b/src/backup_run.rs @@ -15,6 +15,7 @@ use crate::fsiter::{AnnotatedFsEntry, FsIterError, FsIterator}; use crate::generation::{ GenId, LocalGeneration, LocalGenerationError, NascentError, NascentGeneration, }; +use crate::label::LabelChecksumKind; use crate::performance::{Clock, Performance}; use crate::policy::BackupPolicy; use crate::schema::SchemaVersion; @@ -24,10 +25,12 @@ use chrono::{DateTime, Local}; use log::{debug, error, info, warn}; use std::path::{Path, PathBuf}; +const DEFAULT_CHECKSUM_KIND: LabelChecksumKind = LabelChecksumKind::Sha256; const SQLITE_CHUNK_SIZE: usize = MIB as usize; /// A running backup. pub struct BackupRun<'a> { + checksum_kind: Option, client: &'a BackupClient, policy: BackupPolicy, buffer_size: usize, @@ -105,6 +108,7 @@ impl<'a> BackupRun<'a> { /// Create a new run for an initial backup. pub fn initial(config: &ClientConfig, client: &'a BackupClient) -> Result { Ok(Self { + checksum_kind: Some(DEFAULT_CHECKSUM_KIND), client, policy: BackupPolicy::default(), buffer_size: config.chunk_size, @@ -118,6 +122,7 @@ impl<'a> BackupRun<'a> { client: &'a BackupClient, ) -> Result { Ok(Self { + checksum_kind: None, client, policy: BackupPolicy::default(), buffer_size: config.chunk_size, @@ -136,7 +141,7 @@ impl<'a> BackupRun<'a> { None => { // Create a new, empty generation. let schema = schema_version(DEFAULT_SCHEMA_MAJOR).unwrap(); - NascentGeneration::create(oldname, schema)?.close()?; + NascentGeneration::create(oldname, schema, self.checksum_kind.unwrap())?.close()?; // Open the newly created empty generation. Ok(LocalGeneration::open(oldname)?) @@ -146,6 +151,11 @@ impl<'a> BackupRun<'a> { let old = self.fetch_previous_generation(genid, oldname).await?; perf.stop(Clock::GenerationDownload); + let meta = old.meta()?; + if let Some(v) = meta.get("checksum_kind") { + self.checksum_kind = Some(LabelChecksumKind::from(v)?); + } + let progress = BackupProgress::incremental(); progress.files_in_previous_generation(old.file_count()? as u64); self.progress = Some(progress); @@ -155,6 +165,12 @@ impl<'a> BackupRun<'a> { } } + fn checksum_kind(&self) -> LabelChecksumKind { + self.checksum_kind + .or(Some(LabelChecksumKind::Sha256)) + .unwrap() + } + async fn fetch_previous_generation( &self, genid: &GenId, @@ -185,7 +201,7 @@ impl<'a> BackupRun<'a> { let mut warnings: Vec = vec![]; let mut new_cachedir_tags = vec![]; let files_count = { - let mut new = NascentGeneration::create(newpath, schema)?; + let mut new = NascentGeneration::create(newpath, schema, self.checksum_kind.unwrap())?; for root in &config.roots { match self.backup_one_root(config, old, &mut new, root).await { Ok(mut o) => { @@ -378,7 +394,7 @@ impl<'a> BackupRun<'a> { let mut chunk_ids = vec![]; let file = std::fs::File::open(filename) .map_err(|err| ClientError::FileOpen(filename.to_path_buf(), err))?; - let chunker = FileChunks::new(size, file, filename); + let chunker = FileChunks::new(size, file, filename, self.checksum_kind()); for item in chunker { let chunk = item?; if let Some(chunk_id) = self.client.has_chunk(chunk.meta()).await? { diff --git a/src/chunker.rs b/src/chunker.rs index 2394230..29f8a90 100644 --- a/src/chunker.rs +++ b/src/chunker.rs @@ -2,13 +2,14 @@ use crate::chunk::DataChunk; use crate::chunkmeta::ChunkMeta; -use crate::label::Label; +use crate::label::{Label, LabelChecksumKind}; use std::io::prelude::*; use std::path::{Path, PathBuf}; /// Iterator over chunks in a file. pub struct FileChunks { chunk_size: usize, + kind: LabelChecksumKind, buf: Vec, filename: PathBuf, handle: std::fs::File, @@ -24,11 +25,17 @@ pub enum ChunkerError { impl FileChunks { /// Create new iterator. - pub fn new(chunk_size: usize, handle: std::fs::File, filename: &Path) -> Self { + pub fn new( + chunk_size: usize, + handle: std::fs::File, + filename: &Path, + kind: LabelChecksumKind, + ) -> Self { let mut buf = vec![]; buf.resize(chunk_size, 0); Self { chunk_size, + kind, buf, handle, filename: filename.to_path_buf(), @@ -54,7 +61,10 @@ impl FileChunks { } let buffer = &self.buf.as_slice()[..used]; - let hash = Label::sha256(buffer); + let hash = match self.kind { + LabelChecksumKind::Blake2 => Label::blake2(buffer), + LabelChecksumKind::Sha256 => Label::sha256(buffer), + }; let meta = ChunkMeta::new(&hash); let chunk = DataChunk::new(buffer.to_vec(), meta); Ok(Some(chunk)) diff --git a/src/cmd/backup.rs b/src/cmd/backup.rs index 8a85703..60045cc 100644 --- a/src/cmd/backup.rs +++ b/src/cmd/backup.rs @@ -19,7 +19,11 @@ use tokio::runtime::Runtime; /// Make a backup. #[derive(Debug, StructOpt)] pub struct Backup { - /// Backup schema major version. + /// Force a full backup, instead of an incremental one. + #[structopt(long)] + full: bool, + + /// Backup schema major version to use. #[structopt(long)] backup_version: Option, } @@ -53,29 +57,35 @@ impl Backup { let oldtemp = temp.path().join("old.db"); let newtemp = temp.path().join("new.db"); - let (is_incremental, outcome) = match genlist.resolve("latest") { - Err(_) => { - info!("fresh backup without a previous generation"); - let mut run = BackupRun::initial(config, &client)?; - let old = run.start(None, &oldtemp, perf).await?; - ( - false, - run.backup_roots(config, &old, &newtemp, schema, perf) - .await?, - ) - } - Ok(old_id) => { - info!("incremental backup based on {}", old_id); - let mut run = BackupRun::incremental(config, &client)?; - let old = run.start(Some(&old_id), &oldtemp, perf).await?; - ( - true, - run.backup_roots(config, &old, &newtemp, schema, perf) - .await?, - ) + let old_id = if self.full { + None + } else { + match genlist.resolve("latest") { + Err(_) => None, + Ok(old_id) => Some(old_id), } }; + let (is_incremental, outcome) = if let Some(old_id) = old_id { + info!("incremental backup based on {}", old_id); + let mut run = BackupRun::incremental(config, &client)?; + let old = run.start(Some(&old_id), &oldtemp, perf).await?; + ( + true, + run.backup_roots(config, &old, &newtemp, schema, perf) + .await?, + ) + } else { + info!("fresh backup without a previous generation"); + let mut run = BackupRun::initial(config, &client)?; + let old = run.start(None, &oldtemp, perf).await?; + ( + false, + run.backup_roots(config, &old, &newtemp, schema, perf) + .await?, + ) + }; + perf.start(Clock::GenerationUpload); let mut trust = trust; trust.append_backup(outcome.gen_id.as_chunk_id()); diff --git a/src/dbgen.rs b/src/dbgen.rs index 816ea11..8e5ece5 100644 --- a/src/dbgen.rs +++ b/src/dbgen.rs @@ -5,6 +5,7 @@ use crate::chunkid::ChunkId; use crate::db::{Column, Database, DatabaseError, SqlResults, Table, Value}; use crate::fsentry::FilesystemEntry; use crate::genmeta::{GenerationMeta, GenerationMetaError}; +use crate::label::LabelChecksumKind; use crate::schema::{SchemaVersion, VersionComponent}; use log::error; use std::collections::HashMap; @@ -90,14 +91,15 @@ impl GenerationDb { pub fn create>( filename: P, schema: SchemaVersion, + checksum_kind: LabelChecksumKind, ) -> Result { let meta_table = Self::meta_table(); let variant = match schema.version() { (V0_0::MAJOR, V0_0::MINOR) => { - GenerationDbVariant::V0_0(V0_0::create(filename, meta_table)?) + GenerationDbVariant::V0_0(V0_0::create(filename, meta_table, checksum_kind)?) } (V1_0::MAJOR, V1_0::MINOR) => { - GenerationDbVariant::V1_0(V1_0::create(filename, meta_table)?) + GenerationDbVariant::V1_0(V1_0::create(filename, meta_table, checksum_kind)?) } (major, minor) => return Err(GenerationDbError::Incompatible(major, minor)), }; @@ -240,11 +242,15 @@ impl V0_0 { const MINOR: VersionComponent = 0; /// Create a new generation database in read/write mode. - pub fn create>(filename: P, meta: Table) -> Result { + pub fn create>( + filename: P, + meta: Table, + checksum_kind: LabelChecksumKind, + ) -> Result { let db = Database::create(filename.as_ref())?; let mut moi = Self::new(db, meta); moi.created = true; - moi.create_tables()?; + moi.create_tables(checksum_kind)?; Ok(moi) } @@ -276,7 +282,7 @@ impl V0_0 { } } - fn create_tables(&mut self) -> Result<(), GenerationDbError> { + fn create_tables(&mut self, checksum_kind: LabelChecksumKind) -> Result<(), GenerationDbError> { self.db.create_table(&self.meta)?; self.db.create_table(&self.files)?; self.db.create_table(&self.chunks)?; @@ -295,6 +301,13 @@ impl V0_0 { Value::text("value", &format!("{}", Self::MINOR)), ], )?; + self.db.insert( + &self.meta, + &[ + Value::text("key", "checksum_kind"), + Value::text("value", checksum_kind.serialize()), + ], + )?; Ok(()) } @@ -483,11 +496,15 @@ impl V1_0 { const MINOR: VersionComponent = 0; /// Create a new generation database in read/write mode. - pub fn create>(filename: P, meta: Table) -> Result { + pub fn create>( + filename: P, + meta: Table, + checksum_kind: LabelChecksumKind, + ) -> Result { let db = Database::create(filename.as_ref())?; let mut moi = Self::new(db, meta); moi.created = true; - moi.create_tables()?; + moi.create_tables(checksum_kind)?; Ok(moi) } @@ -519,7 +536,7 @@ impl V1_0 { } } - fn create_tables(&mut self) -> Result<(), GenerationDbError> { + fn create_tables(&mut self, checksum_kind: LabelChecksumKind) -> Result<(), GenerationDbError> { self.db.create_table(&self.meta)?; self.db.create_table(&self.files)?; self.db.create_table(&self.chunks)?; @@ -538,6 +555,13 @@ impl V1_0 { Value::text("value", &format!("{}", Self::MINOR)), ], )?; + self.db.insert( + &self.meta, + &[ + Value::text("key", "checksum_kind"), + Value::text("value", checksum_kind.serialize()), + ], + )?; Ok(()) } diff --git a/src/error.rs b/src/error.rs index 9c9b432..928f258 100644 --- a/src/error.rs +++ b/src/error.rs @@ -10,6 +10,7 @@ use crate::db::DatabaseError; use crate::dbgen::GenerationDbError; use crate::generation::{LocalGenerationError, NascentError}; use crate::genlist::GenerationListError; +use crate::label::LabelError; use crate::passwords::PasswordError; use std::path::PathBuf; use std::time::SystemTimeError; @@ -22,6 +23,10 @@ use tempfile::PersistError; /// convenience. #[derive(Debug, thiserror::Error)] pub enum ObnamError { + /// Error from chunk labels. + #[error(transparent)] + Label(#[from] LabelError), + /// Error listing generations on server. #[error(transparent)] GenerationListError(#[from] GenerationListError), diff --git a/src/generation.rs b/src/generation.rs index 715b426..0a0fc77 100644 --- a/src/generation.rs +++ b/src/generation.rs @@ -6,6 +6,7 @@ use crate::db::{DatabaseError, SqlResults}; use crate::dbgen::{FileId, GenerationDb, GenerationDbError}; use crate::fsentry::FilesystemEntry; use crate::genmeta::{GenerationMeta, GenerationMetaError}; +use crate::label::LabelChecksumKind; use crate::schema::{SchemaVersion, VersionComponent}; use std::fmt; use std::path::{Path, PathBuf}; @@ -77,11 +78,15 @@ pub enum NascentError { impl NascentGeneration { /// Create a new nascent generation. - pub fn create

(filename: P, schema: SchemaVersion) -> Result + pub fn create

( + filename: P, + schema: SchemaVersion, + checksum_kind: LabelChecksumKind, + ) -> Result where P: AsRef, { - let db = GenerationDb::create(filename.as_ref(), schema)?; + let db = GenerationDb::create(filename.as_ref(), schema, checksum_kind)?; Ok(Self { db, fileno: 0 }) } @@ -290,7 +295,7 @@ impl LocalGeneration { #[cfg(test)] mod test { - use super::{LocalGeneration, NascentGeneration, SchemaVersion}; + use super::{LabelChecksumKind, LocalGeneration, NascentGeneration, SchemaVersion}; use tempfile::NamedTempFile; #[test] @@ -298,7 +303,8 @@ mod test { let filename = NamedTempFile::new().unwrap().path().to_path_buf(); let schema = SchemaVersion::new(0, 0); { - let mut _gen = NascentGeneration::create(&filename, schema).unwrap(); + let mut _gen = + NascentGeneration::create(&filename, schema, LabelChecksumKind::Sha256).unwrap(); // _gen is dropped here; the connection is close; the file // should not be removed. } @@ -328,7 +334,8 @@ mod test { let tag_path2 = Path::new("/another_dir/a_tag"); let schema = SchemaVersion::new(0, 0); - let mut gen = NascentGeneration::create(&dbfile, schema).unwrap(); + let mut gen = + NascentGeneration::create(&dbfile, schema, LabelChecksumKind::Sha256).unwrap(); let mut cache = users::UsersCache::new(); gen.insert( diff --git a/src/genmeta.rs b/src/genmeta.rs index 2ce4c4c..d5b14a3 100644 --- a/src/genmeta.rs +++ b/src/genmeta.rs @@ -26,6 +26,11 @@ impl GenerationMeta { pub fn schema_version(&self) -> SchemaVersion { self.schema_version } + + /// Get a value corresponding to a key in the meta table. + pub fn get(&self, key: &str) -> Option<&String> { + self.extras.get(key) + } } fn metastr(map: &mut HashMap, key: &str) -> Result { diff --git a/src/label.rs b/src/label.rs index 64be341..19d270a 100644 --- a/src/label.rs +++ b/src/label.rs @@ -5,10 +5,12 @@ //! does not aim to make these algorithms configurable, so only a very //! small number of carefully chosen algorithms are supported here. +use blake2::Blake2s256; use sha2::{Digest, Sha256}; const LITERAL: char = '0'; const SHA256: char = '1'; +const BLAKE2: char = '2'; /// A checksum of some data. #[derive(Debug, Clone)] @@ -18,6 +20,9 @@ pub enum Label { /// A SHA256 checksum. Sha256(String), + + /// A BLAKE2s checksum. + Blake2(String), } impl Label { @@ -34,11 +39,20 @@ impl Label { Self::Sha256(format!("{:x}", hash)) } + /// Compute a BLAKE2s checksum for a block of data. + pub fn blake2(data: &[u8]) -> Self { + let mut hasher = Blake2s256::new(); + hasher.update(data); + let hash = hasher.finalize(); + Self::Sha256(format!("{:x}", hash)) + } + /// Serialize a label into a string representation. pub fn serialize(&self) -> String { match self { Self::Literal(s) => format!("{}{}", LITERAL, s), Self::Sha256(hash) => format!("{}{}", SHA256, hash), + Self::Blake2(hash) => format!("{}{}", BLAKE2, hash), } } @@ -54,6 +68,37 @@ impl Label { } } +/// Kinds of checksum labels. +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub enum LabelChecksumKind { + /// Use a Blake2 checksum. + Blake2, + + /// Use a SHA256 checksum. + Sha256, +} + +impl LabelChecksumKind { + /// Parse a string into a label checksum kind. + pub fn from(s: &str) -> Result { + if s == "sha256" { + Ok(Self::Sha256) + } else if s == "blake2" { + Ok(Self::Blake2) + } else { + Err(LabelError::UnknownType(s.to_string())) + } + } + + /// Serialize a checksum kind into a string. + pub fn serialize(self) -> &'static str { + match self { + Self::Sha256 => "sha256", + Self::Blake2 => "blake2", + } + } +} + /// Possible errors from dealing with chunk labels. #[derive(Debug, thiserror::Error)] pub enum LabelError { @@ -64,7 +109,7 @@ pub enum LabelError { #[cfg(test)] mod test { - use super::Label; + use super::{Label, LabelChecksumKind}; #[test] fn roundtrip_literal() { @@ -83,4 +128,11 @@ mod test { let seri2 = de.serialize(); assert_eq!(serialized, seri2); } + + #[test] + fn roundtrip_checksum_kind() { + for kind in [LabelChecksumKind::Sha256, LabelChecksumKind::Blake2] { + assert_eq!(LabelChecksumKind::from(kind.serialize()).unwrap(), kind); + } + } } -- cgit v1.2.1