From ce43013f143adc3251fd4d797060b385d5c2506d Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Fri, 21 May 2021 18:50:25 +0300 Subject: add backup talk --- 2021-05-24-backup-tech.md | 263 ++++++++++++++++++++++++++++++++++++++++++++++ Makefile | 2 +- 2 files changed, 264 insertions(+), 1 deletion(-) create mode 100644 2021-05-24-backup-tech.md diff --git a/2021-05-24-backup-tech.md b/2021-05-24-backup-tech.md new file mode 100644 index 0000000..5f09c93 --- /dev/null +++ b/2021-05-24-backup-tech.md @@ -0,0 +1,263 @@ +![](Box.jpg) + +Broc, [`commons.wikimedia.org/wiki/File:Cat_into_the_box.jpg`](https://commons.wikimedia.org/wiki/File:Cat_into_the_box.jpg) + + +----------------------------------------------------------------------------- + +Backups are easy. + +~~~dot +digraph "" { + +data [label="Data" shape=cylinder]; +backup [label="Backup" shape=cylinder]; + +data -> backup [label="copy"]; + +} +~~~ + +----------------------------------------------------------------------------- + + +The end. That's all you need to know about backups. + +----------------------------------------------------------------------------- + +OK, so there's a little more to it. + +----------------------------------------------------------------------------- + +# Challenges + +* Full copy takes too long if you do it every time. **Incremental** + backups only copy what's changed. + +* A sequence of backups over time would be nice, but take up a lot of + space without **de-duplication**. + +* Storage is expensive, so **compression** would be nice. + +* Backups can be stolen, so it'd be nice for them to be **encrypted**. + +* Backups can be tampered with, so it'd be nice for them to be + **authenticated** (digitally signed). Otherwise you don't know if + you can trust the data you get when restoring. + +----------------------------------------------------------------------------- + +# Incremental backups + +Compare live data against most recent backup. 
If file hasn't +changed, assume the previous copy is still good and use that instead +of making a new copy. + +- some assumptions are necessary here to get performance: simplified: + if file size is the same and file modification time is the same, the + file content probably hasn't changed + +- not having to read and store every file every time is a huge time + saver + +- not having to store every file every time saves space, but that's + less important than time savings + +----------------------------------------------------------------------------- + +# De-duplication + +Store each distinct bit of data only once, across all backups. + +Whole files is easy, but unsatisfactory. + +Split files into smaller chunks for better performance, at some +expense of more bookkeeping overhead. Store each distinct chunk +separately. + +Splitting by size (8 KiB?) is easy. Split using a rolling checksum for +handling inserted data, e.g., identical attachments in email spools. +Chunk ends when checksum has N lowest bits zero, or when it reaches a +certain size. + +----------------------------------------------------------------------------- + +# Compare chunks + +Strong cryptographic checksum: SHA256? + +- when do collisions matter? + +Bit by bit. + +- may have bad performance + +----------------------------------------------------------------------------- + +# Compression + +Lossless compression is easy. + +Low compression → really fast (faster than disk I/O). + +Higher compression → more CPU, takes more time. + +Lossy compression → replace every picture with the "Cat in a box" + +----------------------------------------------------------------------------- + +# Encryption and authentication + +Authenticated encryption. Roughly: encrypt data with one key, then +compute strong hash of ciphertext, and encrypt that with second key, +store both ciphertexts. + +When restoring, decrypt encrypted hash, compare that to hash of +encrypted data, and only if they match, decrypt data. 
+ +If stored chunk has been modified in any way, it's detected. + +(Read details: +[`en.wikipedia.org/wiki/Authenticated_encryption`](https://en.wikipedia.org/wiki/Authenticated_encryption)) + +----------------------------------------------------------------------------- + +# Error correction + +Detecting errors isn't enough. Sometimes backups deteriorate. Error +correcting codes would help. Or store each backup multiple times a la +RAID. + + +----------------------------------------------------------------------------- + +~~~dot +digraph "" { + +file1 [label="File 1" shape=tab]; +file2 [label="File 2" shape=tab]; +file3 [label="File 3" shape=tab]; + +chunk1 [label="Chunk 1" shape=box]; +chunk2 [label="Chunk 2" shape=box]; + +backup [label="Backup process"]; + +backup1 [label="Backup 1: \n file 1, file 2, file 3" shape=note]; +backup2 [label="Backup 2: \n file 1, file 2, file 3" shape=note]; + +file1 -> backup; +file2 -> backup; +file3 -> backup; + +backup -> backup1; +backup -> backup2; + +backup1 -> chunk1; +backup1 -> chunk2; + +backup2 -> chunk1; +backup2 -> chunk2; + +} +~~~ + + +----------------------------------------------------------------------------- + +# Summary + +Backups are easy. + +~~~dot +digraph "" { + +data [label="Data" shape=cylinder]; +backup [label="Backup" shape=cylinder]; + +data -> backup [label="run tool"]; + +} +~~~ + +The end. That's all you need to know about backups. + +----------------------------------------------------------------------------- + +# Backups could be more easy + +Backup tools need to be installed, configured, and used. Backup +storage needs to be arranged. + +What if backups just happen without you needing to do anything? 
+ +~~~dot +digraph "" { + +data [label="Data" shape=cylinder]; +backup [label="Backup" shape=cylinder]; + +data -> backup [label="just happens"]; + +} +~~~ + +----------------------------------------------------------------------------- + +# My current crazy idea: Peer to peer backups + + +Safe, secure, without having to provide separate backup space. + +Backup software comes as part of the operating system by default, and +runs automatically, when data changes, in a way that doesn't bother +the user. (Feasible for Linux distributions. The rest of the world +will follow.) + +You allow others to store some of their backups on your computer, and +they let you store some of yours on theirs. + +Network effect: this works better the more people use it. + +(Some details need to be sorted out.) + +----------------------------------------------------------------------------- + +~~~dot +digraph "" { + +computer1 [label="Alice" shape=cylinder]; +computer2 [label="Bob" shape=cylinder]; +computer3 [label="Charlie" shape=cylinder]; + +computer1 -> computer2; +computer1 -> computer3; + +computer2 -> computer1; +computer2 -> computer3; + +computer3 -> computer2; + + +} +~~~ + + +----------------------------------------------------------------------------- + +# Legalese + +Copyright 2021 Lars Wirzenius + +This content is licensed under the Creative Commons +Attribution-ShareAlike 4.0 International ([CC BY-SA 4.0][]) licence. + +[CC BY-SA 4.0]: https://creativecommons.org/licenses/by-sa/4.0/ + + +--- +title: "Introduction to backup technology" +subtitle: "An opinionated view" +author: "Lars Wirzenius" +date: "2021-05-24" +... diff --git a/Makefile b/Makefile index 2c08234..8089345 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ all: $(pdfs) .SUFFIXES: .md .pdf .dot .svg .md.pdf: - pandoc --filter sp-filter -t beamer -o $@ $< + pandoc --filter subplot-filter -t beamer -o $@ $< .dot.svg: dot -Tsvg $< > $@ -- cgit v1.2.1