From aa6709b2f657a964c2af04f6370e7eb7c117bf9e Mon Sep 17 00:00:00 2001 From: Lars Wirzenius Date: Fri, 27 Nov 2020 16:57:21 +0200 Subject: feat: implement Summain in Rust --- .gitignore | 5 + Cargo.toml | 5 + check | 35 +++++++ src/lib.rs | 1 + src/main.rs | 70 +++++++++++++- subplot/runcmd.py | 252 +++++++++++++++++++++++++++++++++++++++++++++++++++ subplot/runcmd.yaml | 83 +++++++++++++++++ subplot/summain.py | 36 ++++++++ subplot/summain.yaml | 17 ++++ summain.md | 187 ++++++++++++++++++++++++++++++++++++++ 10 files changed, 689 insertions(+), 2 deletions(-) create mode 100755 check create mode 100644 src/lib.rs create mode 100644 subplot/runcmd.py create mode 100644 subplot/runcmd.yaml create mode 100644 subplot/summain.py create mode 100644 subplot/summain.yaml create mode 100644 summain.md diff --git a/.gitignore b/.gitignore index ea8c4bf..92aa39b 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,6 @@ /target +Cargo.lock +summain.html +summain.pdf +test.log +test.py diff --git a/Cargo.toml b/Cargo.toml index 61ed129..b570500 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,3 +7,8 @@ edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +anyhow = "1" +serde = { version = "1", features = ["derive"] } +serde_yaml = "0.8" +structopt = "0.3" +unix_mode = "0.1" diff --git a/check b/check new file mode 100755 index 0000000..0445bf5 --- /dev/null +++ b/check @@ -0,0 +1,35 @@ +#!/bin/bash +# +# Run automated tests for the project. 
+ +set -euo pipefail + +quiet=-q +hideok=chronic +if [ "$#" -gt 0 ] +then + case "$1" in + verbose | -v | --verbose) + quiet= + hideok= + ;; + esac +fi + +got_cargo_cmd() +{ + cargo --list | grep " $1 " > /dev/null +} + +cargo build --all-targets $quiet +got_cargo_cmd clippy && cargo clippy $quiet +got_cargo_cmd fmt && cargo fmt -- --check + +sp-docgen summain.md -o summain.html +sp-docgen summain.md -o summain.pdf + +sp-codegen summain.md -o test.py +rm -f test.log +$hideok python3 test.py --log test.log "$@" + +echo "Everything seems to be in order." diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1 @@ + diff --git a/src/main.rs b/src/main.rs index e7a11a9..fd23eb8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,69 @@ -fn main() { - println!("Hello, world!"); +use anyhow::Context; +use serde::Serialize; +use std::fs::symlink_metadata; +use std::fs::Metadata; +use std::os::linux::fs::MetadataExt; +use std::path::{Path, PathBuf}; +use structopt::StructOpt; + +fn main() -> anyhow::Result<()> { + let mut opt = Opt::from_args(); + opt.pathnames[..].sort(); + for pathname in opt.pathnames { + report(&pathname).with_context(|| format!("{}", pathname.display()))? 
+ } + Ok(()) +} + +#[derive(StructOpt, Debug)] +struct Opt { + #[structopt(parse(from_os_str))] + pathnames: Vec, +} + +#[derive(Serialize, Debug)] +struct Entry { + path: PathBuf, + atime: i64, + atime_nsec: i64, + #[serde(with = "mode")] + mode: u32, + mtime: i64, + mtime_nsec: i64, + nlink: u64, + size: Option, +} + +impl Entry { + fn new(path: &Path, m: Metadata) -> Self { + Self { + path: path.to_path_buf(), + atime: m.st_atime(), + atime_nsec: m.st_atime_nsec(), + mode: m.st_mode(), + mtime: m.st_mtime(), + mtime_nsec: m.st_mtime_nsec(), + nlink: m.st_nlink(), + size: if m.is_dir() { None } else { Some(m.st_size()) }, + } + } +} + +fn report(pathname: &Path) -> anyhow::Result<()> { + let m = symlink_metadata(pathname)?; + let e = Entry::new(pathname, m); + println!("{}", serde_yaml::to_string(&e)?); + Ok(()) +} + +mod mode { + use serde::{self, Serializer}; + + pub fn serialize(mode: &u32, serializer: S) -> Result + where + S: Serializer, + { + let s = unix_mode::to_string(*mode); + serializer.serialize_str(&s) + } } diff --git a/subplot/runcmd.py b/subplot/runcmd.py new file mode 100644 index 0000000..a2564c6 --- /dev/null +++ b/subplot/runcmd.py @@ -0,0 +1,252 @@ +import logging +import os +import re +import shlex +import subprocess + + +# +# Helper functions. +# + +# Get exit code or other stored data about the latest command run by +# runcmd_run. + + +def _runcmd_get(ctx, name): + ns = ctx.declare("_runcmd") + return ns[name] + + +def runcmd_get_exit_code(ctx): + return _runcmd_get(ctx, "exit") + + +def runcmd_get_stdout(ctx): + return _runcmd_get(ctx, "stdout") + + +def runcmd_get_stdout_raw(ctx): + return _runcmd_get(ctx, "stdout.raw") + + +def runcmd_get_stderr(ctx): + return _runcmd_get(ctx, "stderr") + + +def runcmd_get_stderr_raw(ctx): + return _runcmd_get(ctx, "stderr.raw") + + +def runcmd_get_argv(ctx): + return _runcmd_get(ctx, "argv") + + +# Run a command, given an argv and other arguments for subprocess.Popen. 
+# +# This is meant to be a helper function, not bound directly to a step. The +# stdout, stderr, and exit code are stored in the "_runcmd" namespace in the +# ctx context. +def runcmd_run(ctx, argv, **kwargs): + ns = ctx.declare("_runcmd") + + # The Subplot Python template empties os.environ at startup, modulo a small + # number of variables with carefully chosen values. Here, we don't need to + # care about what those variables are, but we do need to not overwrite + # them, so we just add anything in the env keyword argument, if any, to + # os.environ. + env = dict(os.environ) + for key, arg in kwargs.pop("env", {}).items(): + env[key] = arg + + pp = ns.get("path-prefix") + if pp: + env["PATH"] = pp + ":" + env["PATH"] + + logging.debug(f"runcmd_run") + logging.debug(f" argv: {argv}") + logging.debug(f" env: {env}") + p = subprocess.Popen( + argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, **kwargs + ) + stdout, stderr = p.communicate("") + ns["argv"] = argv + ns["stdout.raw"] = stdout + ns["stderr.raw"] = stderr + ns["stdout"] = stdout.decode("utf-8") + ns["stderr"] = stderr.decode("utf-8") + ns["exit"] = p.returncode + logging.debug(f" ctx: {ctx}") + logging.debug(f" ns: {ns}") + + +# Step: prepend srcdir to PATH whenever runcmd runs a command. +def runcmd_helper_srcdir_path(ctx): + srcdir = globals()["srcdir"] + runcmd_prepend_to_path(ctx, srcdir) + + +# Step: This creates a helper script. +def runcmd_helper_script(ctx, filename=None): + get_file = globals()["get_file"] + with open(filename, "wb") as f: + f.write(get_file(filename)) + + +# +# Step functions for running commands. 
+# + + +def runcmd_prepend_to_path(ctx, dirname=None): + ns = ctx.declare("_runcmd") + pp = ns.get("path-prefix", "") + if pp: + pp = f"{pp}:{dirname}" + else: + pp = dirname + ns["path-prefix"] = pp + + +def runcmd_step(ctx, argv0=None, args=None): + runcmd_try_to_run(ctx, argv0=argv0, args=args) + runcmd_exit_code_is_zero(ctx) + + +def runcmd_try_to_run(ctx, argv0=None, args=None): + argv = [shlex.quote(argv0)] + shlex.split(args) + runcmd_run(ctx, argv) + + +# +# Step functions for examining exit codes. +# + + +def runcmd_exit_code_is_zero(ctx): + runcmd_exit_code_is(ctx, exit=0) + + +def runcmd_exit_code_is(ctx, exit=None): + assert_eq = globals()["assert_eq"] + assert_eq(runcmd_get_exit_code(ctx), int(exit)) + + +def runcmd_exit_code_is_nonzero(ctx): + runcmd_exit_code_is_not(ctx, exit=0) + + +def runcmd_exit_code_is_not(ctx, exit=None): + assert_ne = globals()["assert_ne"] + assert_ne(runcmd_get_exit_code(ctx), int(exit)) + + +# +# Step functions and helpers for examining output in various ways. 
+# + + +def runcmd_stdout_is(ctx, text=None): + _runcmd_output_is(runcmd_get_stdout(ctx), text) + + +def runcmd_stdout_isnt(ctx, text=None): + _runcmd_output_isnt(runcmd_get_stdout(ctx), text) + + +def runcmd_stderr_is(ctx, text=None): + _runcmd_output_is(runcmd_get_stderr(ctx), text) + + +def runcmd_stderr_isnt(ctx, text=None): + _runcmd_output_isnt(runcmd_get_stderr(ctx), text) + + +def _runcmd_output_is(actual, wanted): + assert_eq = globals()["assert_eq"] + wanted = bytes(wanted, "utf8").decode("unicode_escape") + logging.debug("_runcmd_output_is:") + logging.debug(f" actual: {actual!r}") + logging.debug(f" wanted: {wanted!r}") + assert_eq(actual, wanted) + + +def _runcmd_output_isnt(actual, wanted): + assert_ne = globals()["assert_ne"] + wanted = bytes(wanted, "utf8").decode("unicode_escape") + logging.debug("_runcmd_output_isnt:") + logging.debug(f" actual: {actual!r}") + logging.debug(f" wanted: {wanted!r}") + assert_ne(actual, wanted) + + +def runcmd_stdout_contains(ctx, text=None): + _runcmd_output_contains(runcmd_get_stdout(ctx), text) + + +def runcmd_stdout_doesnt_contain(ctx, text=None): + _runcmd_output_doesnt_contain(runcmd_get_stdout(ctx), text) + + +def runcmd_stderr_contains(ctx, text=None): + _runcmd_output_contains(runcmd_get_stderr(ctx), text) + + +def runcmd_stderr_doesnt_contain(ctx, text=None): + _runcmd_output_doesnt_contain(runcmd_get_stderr(ctx), text) + + +def _runcmd_output_contains(actual, wanted): + assert_eq = globals()["assert_eq"] + wanted = bytes(wanted, "utf8").decode("unicode_escape") + logging.debug("_runcmd_output_contains:") + logging.debug(f" actual: {actual!r}") + logging.debug(f" wanted: {wanted!r}") + assert_eq(wanted in actual, True) + + +def _runcmd_output_doesnt_contain(actual, wanted): + assert_ne = globals()["assert_ne"] + wanted = bytes(wanted, "utf8").decode("unicode_escape") + logging.debug("_runcmd_output_doesnt_contain:") + logging.debug(f" actual: {actual!r}") + logging.debug(f" wanted: {wanted!r}") + 
assert_ne(wanted in actual, True) + + +def runcmd_stdout_matches_regex(ctx, regex=None): + _runcmd_output_matches_regex(runcmd_get_stdout(ctx), regex) + + +def runcmd_stdout_doesnt_match_regex(ctx, regex=None): + _runcmd_output_doesnt_match_regex(runcmd_get_stdout(ctx), regex) + + +def runcmd_stderr_matches_regex(ctx, regex=None): + _runcmd_output_matches_regex(runcmd_get_stderr(ctx), regex) + + +def runcmd_stderr_doesnt_match_regex(ctx, regex=None): + _runcmd_output_doesnt_match_regex(runcmd_get_stderr(ctx), regex) + + +def _runcmd_output_matches_regex(actual, regex): + assert_ne = globals()["assert_ne"] + r = re.compile(regex) + m = r.search(actual) + logging.debug("_runcmd_output_matches_regex:") + logging.debug(f" actual: {actual!r}") + logging.debug(f" regex: {regex!r}") + logging.debug(f" match: {m}") + assert_ne(m, None) + + +def _runcmd_output_doesnt_match_regex(actual, regex): + assert_eq = globals()["assert_eq"] + r = re.compile(regex) + m = r.search(actual) + logging.debug("_runcmd_output_doesnt_match_regex:") + logging.debug(f" actual: {actual!r}") + logging.debug(f" regex: {regex!r}") + logging.debug(f" match: {m}") + assert_eq(m, None) diff --git a/subplot/runcmd.yaml b/subplot/runcmd.yaml new file mode 100644 index 0000000..48dde90 --- /dev/null +++ b/subplot/runcmd.yaml @@ -0,0 +1,83 @@ +# Steps to run commands. + +- given: helper script {filename} for runcmd + function: runcmd_helper_script + +- given: srcdir is in the PATH + function: runcmd_helper_srcdir_path + +- when: I run (?P\S+)(?P.*) + regex: true + function: runcmd_step + +- when: I try to run (?P\S+)(?P.*) + regex: true + function: runcmd_try_to_run + +# Steps to examine exit code of latest command. 
+ +- then: exit code is {exit} + function: runcmd_exit_code_is + +- then: exit code is not {exit} + function: runcmd_exit_code_is_not + +- then: command is successful + function: runcmd_exit_code_is_zero + +- then: command fails + function: runcmd_exit_code_is_nonzero + +# Steps to examine stdout/stderr for exact content. + +- then: stdout is exactly "(?P.*)" + regex: true + function: runcmd_stdout_is + +- then: "stdout isn't exactly \"(?P.*)\"" + regex: true + function: runcmd_stdout_isnt + +- then: stderr is exactly "(?P.*)" + regex: true + function: runcmd_stderr_is + +- then: "stderr isn't exactly \"(?P.*)\"" + regex: true + function: runcmd_stderr_isnt + +# Steps to examine stdout/stderr for sub-strings. + +- then: stdout contains "(?P.*)" + regex: true + function: runcmd_stdout_contains + +- then: "stdout doesn't contain \"(?P.*)\"" + regex: true + function: runcmd_stdout_doesnt_contain + +- then: stderr contains "(?P.*)" + regex: true + function: runcmd_stderr_contains + +- then: "stderr doesn't contain \"(?P.*)\"" + regex: true + function: runcmd_stderr_doesnt_contain + +# Steps to match stdout/stderr against regular expressions. 
+ +- then: stdout matches regex (?P.*) + regex: true + function: runcmd_stdout_matches_regex + +- then: stdout doesn't match regex (?P.*) + regex: true + function: runcmd_stdout_doesnt_match_regex + +- then: stderr matches regex (?P.*) + regex: true + function: runcmd_stderr_matches_regex + +- then: stderr doesn't match regex (?P.*) + regex: true + function: runcmd_stderr_doesnt_match_regex diff --git a/subplot/summain.py b/subplot/summain.py new file mode 100644 index 0000000..3ea6188 --- /dev/null +++ b/subplot/summain.py @@ -0,0 +1,36 @@ +import os + + +def install_summain(ctx): + runcmd_prepend_to_path = globals()["runcmd_prepend_to_path"] + srcdir = globals()["srcdir"] + bindir = os.path.join(srcdir, "target", "debug") + runcmd_prepend_to_path(ctx, dirname=bindir) + + +def create_directory(ctx, dirname=None): + os.mkdir(dirname) + + +def create_file(ctx, filename=None): + open(filename, "w").close() + + +def set_atime(ctx, filename=None, timestamp=None): + st = os.lstat(filename) + os.utime(filename, (int(timestamp), int(st.st_mtime))) + + +def set_mtime(ctx, filename=None, timestamp=None): + st = os.lstat(filename) + os.utime(filename, (int(st.st_atime), int(timestamp))) + + +def output_matches_file(ctx, filename=None): + runcmd_get_stdout = globals()["runcmd_get_stdout"] + get_file = globals()["get_file"] + assert_eq = globals()["assert_eq"] + + actual = runcmd_get_stdout(ctx) + expected = get_file(filename).decode("UTF-8") + assert_eq(actual, expected) diff --git a/subplot/summain.yaml b/subplot/summain.yaml new file mode 100644 index 0000000..31bc28c --- /dev/null +++ b/subplot/summain.yaml @@ -0,0 +1,17 @@ +- given: an installed summain + function: install_summain + +- given: directory {dirname} + function: create_directory + +- given: file {filename} + function: create_file + +- given: atime for {filename} is {timestamp} + function: set_atime + +- given: mtime for {filename} is {timestamp} + function: set_mtime + +- then: output matches file {filename} + 
function: output_matches_file diff --git a/summain.md b/summain.md new file mode 100644 index 0000000..7c2c5fa --- /dev/null +++ b/summain.md @@ -0,0 +1,187 @@ +# Introduction + +A file manifest lists files, with their metadata. + +To verify a backup has been restored correctly, one can compare a +manifest of the data before the backup and after it has been restored. +If the manifests are identical, the data has been restored correctly. + +This requires a way to produce manifests that is deterministic: if run +twice on the same input files, without the files having changed, the +result should be identical. The Summain program does this. + +This version of Summain has been written in Rust for the [Obnam][] +project. + +[Obnam]: https://obnam.org/ + +## Why not mtree? + +[mtree]: http://cdn.netbsd.org/pub/pkgsrc/current/pkgsrc/pkgtools/mtree/README.html +[NetBSD]: https://en.wikipedia.org/wiki/NetBSD + +[mtree][] is a tool included in [NetBSD][] Unix since version 1.2, +released in 1996. It produces a manifest, and can check a manifest +against the file system. It is, in principle, a tool that solves the +same problem as Summain. Why not use an existing tool? Some reasons: + +* I'm an anti-social not-invented-here jerk. +* It's an old C program, without tests in the source tree. +* The file format is custom, and not nice for reading by humans. +* It doesn't handle Unicode well. + - a filename of `ö` is encoded as `\M-C\M-6` + - but at least it can handle non-ASCII characters! +* It doesn't handle file metadata that's Linux-specific. + - extended attributes + - the ext4 immutable bit +* It's single-threaded. + +In principle, there is no reason why mtree couldn't be extended to +support everything I need for Obnam. In practice, since I'm working on +this in my free time in order to have fun, I prefer to write a new +tool in Rust. + + +## Why not use the old Python version of Summain? + +I don't like Python anymore. 
The old tool would need updates to work +with current Python, and I'd rather use Rust. + + +# Usage + +Summain is given one or more files or directories on the command line, +and it outputs to its standard output a manifest. If the command line +arguments are the same, and the files haven't changed, the manifest is +the same. + +The output is YAML. Each file gets its own YAML document, delimited +by `---` and `...` as usual. + +Summain does not itself traverse directories. Instead, a tool like +**find**(1) should be used. Summain will, however, sort its command +line arguments, so that the order in which they are given does not +affect the output. + +# Acceptance criteria + +## Directory + +~~~scenario +given an installed summain +given directory empty +and atime for empty is 123 +and mtime for empty is 456 +when I run chmod a=rx empty +when I run summain empty +then output matches file empty.yaml +~~~ + +```{#empty.yaml .file .numberLines} +--- +path: empty +atime: 123 +atime_nsec: 0 +mode: dr-xr-xr-x +mtime: 456 +mtime_nsec: 0 +nlink: 2 +size: ~ +``` + +## Writeable file + + +~~~scenario +given an installed summain +given file foo +and atime for foo is 11 +and mtime for foo is 22 +when I run chmod a=rw foo +when I run summain foo +then output matches file foo.yaml +~~~ + +```{#foo.yaml .file .numberLines} +--- +path: foo +atime: 11 +atime_nsec: 0 +mode: "-rw-rw-rw-" +mtime: 22 +mtime_nsec: 0 +nlink: 1 +size: 0 +``` + +## Read-only file + +~~~scenario +given an installed summain +given file foo +and atime for foo is 33 +and mtime for foo is 44 +when I run chmod a=r foo +when I run summain foo +then output matches file readonly.yaml +~~~ + +```{#readonly.yaml .file .numberLines} +--- +path: foo +atime: 33 +atime_nsec: 0 +mode: "-r--r--r--" +mtime: 44 +mtime_nsec: 0 +nlink: 1 +size: 0 +``` + +## Two files sorted + +~~~scenario +given an installed summain +given file aaa +and atime for aaa is 33 +and mtime for aaa is 44 +given file bbb +and atime for bbb is 33 +and mtime 
for bbb is 44 +when I run chmod a=r aaa bbb +when I run summain bbb aaa +then output matches file aaabbb.yaml +~~~ + +```{#aaabbb.yaml .file .numberLines} +--- +path: aaa +atime: 33 +atime_nsec: 0 +mode: "-r--r--r--" +mtime: 44 +mtime_nsec: 0 +nlink: 1 +size: 0 +--- +path: bbb +atime: 33 +atime_nsec: 0 +mode: "-r--r--r--" +mtime: 44 +mtime_nsec: 0 +nlink: 1 +size: 0 +``` + +--- +title: "Summain—deterministic file manifests" +author: Lars Wirzenius +template: python +bindings: + - subplot/summain.yaml + - subplot/runcmd.yaml +functions: + - subplot/summain.py + - subplot/runcmd.py +... -- cgit v1.2.1