From aa6709b2f657a964c2af04f6370e7eb7c117bf9e Mon Sep 17 00:00:00 2001
From: Lars Wirzenius <liw@liw.fi>
Date: Fri, 27 Nov 2020 16:57:21 +0200
Subject: feat: implement Summain in Rust

---
 .gitignore           |   5 +
 Cargo.toml           |   5 +
 check                |  35 +++++++
 src/lib.rs           |   1 +
 src/main.rs          |  70 +++++++++++++-
 subplot/runcmd.py    | 252 +++++++++++++++++++++++++++++++++++++++++++++++++++
 subplot/runcmd.yaml  |  83 +++++++++++++++++
 subplot/summain.py   |  36 ++++++++
 subplot/summain.yaml |  17 ++++
 summain.md           | 187 ++++++++++++++++++++++++++++++++++++++
 10 files changed, 689 insertions(+), 2 deletions(-)
 create mode 100755 check
 create mode 100644 src/lib.rs
 create mode 100644 subplot/runcmd.py
 create mode 100644 subplot/runcmd.yaml
 create mode 100644 subplot/summain.py
 create mode 100644 subplot/summain.yaml
 create mode 100644 summain.md
diff --git a/.gitignore b/.gitignore
index ea8c4bf..92aa39b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,6 @@
 /target
+Cargo.lock
+summain.html
+summain.pdf
+test.log
+test.py
diff --git a/Cargo.toml b/Cargo.toml
index 61ed129..b570500 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,3 +7,8 @@ edition = "2018"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
+anyhow = "1"
+serde = { version = "1", features = ["derive"] }
+serde_yaml = "0.8"
+structopt = "0.3"
+unix_mode = "0.1"
diff --git a/check b/check
new file mode 100755
index 0000000..0445bf5
--- /dev/null
+++ b/check
@@ -0,0 +1,35 @@
+#!/bin/bash
+#
+# Run automated tests for the project.
+
+set -euo pipefail
+
+quiet=-q
+hideok=chronic
+if [ "$#" -gt 0 ]
+then
+    case "$1" in
+	verbose | -v | --verbose)
+	    quiet=
+	    hideok=
+	    ;;
+    esac
+fi
+
+got_cargo_cmd()
+{
+    cargo --list | grep " $1 " > /dev/null
+}
+
+cargo build --all-targets $quiet
+got_cargo_cmd clippy && cargo clippy $quiet
+got_cargo_cmd fmt && cargo fmt -- --check
+
+sp-docgen summain.md -o summain.html
+sp-docgen summain.md -o summain.pdf
+
+sp-codegen summain.md -o test.py
+rm -f test.log
+$hideok python3 test.py --log test.log "$@"
+
+echo "Everything seems to be in order."
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1 @@
+
diff --git a/src/main.rs b/src/main.rs
index e7a11a9..fd23eb8 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,69 @@
-fn main() {
-    println!("Hello, world!");
+use anyhow::Context;
+use serde::Serialize;
+use std::fs::symlink_metadata;
+use std::fs::Metadata;
+use std::os::linux::fs::MetadataExt;
+use std::path::{Path, PathBuf};
+use structopt::StructOpt;
+
+fn main() -> anyhow::Result<()> {
+    let mut opt = Opt::from_args();
+    opt.pathnames[..].sort();
+    for pathname in opt.pathnames {
+        report(&pathname).with_context(|| format!("{}", pathname.display()))?
+    }
+    Ok(())
+}
+
+#[derive(StructOpt, Debug)]
+struct Opt {
+    #[structopt(parse(from_os_str))]
+    pathnames: Vec<PathBuf>,
+}
+
+#[derive(Serialize, Debug)]
+struct Entry {
+    path: PathBuf,
+    atime: i64,
+    atime_nsec: i64,
+    #[serde(with = "mode")]
+    mode: u32,
+    mtime: i64,
+    mtime_nsec: i64,
+    nlink: u64,
+    size: Option<u64>,
+}
+
+impl Entry {
+    fn new(path: &Path, m: Metadata) -> Self {
+        Self {
+            path: path.to_path_buf(),
+            atime: m.st_atime(),
+            atime_nsec: m.st_atime_nsec(),
+            mode: m.st_mode(),
+            mtime: m.st_mtime(),
+            mtime_nsec: m.st_mtime_nsec(),
+            nlink: m.st_nlink(),
+            size: if m.is_dir() { None } else { Some(m.st_size()) },
+        }
+    }
+}
+
+fn report(pathname: &Path) -> anyhow::Result<()> {
+    let m = symlink_metadata(pathname)?;
+    let e = Entry::new(pathname, m);
+    println!("{}", serde_yaml::to_string(&e)?);
+    Ok(())
+}
+
+mod mode {
+    use serde::{self, Serializer};
+
+    pub fn serialize<S>(mode: &u32, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        let s = unix_mode::to_string(*mode);
+        serializer.serialize_str(&s)
+    }
 }
diff --git a/subplot/runcmd.py b/subplot/runcmd.py
new file mode 100644
index 0000000..a2564c6
--- /dev/null
+++ b/subplot/runcmd.py
@@ -0,0 +1,252 @@
+import logging
+import os
+import re
+import shlex
+import subprocess
+
+
+#
+# Helper functions.
+#
+
+# Get exit code or other stored data about the latest command run by
+# runcmd_run.
+
+
+def _runcmd_get(ctx, name):
+    ns = ctx.declare("_runcmd")
+    return ns[name]
+
+
+def runcmd_get_exit_code(ctx):
+    return _runcmd_get(ctx, "exit")
+
+
+def runcmd_get_stdout(ctx):
+    return _runcmd_get(ctx, "stdout")
+
+
+def runcmd_get_stdout_raw(ctx):
+    return _runcmd_get(ctx, "stdout.raw")
+
+
+def runcmd_get_stderr(ctx):
+    return _runcmd_get(ctx, "stderr")
+
+
+def runcmd_get_stderr_raw(ctx):
+    return _runcmd_get(ctx, "stderr.raw")
+
+
+def runcmd_get_argv(ctx):
+    return _runcmd_get(ctx, "argv")
+
+
+# Run a command, given an argv and other arguments for subprocess.Popen.
+#
+# This is meant to be a helper function, not bound directly to a step. The
+# stdout, stderr, and exit code are stored in the "_runcmd" namespace in the
+# ctx context.
+def runcmd_run(ctx, argv, **kwargs):
+    ns = ctx.declare("_runcmd")
+
+    # The Subplot Python template empties os.environ at startup, modulo a small
+    # number of variables with carefully chosen values. Here, we don't need to
+    # care about what those variables are, but we do need to not overwrite
+    # them, so we just add anything in the env keyword argument, if any, to
+    # os.environ.
+    env = dict(os.environ)
+    for key, arg in kwargs.pop("env", {}).items():
+        env[key] = arg
+
+    pp = ns.get("path-prefix")
+    if pp:
+        env["PATH"] = pp + ":" + env["PATH"]
+
+    logging.debug(f"runcmd_run")
+    logging.debug(f"  argv: {argv}")
+    logging.debug(f"  env: {env}")
+    p = subprocess.Popen(
+        argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, **kwargs
+    )
+    stdout, stderr = p.communicate("")
+    ns["argv"] = argv
+    ns["stdout.raw"] = stdout
+    ns["stderr.raw"] = stderr
+    ns["stdout"] = stdout.decode("utf-8")
+    ns["stderr"] = stderr.decode("utf-8")
+    ns["exit"] = p.returncode
+    logging.debug(f"  ctx: {ctx}")
+    logging.debug(f"  ns: {ns}")
+
+
+# Step: prepend srcdir to PATH whenever runcmd runs a command.
+def runcmd_helper_srcdir_path(ctx):
+    srcdir = globals()["srcdir"]
+    runcmd_prepend_to_path(ctx, srcdir)
+
+
+# Step: This creates a helper script.
+def runcmd_helper_script(ctx, filename=None):
+    get_file = globals()["get_file"]
+    with open(filename, "wb") as f:
+        f.write(get_file(filename))
+
+
+#
+# Step functions for running commands.
+#
+
+
+def runcmd_prepend_to_path(ctx, dirname=None):
+    ns = ctx.declare("_runcmd")
+    pp = ns.get("path-prefix", "")
+    if pp:
+        pp = f"{pp}:{dirname}"
+    else:
+        pp = dirname
+    ns["path-prefix"] = pp
+
+
+def runcmd_step(ctx, argv0=None, args=None):
+    runcmd_try_to_run(ctx, argv0=argv0, args=args)
+    runcmd_exit_code_is_zero(ctx)
+
+
+def runcmd_try_to_run(ctx, argv0=None, args=None):
+    argv = [shlex.quote(argv0)] + shlex.split(args)
+    runcmd_run(ctx, argv)
+
+
+#
+# Step functions for examining exit codes.
+#
+
+
+def runcmd_exit_code_is_zero(ctx):
+    runcmd_exit_code_is(ctx, exit=0)
+
+
+def runcmd_exit_code_is(ctx, exit=None):
+    assert_eq = globals()["assert_eq"]
+    assert_eq(runcmd_get_exit_code(ctx), int(exit))
+
+
+def runcmd_exit_code_is_nonzero(ctx):
+    runcmd_exit_code_is_not(ctx, exit=0)
+
+
+def runcmd_exit_code_is_not(ctx, exit=None):
+    assert_ne = globals()["assert_ne"]
+    assert_ne(runcmd_get_exit_code(ctx), int(exit))
+
+
+#
+# Step functions and helpers for examining output in various ways.
+#
+
+
+def runcmd_stdout_is(ctx, text=None):
+    _runcmd_output_is(runcmd_get_stdout(ctx), text)
+
+
+def runcmd_stdout_isnt(ctx, text=None):
+    _runcmd_output_isnt(runcmd_get_stdout(ctx), text)
+
+
+def runcmd_stderr_is(ctx, text=None):
+    _runcmd_output_is(runcmd_get_stderr(ctx), text)
+
+
+def runcmd_stderr_isnt(ctx, text=None):
+    _runcmd_output_isnt(runcmd_get_stderr(ctx), text)
+
+
+def _runcmd_output_is(actual, wanted):
+    assert_eq = globals()["assert_eq"]
+    wanted = bytes(wanted, "utf8").decode("unicode_escape")
+    logging.debug("_runcmd_output_is:")
+    logging.debug(f"  actual: {actual!r}")
+    logging.debug(f"  wanted: {wanted!r}")
+    assert_eq(actual, wanted)
+
+
+def _runcmd_output_isnt(actual, wanted):
+    assert_ne = globals()["assert_ne"]
+    wanted = bytes(wanted, "utf8").decode("unicode_escape")
+    logging.debug("_runcmd_output_isnt:")
+    logging.debug(f"  actual: {actual!r}")
+    logging.debug(f"  wanted: {wanted!r}")
+    assert_ne(actual, wanted)
+
+
+def runcmd_stdout_contains(ctx, text=None):
+    _runcmd_output_contains(runcmd_get_stdout(ctx), text)
+
+
+def runcmd_stdout_doesnt_contain(ctx, text=None):
+    _runcmd_output_doesnt_contain(runcmd_get_stdout(ctx), text)
+
+
+def runcmd_stderr_contains(ctx, text=None):
+    _runcmd_output_contains(runcmd_get_stderr(ctx), text)
+
+
+def runcmd_stderr_doesnt_contain(ctx, text=None):
+    _runcmd_output_doesnt_contain(runcmd_get_stderr(ctx), text)
+
+
+def _runcmd_output_contains(actual, wanted):
+    assert_eq = globals()["assert_eq"]
+    wanted = bytes(wanted, "utf8").decode("unicode_escape")
+    logging.debug("_runcmd_output_contains:")
+    logging.debug(f"  actual: {actual!r}")
+    logging.debug(f"  wanted: {wanted!r}")
+    assert_eq(wanted in actual, True)
+
+
+def _runcmd_output_doesnt_contain(actual, wanted):
+    assert_ne = globals()["assert_ne"]
+    wanted = bytes(wanted, "utf8").decode("unicode_escape")
+    logging.debug("_runcmd_output_doesnt_contain:")
+    logging.debug(f"  actual: {actual!r}")
+    logging.debug(f"  wanted: {wanted!r}")
+    assert_ne(wanted in actual, True)
+
+
+def runcmd_stdout_matches_regex(ctx, regex=None):
+    _runcmd_output_matches_regex(runcmd_get_stdout(ctx), regex)
+
+
+def runcmd_stdout_doesnt_match_regex(ctx, regex=None):
+    _runcmd_output_doesnt_match_regex(runcmd_get_stdout(ctx), regex)
+
+
+def runcmd_stderr_matches_regex(ctx, regex=None):
+    _runcmd_output_matches_regex(runcmd_get_stderr(ctx), regex)
+
+
+def runcmd_stderr_doesnt_match_regex(ctx, regex=None):
+    _runcmd_output_doesnt_match_regex(runcmd_get_stderr(ctx), regex)
+
+
+def _runcmd_output_matches_regex(actual, regex):
+    assert_ne = globals()["assert_ne"]
+    r = re.compile(regex)
+    m = r.search(actual)
+    logging.debug("_runcmd_output_matches_regex:")
+    logging.debug(f"  actual: {actual!r}")
+    logging.debug(f"  regex: {regex!r}")
+    logging.debug(f"  match: {m}")
+    assert_ne(m, None)
+
+
+def _runcmd_output_doesnt_match_regex(actual, regex):
+    assert_eq = globals()["assert_eq"]
+    r = re.compile(regex)
+    m = r.search(actual)
+    logging.debug("_runcmd_output_doesnt_match_regex:")
+    logging.debug(f"  actual: {actual!r}")
+    logging.debug(f"  regex: {regex!r}")
+    logging.debug(f"  match: {m}")
+    assert_eq(m, None)
diff --git a/subplot/runcmd.yaml b/subplot/runcmd.yaml
new file mode 100644
index 0000000..48dde90
--- /dev/null
+++ b/subplot/runcmd.yaml
@@ -0,0 +1,83 @@
+# Steps to run commands.
+
+- given: helper script {filename} for runcmd
+  function: runcmd_helper_script
+
+- given: srcdir is in the PATH
+  function: runcmd_helper_srcdir_path
+
+- when: I run (?P<argv0>\S+)(?P<args>.*)
+  regex: true
+  function: runcmd_step
+
+- when: I try to run (?P<argv0>\S+)(?P<args>.*)
+  regex: true
+  function: runcmd_try_to_run
+
+# Steps to examine exit code of latest command.
+
+- then: exit code is {exit}
+  function: runcmd_exit_code_is
+
+- then: exit code is not {exit}
+  function: runcmd_exit_code_is_not
+
+- then: command is successful
+  function: runcmd_exit_code_is_zero
+
+- then: command fails
+  function: runcmd_exit_code_is_nonzero
+
+# Steps to examine stdout/stderr for exact content.
+
+- then: stdout is exactly "(?P<text>.*)"
+  regex: true
+  function: runcmd_stdout_is
+
+- then: "stdout isn't exactly \"(?P<text>.*)\""
+  regex: true
+  function: runcmd_stdout_isnt
+
+- then: stderr is exactly "(?P<text>.*)"
+  regex: true
+  function: runcmd_stderr_is
+
+- then: "stderr isn't exactly \"(?P<text>.*)\""
+  regex: true
+  function: runcmd_stderr_isnt
+
+# Steps to examine stdout/stderr for sub-strings.
+
+- then: stdout contains "(?P<text>.*)"
+  regex: true
+  function: runcmd_stdout_contains
+
+- then: "stdout doesn't contain \"(?P<text>.*)\""
+  regex: true
+  function: runcmd_stdout_doesnt_contain
+
+- then: stderr contains "(?P<text>.*)"
+  regex: true
+  function: runcmd_stderr_contains
+
+- then: "stderr doesn't contain \"(?P<text>.*)\""
+  regex: true
+  function: runcmd_stderr_doesnt_contain
+
+# Steps to match stdout/stderr against regular expressions.
+
+- then: stdout matches regex (?P<regex>.*)
+  regex: true
+  function: runcmd_stdout_matches_regex
+
+- then: stdout doesn't match regex (?P<regex>.*)
+  regex: true
+  function: runcmd_stdout_doesnt_match_regex
+
+- then: stderr matches regex (?P<regex>.*)
+  regex: true
+  function: runcmd_stderr_matches_regex
+
+- then: stderr doesn't match regex (?P<regex>.*)
+  regex: true
+  function: runcmd_stderr_doesnt_match_regex
diff --git a/subplot/summain.py b/subplot/summain.py
new file mode 100644
index 0000000..3ea6188
--- /dev/null
+++ b/subplot/summain.py
@@ -0,0 +1,36 @@
+import os
+
+
+def install_summain(ctx):
+    runcmd_prepend_to_path = globals()["runcmd_prepend_to_path"]
+    srcdir = globals()["srcdir"]
+    bindir = os.path.join(srcdir, "target", "debug")
+    runcmd_prepend_to_path(ctx, dirname=bindir)
+
+
+def create_directory(ctx, dirname=None):
+    os.mkdir(dirname)
+
+
+def create_file(ctx, filename=None):
+    open(filename, "w").close()
+
+
+def set_atime(ctx, filename=None, timestamp=None):
+    st = os.lstat(filename)
+    os.utime(filename, (int(timestamp), int(st.st_mtime)))
+
+
+def set_mtime(ctx, filename=None, timestamp=None):
+    st = os.lstat(filename)
+    os.utime(filename, (int(st.st_atime), int(timestamp)))
+
+
+def output_matches_file(ctx, filename=None):
+    runcmd_get_stdout = globals()["runcmd_get_stdout"]
+    get_file = globals()["get_file"]
+    assert_eq = globals()["assert_eq"]
+
+    actual = runcmd_get_stdout(ctx)
+    expected = get_file(filename).decode("UTF-8")
+    assert_eq(actual, expected)
diff --git a/subplot/summain.yaml b/subplot/summain.yaml
new file mode 100644
index 0000000..31bc28c
--- /dev/null
+++ b/subplot/summain.yaml
@@ -0,0 +1,17 @@
+- given: an installed summain
+  function: install_summain
+
+- given: directory {dirname}
+  function: create_directory
+
+- given: file {filename}
+  function: create_file
+
+- given: atime for {filename} is {timestamp}
+  function: set_atime
+
+- given: mtime for {filename} is {timestamp}
+  function: set_mtime
+
+- then: output matches file {filename}
+  function: output_matches_file
diff --git a/summain.md b/summain.md
new file mode 100644
index 0000000..7c2c5fa
--- /dev/null
+++ b/summain.md
@@ -0,0 +1,187 @@
+# Introduction
+
+A file manifest lists files, with their metadata.
+
+To verify a backup has been restored correctly, one can compare a
+manifest of the data before the backup and after it has been restored.
+If the manifests are identical, the data has been restored correctly.
+
+This requires a way to produce manifests that is deterministic: if run
+twice on the same input files, without the files having changed, the
+result should be identical. The Summain program does this.
+
+This version of Summain has been written in Rust for the [Obnam][]
+project.
+
+[Obnam]: https://obnam.org/
+
+## Why not mtree?
+
+[mtree]: http://cdn.netbsd.org/pub/pkgsrc/current/pkgsrc/pkgtools/mtree/README.html
+[NetBSD]: https://en.wikipedia.org/wiki/NetBSD
+
+[mtree][] is a tool included in [NetBSD][] Unix since version 1.2,
+released in 1996. It produces a manifest, and can check a manifest
+against the file system. It is, in principle, a tool that solves the
+same problem as Summain. Why not use an existing tool? Some reasons:
+
+* I'm an anti-social not-invented-here jerk.
+* It's an old C program, without tests in the source tree.
+* The file format is custom, and not nice for reading by humans.
+* It doesn't handle Unicode well.
+  - a filename of `ö` is encoded as `\M-C\M-6`
+  - but at least it can handle non-ASCII characters!
+* It doesn't handle file metadata that's Linux specific.
+  - extended attributes
+  - the ext4 immutable bit
+* It's single-threaded.
+
+In principle, there is no reason why mtree couldn't be extended to
+support everything I need for Obnam. In practice, since I'm working on
+this in my free time in order to have fun, I prefer to write a new
+tool in Rust.
+
+
+## Why not use the old Python version of Summain?
+
+I don't like Python anymore. The old tool would need updates to work
+with current Python, and I'd rather use Rust.
+
+
+# Usage
+
+Summain is given one or more files or directories on the command line,
+and it outputs to its standard output a manifest. If the command line
+arguments are the same, and the files haven't changed, the manifest is
+the same.
+
+The output is YAML. Each file gets its own YAML document, delimited
+by `---` and `...` as usual.
+
+Summain does not itself traverse directories. Instead, a tool like
+**find**(1) should be used. Summain will, however, sort its command
+line arguments, so that the manifest does not depend on the order in
+which they are given.
+
+# Acceptance criteria
+
+## Directory
+
+~~~scenario
+given an installed summain
+given directory empty
+and atime for empty is 123
+and mtime for empty is 456
+when I run chmod a=rx empty
+when I run summain empty
+then output matches file empty.yaml
+~~~
+
+```{#empty.yaml .file .numberLines}
+---
+path: empty
+atime: 123
+atime_nsec: 0
+mode: dr-xr-xr-x
+mtime: 456
+mtime_nsec: 0
+nlink: 2
+size: ~
+```
+
+## Writeable file
+
+
+~~~scenario
+given an installed summain
+given file foo
+and atime for foo is 11
+and mtime for foo is 22
+when I run chmod a=rw foo
+when I run summain foo
+then output matches file foo.yaml
+~~~
+
+```{#foo.yaml .file .numberLines}
+---
+path: foo
+atime: 11
+atime_nsec: 0
+mode: "-rw-rw-rw-"
+mtime: 22
+mtime_nsec: 0
+nlink: 1
+size: 0
+```
+
+## Read-only file
+
+~~~scenario
+given an installed summain
+given file foo
+and atime for foo is 33
+and mtime for foo is 44
+when I run chmod a=r foo
+when I run summain foo
+then output matches file readonly.yaml
+~~~
+
+```{#readonly.yaml .file .numberLines}
+---
+path: foo
+atime: 33
+atime_nsec: 0
+mode: "-r--r--r--"
+mtime: 44
+mtime_nsec: 0
+nlink: 1
+size: 0
+```
+
+## Two files sorted
+
+~~~scenario
+given an installed summain
+given file aaa
+and atime for aaa is 33
+and mtime for aaa is 44
+given file bbb
+and atime for bbb is 33
+and mtime for bbb is 44
+when I run chmod a=r aaa bbb
+when I run summain bbb aaa
+then output matches file aaabbb.yaml
+~~~
+
+```{#aaabbb.yaml .file .numberLines}
+---
+path: aaa
+atime: 33
+atime_nsec: 0
+mode: "-r--r--r--"
+mtime: 44
+mtime_nsec: 0
+nlink: 1
+size: 0
+---
+path: bbb
+atime: 33
+atime_nsec: 0
+mode: "-r--r--r--"
+mtime: 44
+mtime_nsec: 0
+nlink: 1
+size: 0
+```
+
+---
+title: "Summain&mdash;deterministic file manifests"
+author: Lars Wirzenius
+template: python
+bindings:
+  - subplot/summain.yaml
+  - subplot/runcmd.yaml
+functions:
+  - subplot/summain.py
+  - subplot/runcmd.py
+...
-- 
cgit v1.2.1