summaryrefslogtreecommitdiff
path: root/src/ast.rs
diff options
context:
space:
mode:
authorLars Wirzenius <liw@liw.fi>2021-09-14 14:14:38 +0300
committerLars Wirzenius <liw@liw.fi>2021-09-16 16:40:18 +0300
commit458525d21c9960d98fa932be06cf133cd910ce7b (patch)
tree08f3d913e388c5870231b757f64ed0fe157ba105 /src/ast.rs
parent9198b69c561e582704dbd99000d59e474c406a67 (diff)
downloadsubplot-458525d21c9960d98fa932be06cf133cd910ce7b.tar.gz
feat! parse Markdown input with pulldown-cmark instead of Pandoc
This is a first step towards being able to use Subplot codegen from a project's build.rs, and with only pure-Rust build dependencies. Replace Pandoc for parsing Markdown input with pulldown-cmark. This is mostly a drop-in replacement, but not entirely. The YAML parsing is more strict now. Note that this is a breaking change. Some subplots that used to work, and still work with docgen, no longer work with the new parser. Major differences are: * Only specific fields are supported. All the Markdown files in the Subplot source tree work. If anything else is needed, and it likely is, the new parser needs to be extended. * The bindings, functions, classes, and bibliography fields MUST be lists of strings. A single string value will no longer work. Sponsored-by: pep.foundation
Diffstat (limited to 'src/ast.rs')
-rw-r--r--src/ast.rs455
1 files changed, 455 insertions, 0 deletions
diff --git a/src/ast.rs b/src/ast.rs
new file mode 100644
index 0000000..60d234d
--- /dev/null
+++ b/src/ast.rs
@@ -0,0 +1,455 @@
+use lazy_static::lazy_static;
+use pandoc_ast::{Attr, Block, Inline, Map, MetaValue, Pandoc};
+use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag};
+use regex::Regex;
+use serde::Deserialize;
+use std::path::{Path, PathBuf};
+use tracing::{event, span, Level};
+
+lazy_static! {
+ // Pattern that recognises a YAML block at the beginning of a file.
+ //
+ // The block is opened by a line of three or more dashes and closed
+ // by a line of three or more dots. The `yaml` group captures what is
+ // between those lines, the `text` group captures everything after
+ // the closing line. The pattern is a literal, so `unwrap` can only
+ // fail on a programming error.
+ //
+ // NOTE(review): the skipped leading lines are matched with `\S*\n`,
+ // i.e. lines without any whitespace, not lines with only whitespace;
+ // `\s*` may have been intended -- confirm.
+ static ref LEADING_YAML_PATTERN: Regex = Regex::new(r"^(?:\S*\n)*-{3,}\n(?P<yaml>([^.]+.*?\n)*)\.{3,}\n(?P<text>(.*\n)*)$").unwrap();
+
+
+ // Pattern that recognises a YAML block at the end of a file.
+ //
+ // Same delimiters as the leading pattern, but here the `text` group
+ // captures everything before the opening line of dashes.
+ //
+ // NOTE(review): as above, the skipped trailing lines use `\S*\n`;
+ // confirm whether `\s*` was intended.
+ static ref TRAILING_YAML_PATTERN: Regex = Regex::new(r"(?P<text>(.*\n)*)\n*-{3,}\n(?P<yaml>(.*?\n)*)\.{3,}\n(?:\S*\n)*$").unwrap();
+}
+
+/// An abstract syntax tree representation of a Markdown file.
+///
+/// This represents a Markdown file as an abstract syntax tree
+/// compatible with Pandoc's AST. The document YAML metadata MUST be
+/// at the top or bottom of the file, excluding leading or trailing
+/// empty lines.
+#[derive(Debug)]
+pub struct AbstractSyntaxTree {
+ // Block-level elements of the document body, in the order the
+ // Markdown parser produced them.
+ blocks: Vec<Block>,
+ // Document metadata taken from the embedded YAML block; empty when
+ // the input has no such block.
+ meta: Map<String, MetaValue>,
+}
+
+impl AbstractSyntaxTree {
+    // Assemble an AST from metadata and blocks that have already been
+    // parsed.
+    //
+    // Deliberately private.
+    fn new(meta: Map<String, MetaValue>, blocks: Vec<Block>) -> Self {
+        AbstractSyntaxTree { meta, blocks }
+    }
+
+    /// Build a Pandoc document from a copy of the metadata and blocks.
+    ///
+    /// The AST itself is left untouched; the result always reports
+    /// Pandoc API version 1.20.
+    pub fn to_pandoc(&self) -> Pandoc {
+        let meta = self.meta.clone();
+        let blocks = self.blocks.clone();
+        Pandoc {
+            meta,
+            blocks,
+            pandoc_api_version: vec![1, 20],
+        }
+    }
+}
+
+impl std::str::FromStr for AbstractSyntaxTree {
+    type Err = Error;
+
+    /// Create an abstract syntax tree from a string.
+    ///
+    /// A YAML metadata block is looked for first at the top of the
+    /// input and then at the bottom. When one is found it becomes the
+    /// document metadata and only the remaining text is parsed as
+    /// Markdown. Without a metadata block the whole input is parsed as
+    /// Markdown and the metadata is empty.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if a metadata block is found but can't be
+    /// deserialized.
+    fn from_str(markdown: &str) -> Result<Self, Self::Err> {
+        let span = span!(Level::TRACE, "parse markdown");
+        // The guard must be bound to a named variable: `let _ = ...`
+        // drops it immediately, which exits the span before any of the
+        // work below happens.
+        let _enter = span.enter();
+        event!(Level::TRACE, "Parsing markdown");
+        let ast = if let Some((yaml, markdown)) = get_yaml(&LEADING_YAML_PATTERN, markdown) {
+            event!(Level::TRACE, ?yaml, "Found leading YAML");
+            let meta = Metadata::new(yaml)?.to_map();
+            let blocks = parse_blocks(markdown);
+            AbstractSyntaxTree::new(meta, blocks)
+        } else if let Some((yaml, markdown)) = get_yaml(&TRAILING_YAML_PATTERN, markdown) {
+            event!(Level::TRACE, ?yaml, "Found trailing YAML");
+            let meta = Metadata::new(yaml)?.to_map();
+            // Parse only the text in front of the metadata block, so
+            // that the YAML isn't interpreted as Markdown as well.
+            let blocks = parse_blocks(markdown);
+            AbstractSyntaxTree::new(meta, blocks)
+        } else {
+            event!(Level::TRACE, "No YAML to be found");
+            let blocks = parse_blocks(markdown);
+            AbstractSyntaxTree::new(Map::new(), blocks)
+        };
+        event!(Level::TRACE, "Parsing markdown: OK");
+        Ok(ast)
+    }
+}
+
+// Extract a YAML metadata block using a given regex.
+//
+// The regex is expected to define the named groups `yaml` and `text`.
+// On success the return value is `(yaml, text)`, both borrowed from
+// `markdown`. `None` is returned when the regex does not match or when
+// either group did not take part in the match.
+fn get_yaml<'a>(pat: &Regex, markdown: &'a str) -> Option<(&'a str, &'a str)> {
+    match pat.captures(markdown) {
+        Some(c) => {
+            event!(Level::TRACE, "YAML regex matches");
+            if let (Some(yaml), Some(text)) = (c.name("yaml"), c.name("text")) {
+                event!(Level::TRACE, "YAML regex captures YAML and text");
+                // A match already knows which part of the input it covers.
+                return Some((yaml.as_str(), text.as_str()));
+            }
+            event!(Level::TRACE, ?c, "YAML regex fails to capture YAML");
+        }
+        None => {
+            event!(Level::TRACE, ?pat, "YAML regex does not match");
+        }
+    }
+    None
+}
+
+// Parse Markdown into a sequence of Blocks.
+//
+// Only paragraphs, headings, and code blocks become blocks. Text,
+// inline code, emphasis, strong emphasis, and strikethrough become
+// inline elements of the block they are in. Everything else is
+// ignored, as it's not needed for codegen.
+fn parse_blocks(markdown: &str) -> Vec<Block> {
+    event!(Level::TRACE, "Parsing blocks");
+
+    // Define the Markdown parser.
+    let mut options = Options::empty();
+    options.insert(Options::ENABLE_TABLES);
+    options.insert(Options::ENABLE_FOOTNOTES);
+    options.insert(Options::ENABLE_STRIKETHROUGH);
+    options.insert(Options::ENABLE_TASKLISTS);
+    options.insert(Options::ENABLE_SMART_PUNCTUATION);
+    let parser = Parser::new_ext(markdown, options);
+
+    // The sequence of blocks that represents the parsed document.
+    let mut blocks = vec![];
+
+    // The current set of inline elements we've collected. This gets
+    // emptied whenever we finish a block.
+    let mut inlines: Vec<Inline> = vec![];
+
+    // For every emphasis-like span that is currently open, the index
+    // in `inlines` at which its content starts. Needed so that the end
+    // of a span wraps only what is inside the span, and so that nested
+    // spans work.
+    let mut span_starts: Vec<usize> = vec![];
+
+    for event in parser {
+        event!(Level::TRACE, ?event);
+        match event {
+            // We ignore these for now. They're not needed for codegen.
+            Event::Html(_)
+            | Event::FootnoteReference(_)
+            | Event::SoftBreak
+            | Event::HardBreak
+            | Event::Rule
+            | Event::TaskListMarker(_) => (),
+
+            // Inline text of various kinds.
+            Event::Text(text) => inlines.push(inline_text(&text)),
+            Event::Code(text) => inlines.push(inline_code(&text)),
+
+            // Start of a block or inline.
+            Event::Start(tag) => match tag {
+                // Remember where the content of the span begins.
+                Tag::Emphasis | Tag::Strong | Tag::Strikethrough => {
+                    span_starts.push(inlines.len())
+                }
+
+                // Inline elements still around at this point come from
+                // something we don't handle, such as a list item or a
+                // table cell. Drop them so they don't end up in this
+                // block.
+                Tag::Paragraph | Tag::Heading(_) | Tag::CodeBlock(_) => {
+                    inlines.clear();
+                    span_starts.clear();
+                }
+
+                // We don't handle anything else yet.
+                _ => (),
+            },
+
+            // End of a block or inline.
+            Event::End(tag) => match tag {
+                // Wrap the content of the span in a single inline
+                // element, leaving what came before the span alone.
+                Tag::Emphasis | Tag::Strong | Tag::Strikethrough => {
+                    let start = span_starts.pop().unwrap_or(0).min(inlines.len());
+                    let mut content = inlines.split_off(start);
+                    inline_from_inlines(&tag, &mut content);
+                    inlines.append(&mut content);
+                }
+                Tag::Paragraph => blocks.push(paragraph(&mut inlines)),
+                Tag::Heading(level) => blocks.push(heading(level as i64, &mut inlines)),
+                Tag::CodeBlock(kind) => blocks.push(code_block(&kind, &mut inlines)),
+
+                // We don't handle anything else yet.
+                _ => (),
+            },
+        }
+    }
+
+    // Inline elements may remain here if the document ends with
+    // something we don't handle. They are not part of any block and
+    // are dropped.
+
+    event!(Level::TRACE, "Parsing blocks: OK");
+    blocks
+}
+
+// Turn a piece of text into a plain string inline element.
+fn inline_text(text: &str) -> Inline {
+    let owned = String::from(text);
+    Inline::Str(owned)
+}
+
+// Turn a piece of text into an inline code element with empty
+// attributes: no id, no classes, no key/value pairs.
+fn inline_code(text: &str) -> Inline {
+    let no_attrs = (String::new(), Vec::new(), Vec::new());
+    Inline::Code(no_attrs, String::from(text))
+}
+
+// Build a paragraph from the collected inline elements. The caller's
+// vector is left empty.
+fn paragraph(inlines: &mut Vec<Inline>) -> Block {
+    let content: Vec<Inline> = inlines.drain(..).collect();
+    Block::Para(content)
+}
+
+// Build a heading of the given level, with empty attributes, from the
+// collected inline elements. The caller's vector is left empty.
+fn heading(level: i64, inlines: &mut Vec<Inline>) -> Block {
+    let content: Vec<Inline> = inlines.drain(..).collect();
+    let no_attrs = (String::new(), Vec::new(), Vec::new());
+    Block::Header(level, no_attrs, content)
+}
+
+// Build a code block from the collected inline elements.
+//
+// The attributes come from the info string of a fenced code block; any
+// other kind of code block is treated as having an empty info string.
+// The content is the concatenated plain text of the inline elements.
+// The caller's vector is left empty.
+fn code_block(kind: &CodeBlockKind, inlines: &mut Vec<Inline>) -> Block {
+    event!(Level::TRACE, ?kind, "code block");
+    let attr = match kind {
+        CodeBlockKind::Fenced(lang) => {
+            event!(Level::TRACE, ?lang, "fenced code block");
+            parse_code_block_attrs(lang)
+        }
+        _ => {
+            event!(Level::TRACE, "indented code block");
+            parse_code_block_attrs("")
+        }
+    };
+    event!(Level::TRACE, ?attr, "code block attrs");
+    let code: String = inlines.drain(..).map(plain_text_inline).collect();
+    Block::CodeBlock(attr, code)
+}
+
+// Flatten an inline element into its plain text.
+//
+// Strings and inline code yield their text. Emphasis, strong emphasis,
+// and strikethrough yield the concatenated text of their content, so
+// every kind of inline element this module creates is covered.
+//
+// Panics on any other kind of inline element.
+fn plain_text_inline(inline: Inline) -> String {
+    match inline {
+        Inline::Str(text) => text,
+        Inline::Code(_, text) => text,
+        Inline::Emph(inlines) | Inline::Strong(inlines) | Inline::Strikeout(inlines) => {
+            inlines.into_iter().map(plain_text_inline).collect()
+        }
+        _ => panic!("not text in code block: {:?}", inline),
+    }
+}
+
+// Parse the info string of a code block into Pandoc attributes.
+//
+// An info string wrapped in braces is split on ASCII whitespace:
+// `#word` sets the id (the last one wins), `.word` adds a class, and
+// `key=value` adds a key/value pair. Other words are ignored. An info
+// string without braces is used as the id as it is.
+fn parse_code_block_attrs(attrs: &str) -> Attr {
+    event!(Level::TRACE, ?attrs, "parsing code block attrs");
+
+    let braced = attrs
+        .strip_prefix('{')
+        .and_then(|rest| rest.strip_suffix('}'));
+    let inner = match braced {
+        Some(inner) => inner,
+        None => return (attrs.to_string(), vec![], vec![]),
+    };
+
+    let mut id = String::new();
+    let mut classes = Vec::new();
+    let mut keyvalues = Vec::new();
+    for token in inner.split_ascii_whitespace() {
+        if let Some(name) = token.strip_prefix('#') {
+            id = name.to_owned();
+        } else if let Some(class) = token.strip_prefix('.') {
+            classes.push(class.to_owned());
+        } else if let Some((key, value)) = token.split_once('=') {
+            keyvalues.push((key.to_owned(), value.to_owned()));
+        }
+    }
+    (id, classes, keyvalues)
+}
+
+// Replace all the given inline elements with a single element that
+// contains them: emphasis, strong emphasis, or strikethrough, depending
+// on the tag. Must not be called with any other tag.
+fn inline_from_inlines(tag: &Tag, inlines: &mut Vec<Inline>) {
+    // Move the elements out; this leaves the vector empty.
+    let children = std::mem::take(inlines);
+
+    let wrapped = match tag {
+        Tag::Emphasis => Inline::Emph(children),
+        Tag::Strong => Inline::Strong(children),
+        Tag::Strikethrough => Inline::Strikeout(children),
+        _ => unreachable!(),
+    };
+
+    inlines.push(wrapped);
+}
+
+/// Errors from Markdown parsing.
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+ /// A regular expression error, reported as it is.
+ // NOTE(review): nothing visible in this file creates this variant;
+ // the YAML patterns are compiled with `unwrap()`. Confirm whether it
+ // is still needed.
+ #[error(transparent)]
+ Regex(#[from] regex::Error),
+
+ /// The YAML metadata block could not be deserialized. The error
+ /// from the YAML parser is reported as it is.
+ #[error(transparent)]
+ Yaml(#[from] serde_yaml::Error),
+}
+
+// Document metadata.
+//
+// This is expressed in the Markdown input file as an embedded YAML
+// block.
+//
+// Note that this structure needs to be able to capture any metadata
+// block we can work with, in any input file. By being strict here we
+// make it easier to tell the user when a metadata block has, say, a
+// misspelled field.
+//
+// Because of `deny_unknown_fields`, a YAML block with a field that is
+// not listed here fails to deserialize.
+#[derive(Debug, Default, Deserialize)]
+#[serde(deny_unknown_fields)]
+struct Metadata {
+ // The only field that is not optional.
+ title: String,
+ subtitle: Option<String>,
+ author: Option<String>,
+ date: Option<String>,
+ // The list-valued fields below must be YAML lists; a single string
+ // value does not deserialize into a `Vec`.
+ classes: Option<Vec<String>>,
+ template: Option<String>,
+ bibliography: Option<Vec<PathBuf>>,
+ bindings: Option<Vec<PathBuf>>,
+ functions: Option<Vec<PathBuf>>,
+}
+
+impl Metadata {
+    // Parse the text of a YAML metadata block.
+    //
+    // Fails if the text is not valid YAML or does not fit the fields
+    // of `Metadata`.
+    fn new(yaml_text: &str) -> Result<Self, Error> {
+        event!(Level::TRACE, "Parsing YAML");
+        let meta: Self = serde_yaml::from_str(yaml_text)?;
+        Ok(meta)
+    }
+
+    // Convert the metadata into a Pandoc metadata map.
+    //
+    // Every field is stored under its own name. The title is always
+    // present; the other fields are present only if they were set.
+    // Strings become `MetaString` values and lists become `MetaList`
+    // values of `MetaString`.
+    fn to_map(&self) -> Map<String, MetaValue> {
+        event!(Level::TRACE, "Creating metadata map from parsed YAML");
+        let mut map: Map<String, MetaValue> = Map::new();
+        map.insert(s("title"), meta_string(&self.title));
+        if let Some(v) = &self.subtitle {
+            map.insert(s("subtitle"), meta_string(v));
+        }
+        if let Some(v) = &self.author {
+            map.insert(s("author"), meta_string(v));
+        }
+        if let Some(v) = &self.date {
+            map.insert(s("date"), meta_string(v));
+        }
+        if let Some(v) = &self.classes {
+            map.insert(s("classes"), meta_strings(v));
+        }
+        if let Some(v) = &self.template {
+            map.insert(s("template"), meta_string(v));
+        }
+        if let Some(v) = &self.bibliography {
+            // The key is the name of the YAML field, like for all the
+            // other fields. It's also the name Pandoc uses.
+            map.insert(s("bibliography"), meta_path_bufs(v));
+        }
+        if let Some(v) = &self.bindings {
+            map.insert(s("bindings"), meta_path_bufs(v));
+        }
+        if let Some(v) = &self.functions {
+            map.insert(s("functions"), meta_path_bufs(v));
+        }
+        event!(
+            Level::TRACE,
+            ?self,
+            ?map,
+            "Created metadata map from parsed YAML"
+        );
+        map
+    }
+}
+
+// Shorthand for making an owned string out of a string slice.
+fn s(s: &str) -> String {
+    String::from(s)
+}
+
+// Wrap a copy of a string in a metadata string value.
+fn meta_string(s: &str) -> MetaValue {
+    let owned = String::from(s);
+    MetaValue::MetaString(owned)
+}
+
+// Turn a list of strings into a metadata list of string values, in the
+// same order.
+fn meta_strings(v: &[String]) -> MetaValue {
+    let mut values = Vec::with_capacity(v.len());
+    for item in v {
+        values.push(meta_string(item));
+    }
+    MetaValue::MetaList(values)
+}
+
+// Turn a path into a metadata string value, using the path's display
+// form as the text.
+fn meta_path_buf(p: &Path) -> MetaValue {
+    let shown = p.display().to_string();
+    meta_string(shown.as_str())
+}
+
+// Turn a list of paths into a metadata list of string values, in the
+// same order.
+fn meta_path_bufs(v: &[PathBuf]) -> MetaValue {
+    let mut values = Vec::with_capacity(v.len());
+    for path in v {
+        values.push(meta_path_buf(path));
+    }
+    MetaValue::MetaList(values)
+}
+
+#[cfg(test)]
+mod test {
+ use super::{parse_code_block_attrs, AbstractSyntaxTree, Metadata};
+ use super::{Block, Inline};
+ use std::path::PathBuf;
+ use std::str::FromStr;
+
+ // An info string without braces becomes the id. An info string in
+ // braces is split into id, classes, and key/value pairs.
+ #[test]
+ fn code_block_attrs() {
+ assert_eq!(
+ parse_code_block_attrs("foo"),
+ ("foo".to_string(), vec![], vec![])
+ );
+ assert_eq!(
+ parse_code_block_attrs("{#foo}"),
+ ("foo".to_string(), vec![], vec![])
+ );
+ assert_eq!(
+ parse_code_block_attrs("{#foo .file bar=yo}"),
+ (
+ "foo".to_string(),
+ vec!["file".to_string()],
+ vec![("bar".to_string(), "yo".to_string())]
+ )
+ );
+ }
+
+ // Empty input gives a document without blocks and without metadata,
+ // but with an API version.
+ #[test]
+ fn empty_input() {
+ let ast = AbstractSyntaxTree::from_str("").unwrap();
+ let doc = ast.to_pandoc();
+ assert!(doc.blocks.is_empty());
+ assert!(doc.meta.is_empty());
+ assert!(!doc.pandoc_api_version.is_empty());
+ }
+
+ // A heading followed by a paragraph, without any metadata.
+ #[test]
+ fn simple() {
+ let ast = AbstractSyntaxTree::from_str(
+ "\
+ # Introduction \n\
+ \n\
+ First paragraph.\n\
+ ",
+ )
+ .unwrap();
+ let doc = ast.to_pandoc();
+ assert!(doc.meta.is_empty());
+ assert!(!doc.pandoc_api_version.is_empty());
+
+ let attr = ("".to_string(), vec![], vec![]);
+ let h = Block::Header(1, attr, vec![Inline::Str("Introduction".to_string())]);
+ let para = Block::Para(vec![Inline::Str("First paragraph.".to_string())]);
+ assert_eq!(doc.blocks, &[h, para]);
+ }
+
+ // Metadata block at the top of the input, after empty lines. Only
+ // the metadata keys are checked, not the blocks.
+ #[test]
+ fn parses_leading_meta() {
+ let markdown = "\n\n---\ntitle: Foo Bar\n...\nfoobar\n";
+ let ast = AbstractSyntaxTree::from_str(markdown).unwrap();
+ let doc = ast.to_pandoc();
+ let keys: Vec<String> = doc.meta.keys().cloned().collect();
+ assert_eq!(keys, ["title"]);
+ }
+
+ // Metadata block at the bottom of the input, before empty lines.
+ // Only the metadata keys are checked, not the blocks.
+ #[test]
+ fn parses_trailing_meta() {
+ let markdown = "foobar\n---\ntitle: Foo Bar\n...\n\n\n";
+ let ast = AbstractSyntaxTree::from_str(markdown).unwrap();
+ let doc = ast.to_pandoc();
+ let keys: Vec<String> = doc.meta.keys().cloned().collect();
+ assert_eq!(keys, ["title"]);
+ }
+
+ // Deserialize a metadata block that sets the title, date, classes,
+ // template, bibliography, bindings, and functions fields, and check
+ // every one of them.
+ #[test]
+ fn full_meta() {
+ let meta = Metadata::new(
+ "\
+title: Foo Bar
+date: today
+classes: [json, text]
+template: rust
+bibliography:
+- foo.bib
+- bar.bib
+bindings:
+- foo.yaml
+- bar.yaml
+functions:
+- foo.py
+- bar.py
+",
+ )
+ .unwrap();
+ assert_eq!(meta.title, "Foo Bar");
+ assert_eq!(meta.date.unwrap(), "today");
+ assert_eq!(meta.classes.unwrap(), &["json", "text"]);
+ assert_eq!(meta.template.unwrap(), "rust");
+ assert_eq!(
+ meta.bibliography.unwrap(),
+ &[path("foo.bib"), path("bar.bib")]
+ );
+ assert_eq!(
+ meta.bindings.unwrap(),
+ &[path("foo.yaml"), path("bar.yaml")]
+ );
+ assert_eq!(meta.functions.unwrap(), &[path("foo.py"), path("bar.py")]);
+ }
+
+ // Shorthand for making a `PathBuf` out of a string slice.
+ fn path(s: &str) -> PathBuf {
+ PathBuf::from(s)
+ }
+}