summaryrefslogtreecommitdiff
path: root/src/ast.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/ast.rs')
-rw-r--r--src/ast.rs484
1 files changed, 0 insertions, 484 deletions
diff --git a/src/ast.rs b/src/ast.rs
deleted file mode 100644
index eb10efc..0000000
--- a/src/ast.rs
+++ /dev/null
@@ -1,484 +0,0 @@
-use lazy_static::lazy_static;
-use log::trace;
-use pandoc_ast::{Attr, Block, Inline, Map, MetaValue, Pandoc};
-use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag};
-use regex::Regex;
-use serde::Deserialize;
-use std::collections::BTreeMap;
-use std::path::{Path, PathBuf};
-
-lazy_static! {
- // Pattern that recognises a YAML block at the beginning of a file.
- static ref LEADING_YAML_PATTERN: Regex = Regex::new(r"^(?:\S*\n)*(?P<yaml>-{3,}\n([^.].*\n)*\.{3,}\n)(?P<text>(.*\n)*)$").unwrap();
-
-
- // Pattern that recognises a YAML block at the end of a file.
- static ref TRAILING_YAML_PATTERN: Regex = Regex::new(r"(?P<text>(.*\n)*)\n*(?P<yaml>-{3,}\n([^.].*\n)*\.{3,}\n)(?:\S*\n)*$").unwrap();
-}
-
-/// An abstract syntax tree representation of a Markdown file.
-///
-/// This represents a Markdown file as an abstract syntax tree
-/// compatible with Pandoc's AST. The document YAML metadata MUST be
-/// at the top or bottom of the file, excluding leading or trailing
-/// empty lines.
-#[derive(Debug)]
-pub struct AbstractSyntaxTree {
- blocks: Vec<Block>,
- meta: Map<String, MetaValue>,
-}
-
-impl AbstractSyntaxTree {
- // Create a new AST.
- //
- // Note that this is not public.
- fn new(meta: Map<String, MetaValue>, blocks: Vec<Block>) -> Self {
- Self { blocks, meta }
- }
-
- /// Return a Pandoc-compatible AST.
- pub fn to_pandoc(&self) -> Pandoc {
- Pandoc {
- meta: self.meta.clone(),
- blocks: self.blocks.clone(),
- pandoc_api_version: vec![1, 20],
- }
- }
-}
-
-impl std::str::FromStr for AbstractSyntaxTree {
- type Err = Error;
-
- /// Create an abstract syntax tree from a string.
- fn from_str(markdown: &str) -> Result<Self, Self::Err> {
- trace!("Parsing markdown");
- let ast = if let Some((yaml, markdown)) = get_yaml(&LEADING_YAML_PATTERN, markdown) {
- trace!("Found leading YAML: {:?}", yaml);
- let meta = Metadata::new(yaml)?.to_map();
- let blocks = parse_blocks(markdown);
- AbstractSyntaxTree::new(meta, blocks)
- } else if let Some((yaml, _markdown)) = get_yaml(&TRAILING_YAML_PATTERN, markdown) {
- trace!("Found trailing YAML: {:?}", yaml);
- let meta = Metadata::new(yaml)?.to_map();
- let blocks = parse_blocks(markdown);
- AbstractSyntaxTree::new(meta, blocks)
- } else {
- trace!("No YAML to be found");
- let blocks = parse_blocks(markdown);
- AbstractSyntaxTree::new(Map::new(), blocks)
- };
- trace!("Parsing markdown: OK");
- Ok(ast)
- }
-}
-
-// Extract a YAML metadata block using a given regex.
-fn get_yaml<'a>(pat: &Regex, markdown: &'a str) -> Option<(&'a str, &'a str)> {
- trace!("Markdown: {:?}", markdown);
- if let Some(c) = pat.captures(markdown) {
- trace!("YAML regex matches: {:?}", c);
- let yaml = c.name("yaml");
- let text = c.name("text");
- trace!("YAML metadata: {:?}", yaml);
- trace!("markdown: {:?}", text);
- if yaml.is_some() && text.is_some() {
- trace!("YAML regex captures YAML and text");
- let yaml = yaml?;
- let text = text?;
- let yaml = &markdown[yaml.start()..yaml.end()];
- let text = &markdown[text.start()..text.end()];
- assert!(yaml.starts_with("---"));
- assert!(yaml.ends_with("...\n"));
- return Some((yaml, text));
- } else {
- trace!("YAML regex fails to capture YAML");
- }
- } else {
- trace!("YAML regex does not match");
- }
- None
-}
-
-// Parse Markdown into a sequence of Blocks.
-fn parse_blocks(markdown: &str) -> Vec<Block> {
- trace!("Parsing blocks");
-
- // Define the Markdown parser.
- let mut options = Options::empty();
- options.insert(Options::ENABLE_TABLES);
- options.insert(Options::ENABLE_FOOTNOTES);
- options.insert(Options::ENABLE_STRIKETHROUGH);
- options.insert(Options::ENABLE_TASKLISTS);
- options.insert(Options::ENABLE_SMART_PUNCTUATION);
- let parser = Parser::new_ext(markdown, options);
-
- // The sequence of blocks that represents the parsed document.
- let mut blocks = vec![];
-
- // The current set of inline elements we've collected. This gets
- // emptied whenever we finish a block.
- let mut inlines: Vec<Inline> = vec![];
-
- for event in parser {
- trace!("Parsing event: {:?}", event);
- match event {
- // We ignore these for now. They're not needed for codegen.
- Event::Html(_)
- | Event::FootnoteReference(_)
- | Event::SoftBreak
- | Event::HardBreak
- | Event::Rule
- | Event::TaskListMarker(_) => (),
-
- // Inline text of various kinds.
- Event::Text(text) => inlines.push(inline_text(&text)),
- Event::Code(text) => inlines.push(inline_code(&text)),
-
- // We only handle the end events.
- Event::Start(_) => (),
-
- // End of a block or inline.
- Event::End(tag) => match tag {
- // Collect inline elements for later inclusion in a block.
- Tag::Emphasis | Tag::Strong | Tag::Strikethrough => {
- inline_from_inlines(&tag, &mut inlines)
- }
- Tag::Paragraph => blocks.push(paragraph(&mut inlines)),
- Tag::Heading(level, _fragment, _classes) => {
- blocks.push(heading(level as i64, &mut inlines))
- }
- Tag::CodeBlock(kind) => blocks.push(code_block(&kind, &mut inlines)),
- Tag::Image(_link, dest, title) => blocks.push(image_block(&dest, &title)),
- // We don't handle anything else yet.
- _ => (),
- },
- }
- }
-
- // We MUST have emptied all inline elements.
- // assert!(inlines.is_empty());
-
- trace!("Parsing blocks: OK");
- blocks
-}
-
-fn inline_text(text: &str) -> Inline {
- Inline::Str(text.to_string())
-}
-
-fn inline_code(text: &str) -> Inline {
- let attr = ("".to_string(), vec![], vec![]);
- Inline::Code(attr, text.to_string())
-}
-
-fn paragraph(inlines: &mut Vec<Inline>) -> Block {
- Block::Para(std::mem::take(inlines))
-}
-
-fn heading(level: i64, inlines: &mut Vec<Inline>) -> Block {
- let attr = ("".to_string(), vec![], vec![]);
- Block::Header(level, attr, std::mem::take(inlines))
-}
-
-fn image_block(dest: &str, title: &str) -> Block {
- let attr = ("".to_string(), vec![], vec![]);
- Block::Para(vec![Inline::Image(
- attr,
- vec![],
- (dest.to_string(), title.to_string()),
- )])
-}
-
-fn code_block(kind: &CodeBlockKind, inlines: &mut Vec<Inline>) -> Block {
- trace!("code block: {:?}", kind);
- let attr = if let CodeBlockKind::Fenced(lang) = kind {
- trace!("fenced code block, lang={:?}", lang);
- parse_code_block_attrs(lang)
- } else {
- trace!("indented code block");
- parse_code_block_attrs("")
- };
- trace!("code block attrs: {:?}", attr);
- let mut code = String::new();
- for inline in inlines.drain(0..) {
- let text = plain_text_inline(inline);
- code.push_str(&text);
- }
- // pulldown_cmark and pandoc differ in their codeblock handling,
- // pulldown_cmark has an extra newline which we trim for now to be
- // compatible with pandoc's parsing
- if !code.is_empty() {
- assert_eq!(code.pop(), Some('\n'));
- }
- Block::CodeBlock(attr, code)
-}
-
-fn plain_text_inline(inline: Inline) -> String {
- match inline {
- Inline::Str(text) => text,
- Inline::Code(_, text) => text,
- Inline::Emph(inlines) => {
- let mut text = String::new();
- for inline in inlines {
- text.push_str(&plain_text_inline(inline));
- }
- text
- }
- _ => panic!("not text in code block: {:?}", inline),
- }
-}
-
-fn parse_code_block_attrs(attrs: &str) -> Attr {
- trace!("parsing code block attrs: {:?}", attrs);
- let mut id = "".to_string();
- let mut classes = vec![];
- let mut keyvalues = vec![];
- if attrs.starts_with('{') && attrs.ends_with('}') {
- let attrs = &attrs[1..attrs.len() - 1];
- for word in attrs.split_ascii_whitespace() {
- if let Some(x) = word.strip_prefix('#') {
- id = x.to_string();
- } else if let Some(x) = word.strip_prefix('.') {
- classes.push(x.to_string());
- } else if let Some(i) = word.find('=') {
- let k = &word[..i];
- let v = &word[i + 1..];
- keyvalues.push((k.to_string(), v.to_string()));
- }
- }
- } else if !attrs.is_empty() {
- classes.push(attrs.to_string());
- }
- (id, classes, keyvalues)
-}
-
-fn inline_from_inlines(tag: &Tag, inlines: &mut Vec<Inline>) {
- let new_inlines = inlines.clone();
- inlines.clear();
-
- let inline = match tag {
- Tag::Emphasis => Inline::Emph(new_inlines),
- Tag::Strong => Inline::Strong(new_inlines),
- Tag::Strikethrough => Inline::Strikeout(new_inlines),
- _ => unreachable!(),
- };
-
- inlines.push(inline);
-}
-
-/// Errors from Markdown parsing.
-#[derive(Debug, thiserror::Error)]
-pub enum Error {
- #[error(transparent)]
- Regex(#[from] regex::Error),
-
- #[error(transparent)]
- Yaml(#[from] serde_yaml::Error),
-}
-
-// Document metadata.
-//
-// This is expressed in the Markdown input file as an embedded YAML
-// block.
-//
-// Note that this structure needs to be able to capture any metadata
-// block we can work with, in any input file. By being strict here we
-// make it easier to tell the user when a metadata block has, say, a
-// misspelled field.
-#[derive(Debug, Default, Deserialize)]
-#[serde(deny_unknown_fields)]
-struct Metadata {
- title: String,
- subtitle: Option<String>,
- author: Option<String>,
- date: Option<String>,
- classes: Option<Vec<String>>,
- bibliography: Option<Vec<PathBuf>>,
- bindings: Option<Vec<PathBuf>>,
- documentclass: Option<String>,
- #[serde(default)]
- impls: BTreeMap<String, Vec<PathBuf>>,
-}
-
-impl Metadata {
- fn new(yaml_text: &str) -> Result<Self, Error> {
- trace!("Parsing YAML");
- let meta: Self = serde_yaml::from_str(yaml_text)?;
- Ok(meta)
- }
-
- fn to_map(&self) -> Map<String, MetaValue> {
- trace!("Creating metadata map from parsed YAML");
- let mut map: Map<String, MetaValue> = Map::new();
- map.insert(s("title"), meta_string(&self.title));
- if let Some(v) = &self.subtitle {
- map.insert(s("subtitle"), meta_string(v));
- }
- if let Some(v) = &self.author {
- map.insert(s("author"), meta_string(v));
- }
- if let Some(v) = &self.date {
- map.insert(s("date"), meta_string(v));
- }
- if let Some(v) = &self.classes {
- map.insert(s("classes"), meta_strings(v));
- }
- if !self.impls.is_empty() {
- let impls = self
- .impls
- .iter()
- .map(|(k, v)| (k.to_owned(), Box::new(meta_path_bufs(v))))
- .collect();
- map.insert(s("impls"), MetaValue::MetaMap(impls));
- }
- if let Some(v) = &self.bibliography {
- map.insert(s("bibliography"), meta_path_bufs(v));
- }
- if let Some(v) = &self.bindings {
- map.insert(s("bindings"), meta_path_bufs(v));
- }
- if let Some(v) = &self.documentclass {
- map.insert(s("documentclass"), meta_string(v));
- }
- trace!("Created metadata map from parsed YAML");
- map
- }
-}
-
-fn s(s: &str) -> String {
- s.to_string()
-}
-
-fn meta_string(s: &str) -> MetaValue {
- MetaValue::MetaString(s.to_string())
-}
-
-fn meta_strings(v: &[String]) -> MetaValue {
- MetaValue::MetaList(v.iter().map(|s| meta_string(s)).collect())
-}
-
-fn meta_path_buf(p: &Path) -> MetaValue {
- meta_string(&p.display().to_string())
-}
-
-fn meta_path_bufs(v: &[PathBuf]) -> MetaValue {
- MetaValue::MetaList(v.iter().map(|p| meta_path_buf(p)).collect())
-}
-
-#[cfg(test)]
-mod test {
- use super::{parse_code_block_attrs, AbstractSyntaxTree, Metadata};
- use super::{Block, Inline};
- use std::path::PathBuf;
- use std::str::FromStr;
-
- #[test]
- fn code_block_attrs() {
- assert_eq!(parse_code_block_attrs(""), ("".to_string(), vec![], vec![]));
- assert_eq!(
- parse_code_block_attrs("foo"),
- ("".to_string(), vec!["foo".to_string()], vec![])
- );
- assert_eq!(
- parse_code_block_attrs("{#foo}"),
- ("foo".to_string(), vec![], vec![])
- );
- assert_eq!(
- parse_code_block_attrs("{#foo .file bar=yo}"),
- (
- "foo".to_string(),
- vec!["file".to_string()],
- vec![("bar".to_string(), "yo".to_string())]
- )
- );
- }
-
- #[test]
- fn empty_input() {
- let ast = AbstractSyntaxTree::from_str("").unwrap();
- let doc = ast.to_pandoc();
- assert!(doc.blocks.is_empty());
- assert!(doc.meta.is_empty());
- assert!(!doc.pandoc_api_version.is_empty());
- }
-
- #[test]
- fn simple() {
- let ast = AbstractSyntaxTree::from_str(
- "\
- # Introduction \n\
- \n\
- First paragraph.\n\
- ",
- )
- .unwrap();
- let doc = ast.to_pandoc();
- assert!(doc.meta.is_empty());
- assert!(!doc.pandoc_api_version.is_empty());
-
- let attr = ("".to_string(), vec![], vec![]);
- let h = Block::Header(1, attr, vec![Inline::Str("Introduction".to_string())]);
- let para = Block::Para(vec![Inline::Str("First paragraph.".to_string())]);
- assert_eq!(doc.blocks, &[h, para]);
- }
-
- #[test]
- fn parses_leading_meta() {
- let markdown = "\n\n---\ntitle: Foo Bar\n...\nfoobar\n";
- let ast = AbstractSyntaxTree::from_str(markdown).unwrap();
- let doc = ast.to_pandoc();
- let keys: Vec<String> = doc.meta.keys().cloned().collect();
- assert_eq!(keys, ["title"]);
- }
-
- #[test]
- fn parses_trailing_meta() {
- let markdown = "foobar\n---\ntitle: Foo Bar\n...\n\n\n";
- let ast = AbstractSyntaxTree::from_str(markdown).unwrap();
- let doc = ast.to_pandoc();
- let keys: Vec<String> = doc.meta.keys().cloned().collect();
- assert_eq!(keys, ["title"]);
- }
-
- #[test]
- fn full_meta() {
- let meta = Metadata::new(
- "\
-title: Foo Bar
-date: today
-classes: [json, text]
-impls:
- python:
- - foo.py
- - bar.py
-bibliography:
-- foo.bib
-- bar.bib
-bindings:
-- foo.yaml
-- bar.yaml
-",
- )
- .unwrap();
- assert_eq!(meta.title, "Foo Bar");
- assert_eq!(meta.date.unwrap(), "today");
- assert_eq!(meta.classes.unwrap(), &["json", "text"]);
- assert_eq!(
- meta.bibliography.unwrap(),
- &[path("foo.bib"), path("bar.bib")]
- );
- assert_eq!(
- meta.bindings.unwrap(),
- &[path("foo.yaml"), path("bar.yaml")]
- );
- assert!(!meta.impls.is_empty());
- for (k, v) in meta.impls.iter() {
- assert_eq!(k, "python");
- assert_eq!(v, &[path("foo.py"), path("bar.py")]);
- }
- }
-
- fn path(s: &str) -> PathBuf {
- PathBuf::from(s)
- }
-}