diff options
Diffstat (limited to 'src/ast.rs')
-rw-r--r-- | src/ast.rs | 484 |
1 files changed, 0 insertions, 484 deletions
diff --git a/src/ast.rs b/src/ast.rs deleted file mode 100644 index eb10efc..0000000 --- a/src/ast.rs +++ /dev/null @@ -1,484 +0,0 @@ -use lazy_static::lazy_static; -use log::trace; -use pandoc_ast::{Attr, Block, Inline, Map, MetaValue, Pandoc}; -use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag}; -use regex::Regex; -use serde::Deserialize; -use std::collections::BTreeMap; -use std::path::{Path, PathBuf}; - -lazy_static! { - // Pattern that recognises a YAML block at the beginning of a file. - static ref LEADING_YAML_PATTERN: Regex = Regex::new(r"^(?:\S*\n)*(?P<yaml>-{3,}\n([^.].*\n)*\.{3,}\n)(?P<text>(.*\n)*)$").unwrap(); - - - // Pattern that recognises a YAML block at the end of a file. - static ref TRAILING_YAML_PATTERN: Regex = Regex::new(r"(?P<text>(.*\n)*)\n*(?P<yaml>-{3,}\n([^.].*\n)*\.{3,}\n)(?:\S*\n)*$").unwrap(); -} - -/// An abstract syntax tree representation of a Markdown file. -/// -/// This represents a Markdown file as an abstract syntax tree -/// compatible with Pandoc's AST. The document YAML metadata MUST be -/// at the top or bottom of the file, excluding leading or trailing -/// empty lines. -#[derive(Debug)] -pub struct AbstractSyntaxTree { - blocks: Vec<Block>, - meta: Map<String, MetaValue>, -} - -impl AbstractSyntaxTree { - // Create a new AST. - // - // Note that this is not public. - fn new(meta: Map<String, MetaValue>, blocks: Vec<Block>) -> Self { - Self { blocks, meta } - } - - /// Return a Pandoc-compatible AST. - pub fn to_pandoc(&self) -> Pandoc { - Pandoc { - meta: self.meta.clone(), - blocks: self.blocks.clone(), - pandoc_api_version: vec![1, 20], - } - } -} - -impl std::str::FromStr for AbstractSyntaxTree { - type Err = Error; - - /// Create an abstract syntax tree from a string. - fn from_str(markdown: &str) -> Result<Self, Self::Err> { - trace!("Parsing markdown"); - let ast = if let Some((yaml, markdown)) = get_yaml(&LEADING_YAML_PATTERN, markdown) { - trace!("Found leading YAML: {:?}", yaml); - let meta = Metadata::new(yaml)?.to_map(); - let blocks = parse_blocks(markdown); - AbstractSyntaxTree::new(meta, blocks) - } else if let Some((yaml, _markdown)) = get_yaml(&TRAILING_YAML_PATTERN, markdown) { - trace!("Found trailing YAML: {:?}", yaml); - let meta = Metadata::new(yaml)?.to_map(); - let blocks = parse_blocks(markdown); - AbstractSyntaxTree::new(meta, blocks) - } else { - trace!("No YAML to be found"); - let blocks = parse_blocks(markdown); - AbstractSyntaxTree::new(Map::new(), blocks) - }; - trace!("Parsing markdown: OK"); - Ok(ast) - } -} - -// Extract a YAML metadata block using a given regex. -fn get_yaml<'a>(pat: &Regex, markdown: &'a str) -> Option<(&'a str, &'a str)> { - trace!("Markdown: {:?}", markdown); - if let Some(c) = pat.captures(markdown) { - trace!("YAML regex matches: {:?}", c); - let yaml = c.name("yaml"); - let text = c.name("text"); - trace!("YAML metadata: {:?}", yaml); - trace!("markdown: {:?}", text); - if yaml.is_some() && text.is_some() { - trace!("YAML regex captures YAML and text"); - let yaml = yaml?; - let text = text?; - let yaml = &markdown[yaml.start()..yaml.end()]; - let text = &markdown[text.start()..text.end()]; - assert!(yaml.starts_with("---")); - assert!(yaml.ends_with("...\n")); - return Some((yaml, text)); - } else { - trace!("YAML regex fails to capture YAML"); - } - } else { - trace!("YAML regex does not match"); - } - None -} - -// Parse Markdown into a sequence of Blocks. -fn parse_blocks(markdown: &str) -> Vec<Block> { - trace!("Parsing blocks"); - - // Define the Markdown parser. - let mut options = Options::empty(); - options.insert(Options::ENABLE_TABLES); - options.insert(Options::ENABLE_FOOTNOTES); - options.insert(Options::ENABLE_STRIKETHROUGH); - options.insert(Options::ENABLE_TASKLISTS); - options.insert(Options::ENABLE_SMART_PUNCTUATION); - let parser = Parser::new_ext(markdown, options); - - // The sequence of blocks that represents the parsed document. - let mut blocks = vec![]; - - // The current set of inline elements we've collected. This gets - // emptied whenever we finish a block. - let mut inlines: Vec<Inline> = vec![]; - - for event in parser { - trace!("Parsing event: {:?}", event); - match event { - // We ignore these for now. They're not needed for codegen. - Event::Html(_) - | Event::FootnoteReference(_) - | Event::SoftBreak - | Event::HardBreak - | Event::Rule - | Event::TaskListMarker(_) => (), - - // Inline text of various kinds. - Event::Text(text) => inlines.push(inline_text(&text)), - Event::Code(text) => inlines.push(inline_code(&text)), - - // We only handle the end events. - Event::Start(_) => (), - - // End of a block or inline. - Event::End(tag) => match tag { - // Collect inline elements for later inclusion in a block. - Tag::Emphasis | Tag::Strong | Tag::Strikethrough => { - inline_from_inlines(&tag, &mut inlines) - } - Tag::Paragraph => blocks.push(paragraph(&mut inlines)), - Tag::Heading(level, _fragment, _classes) => { - blocks.push(heading(level as i64, &mut inlines)) - } - Tag::CodeBlock(kind) => blocks.push(code_block(&kind, &mut inlines)), - Tag::Image(_link, dest, title) => blocks.push(image_block(&dest, &title)), - // We don't handle anything else yet. - _ => (), - }, - } - } - - // We MUST have emptied all inline elements. - // assert!(inlines.is_empty()); - - trace!("Parsing blocks: OK"); - blocks -} - -fn inline_text(text: &str) -> Inline { - Inline::Str(text.to_string()) -} - -fn inline_code(text: &str) -> Inline { - let attr = ("".to_string(), vec![], vec![]); - Inline::Code(attr, text.to_string()) -} - -fn paragraph(inlines: &mut Vec<Inline>) -> Block { - Block::Para(std::mem::take(inlines)) -} - -fn heading(level: i64, inlines: &mut Vec<Inline>) -> Block { - let attr = ("".to_string(), vec![], vec![]); - Block::Header(level, attr, std::mem::take(inlines)) -} - -fn image_block(dest: &str, title: &str) -> Block { - let attr = ("".to_string(), vec![], vec![]); - Block::Para(vec![Inline::Image( - attr, - vec![], - (dest.to_string(), title.to_string()), - )]) -} - -fn code_block(kind: &CodeBlockKind, inlines: &mut Vec<Inline>) -> Block { - trace!("code block: {:?}", kind); - let attr = if let CodeBlockKind::Fenced(lang) = kind { - trace!("fenced code block, lang={:?}", lang); - parse_code_block_attrs(lang) - } else { - trace!("indented code block"); - parse_code_block_attrs("") - }; - trace!("code block attrs: {:?}", attr); - let mut code = String::new(); - for inline in inlines.drain(0..) { - let text = plain_text_inline(inline); - code.push_str(&text); - } - // pulldown_cmark and pandoc differ in their codeblock handling, - // pulldown_cmark has an extra newline which we trim for now to be - // compatible with pandoc's parsing - if !code.is_empty() { - assert_eq!(code.pop(), Some('\n')); - } - Block::CodeBlock(attr, code) -} - -fn plain_text_inline(inline: Inline) -> String { - match inline { - Inline::Str(text) => text, - Inline::Code(_, text) => text, - Inline::Emph(inlines) => { - let mut text = String::new(); - for inline in inlines { - text.push_str(&plain_text_inline(inline)); - } - text - } - _ => panic!("not text in code block: {:?}", inline), - } -} - -fn parse_code_block_attrs(attrs: &str) -> Attr { - trace!("parsing code block attrs: {:?}", attrs); - let mut id = "".to_string(); - let mut classes = vec![]; - let mut keyvalues = vec![]; - if attrs.starts_with('{') && attrs.ends_with('}') { - let attrs = &attrs[1..attrs.len() - 1]; - for word in attrs.split_ascii_whitespace() { - if let Some(x) = word.strip_prefix('#') { - id = x.to_string(); - } else if let Some(x) = word.strip_prefix('.') { - classes.push(x.to_string()); - } else if let Some(i) = word.find('=') { - let k = &word[..i]; - let v = &word[i + 1..]; - keyvalues.push((k.to_string(), v.to_string())); - } - } - } else if !attrs.is_empty() { - classes.push(attrs.to_string()); - } - (id, classes, keyvalues) -} - -fn inline_from_inlines(tag: &Tag, inlines: &mut Vec<Inline>) { - let new_inlines = inlines.clone(); - inlines.clear(); - - let inline = match tag { - Tag::Emphasis => Inline::Emph(new_inlines), - Tag::Strong => Inline::Strong(new_inlines), - Tag::Strikethrough => Inline::Strikeout(new_inlines), - _ => unreachable!(), - }; - - inlines.push(inline); -} - -/// Errors from Markdown parsing. -#[derive(Debug, thiserror::Error)] -pub enum Error { - #[error(transparent)] - Regex(#[from] regex::Error), - - #[error(transparent)] - Yaml(#[from] serde_yaml::Error), -} - -// Document metadata. -// -// This is expressed in the Markdown input file as an embedded YAML -// block. -// -// Note that this structure needs to be able to capture any metadata -// block we can work with, in any input file. By being strict here we -// make it easier to tell the user when a metadata block has, say, a -// misspelled field. -#[derive(Debug, Default, Deserialize)] -#[serde(deny_unknown_fields)] -struct Metadata { - title: String, - subtitle: Option<String>, - author: Option<String>, - date: Option<String>, - classes: Option<Vec<String>>, - bibliography: Option<Vec<PathBuf>>, - bindings: Option<Vec<PathBuf>>, - documentclass: Option<String>, - #[serde(default)] - impls: BTreeMap<String, Vec<PathBuf>>, -} - -impl Metadata { - fn new(yaml_text: &str) -> Result<Self, Error> { - trace!("Parsing YAML"); - let meta: Self = serde_yaml::from_str(yaml_text)?; - Ok(meta) - } - - fn to_map(&self) -> Map<String, MetaValue> { - trace!("Creating metadata map from parsed YAML"); - let mut map: Map<String, MetaValue> = Map::new(); - map.insert(s("title"), meta_string(&self.title)); - if let Some(v) = &self.subtitle { - map.insert(s("subtitle"), meta_string(v)); - } - if let Some(v) = &self.author { - map.insert(s("author"), meta_string(v)); - } - if let Some(v) = &self.date { - map.insert(s("date"), meta_string(v)); - } - if let Some(v) = &self.classes { - map.insert(s("classes"), meta_strings(v)); - } - if !self.impls.is_empty() { - let impls = self - .impls - .iter() - .map(|(k, v)| (k.to_owned(), Box::new(meta_path_bufs(v)))) - .collect(); - map.insert(s("impls"), MetaValue::MetaMap(impls)); - } - if let Some(v) = &self.bibliography { - map.insert(s("bibliography"), meta_path_bufs(v)); - } - if let Some(v) = &self.bindings { - map.insert(s("bindings"), meta_path_bufs(v)); - } - if let Some(v) = &self.documentclass { - map.insert(s("documentclass"), meta_string(v)); - } - trace!("Created metadata map from parsed YAML"); - map - } -} - -fn s(s: &str) -> String { - s.to_string() -} - -fn meta_string(s: &str) -> MetaValue { - MetaValue::MetaString(s.to_string()) -} - -fn meta_strings(v: &[String]) -> MetaValue { - MetaValue::MetaList(v.iter().map(|s| meta_string(s)).collect()) -} - -fn meta_path_buf(p: &Path) -> MetaValue { - meta_string(&p.display().to_string()) -} - -fn meta_path_bufs(v: &[PathBuf]) -> MetaValue { - MetaValue::MetaList(v.iter().map(|p| meta_path_buf(p)).collect()) -} - -#[cfg(test)] -mod test { - use super::{parse_code_block_attrs, AbstractSyntaxTree, Metadata}; - use super::{Block, Inline}; - use std::path::PathBuf; - use std::str::FromStr; - - #[test] - fn code_block_attrs() { - assert_eq!(parse_code_block_attrs(""), ("".to_string(), vec![], vec![])); - assert_eq!( - parse_code_block_attrs("foo"), - ("".to_string(), vec!["foo".to_string()], vec![]) - ); - assert_eq!( - parse_code_block_attrs("{#foo}"), - ("foo".to_string(), vec![], vec![]) - ); - assert_eq!( - parse_code_block_attrs("{#foo .file bar=yo}"), - ( - "foo".to_string(), - vec!["file".to_string()], - vec![("bar".to_string(), "yo".to_string())] - ) - ); - } - - #[test] - fn empty_input() { - let ast = AbstractSyntaxTree::from_str("").unwrap(); - let doc = ast.to_pandoc(); - assert!(doc.blocks.is_empty()); - assert!(doc.meta.is_empty()); - assert!(!doc.pandoc_api_version.is_empty()); - } - - #[test] - fn simple() { - let ast = AbstractSyntaxTree::from_str( - "\ - # Introduction \n\ - \n\ - First paragraph.\n\ - ", - ) - .unwrap(); - let doc = ast.to_pandoc(); - assert!(doc.meta.is_empty()); - assert!(!doc.pandoc_api_version.is_empty()); - - let attr = ("".to_string(), vec![], vec![]); - let h = Block::Header(1, attr, vec![Inline::Str("Introduction".to_string())]); - let para = Block::Para(vec![Inline::Str("First paragraph.".to_string())]); - assert_eq!(doc.blocks, &[h, para]); - } - - #[test] - fn parses_leading_meta() { - let markdown = "\n\n---\ntitle: Foo Bar\n...\nfoobar\n"; - let ast = AbstractSyntaxTree::from_str(markdown).unwrap(); - let doc = ast.to_pandoc(); - let keys: Vec<String> = doc.meta.keys().cloned().collect(); - assert_eq!(keys, ["title"]); - } - - #[test] - fn parses_trailing_meta() { - let markdown = "foobar\n---\ntitle: Foo Bar\n...\n\n\n"; - let ast = AbstractSyntaxTree::from_str(markdown).unwrap(); - let doc = ast.to_pandoc(); - let keys: Vec<String> = doc.meta.keys().cloned().collect(); - assert_eq!(keys, ["title"]); - } - - #[test] - fn full_meta() { - let meta = Metadata::new( - "\ -title: Foo Bar -date: today -classes: [json, text] -impls: - python: - - foo.py - - bar.py -bibliography: -- foo.bib -- bar.bib -bindings: -- foo.yaml -- bar.yaml -", - ) - .unwrap(); - assert_eq!(meta.title, "Foo Bar"); - assert_eq!(meta.date.unwrap(), "today"); - assert_eq!(meta.classes.unwrap(), &["json", "text"]); - assert_eq!( - meta.bibliography.unwrap(), - &[path("foo.bib"), path("bar.bib")] - ); - assert_eq!( - meta.bindings.unwrap(), - &[path("foo.yaml"), path("bar.yaml")] - ); - assert!(!meta.impls.is_empty()); - for (k, v) in meta.impls.iter() { - assert_eq!(k, "python"); - assert_eq!(v, &[path("foo.py"), path("bar.py")]); - } - } - - fn path(s: &str) -> PathBuf { - PathBuf::from(s) - } -} |