use lazy_static::lazy_static; use log::trace; use pandoc_ast::{Attr, Block, Inline, Map, MetaValue, Pandoc}; use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag}; use regex::Regex; use serde::Deserialize; use std::collections::BTreeMap; use std::path::{Path, PathBuf}; lazy_static! { // Pattern that recognises a YAML block at the beginning of a file. static ref LEADING_YAML_PATTERN: Regex = Regex::new(r"^(?:\S*\n)*(?P-{3,}\n([^.].*\n)*\.{3,}\n)(?P(.*\n)*)$").unwrap(); // Pattern that recognises a YAML block at the end of a file. static ref TRAILING_YAML_PATTERN: Regex = Regex::new(r"(?P(.*\n)*)\n*(?P-{3,}\n([^.].*\n)*\.{3,}\n)(?:\S*\n)*$").unwrap(); } /// An abstract syntax tree representation of a Markdown file. /// /// This represents a Markdown file as an abstract syntax tree /// compatible with Pandoc's AST. The document YAML metadata MUST be /// at the top or bottom of the file, excluding leading or trailing /// empty lines. #[derive(Debug)] pub struct AbstractSyntaxTree { blocks: Vec, meta: YamlMetadata, } impl AbstractSyntaxTree { /// Create a new AST. pub fn new(meta: YamlMetadata, markdown: &str) -> Self { let blocks = parse_blocks(markdown); Self { blocks, meta } } /// Return a Pandoc-compatible AST. pub fn to_pandoc(&self) -> Pandoc { Pandoc { meta: self.meta.to_map(), blocks: self.blocks.clone(), pandoc_api_version: vec![1, 20], } } } /// Extract YAML metadata from a Markdown document. pub fn extract_metadata(markdown: &str) -> Result<(YamlMetadata, &str), Error> { trace!("Extracting YAML from Markdown"); let (yaml, md) = if let Some((yaml, markdown)) = get_yaml(&LEADING_YAML_PATTERN, markdown) { trace!("Found leading YAML: {:?}", yaml); (yaml, markdown) } else if let Some((yaml, _markdown)) = get_yaml(&TRAILING_YAML_PATTERN, markdown) { trace!("Found trailing YAML: {:?}", yaml); (yaml, markdown) } else { trace!("No YAML to be found"); return Err(Error::NoMetadata); }; let meta = YamlMetadata::new(yaml)?; trace!("Parsing markdown: OK"); Ok((meta, md)) } // Extract a YAML metadata block using a given regex. fn get_yaml<'a>(pat: &Regex, markdown: &'a str) -> Option<(&'a str, &'a str)> { trace!("Markdown: {:?}", markdown); if let Some(c) = pat.captures(markdown) { trace!("YAML regex matches: {:?}", c); let yaml = c.name("yaml"); let text = c.name("text"); trace!("YAML metadata: {:?}", yaml); trace!("markdown: {:?}", text); if yaml.is_some() && text.is_some() { trace!("YAML regex captures YAML and text"); let yaml = yaml?; let text = text?; let yaml = &markdown[yaml.start()..yaml.end()]; let text = &markdown[text.start()..text.end()]; assert!(yaml.starts_with("---")); assert!(yaml.ends_with("...\n")); return Some((yaml, text)); } else { trace!("YAML regex fails to capture YAML"); } } else { trace!("YAML regex does not match"); } None } // Parse Markdown into a sequence of Blocks. fn parse_blocks(markdown: &str) -> Vec { trace!("Parsing blocks"); // Define the Markdown parser. let mut options = Options::empty(); options.insert(Options::ENABLE_TABLES); options.insert(Options::ENABLE_FOOTNOTES); options.insert(Options::ENABLE_STRIKETHROUGH); options.insert(Options::ENABLE_TASKLISTS); options.insert(Options::ENABLE_SMART_PUNCTUATION); let parser = Parser::new_ext(markdown, options); // The sequence of blocks that represents the parsed document. let mut blocks = vec![]; // The current set of inline elements we've collected. This gets // emptied whenever we finish a block. let mut inlines: Vec = vec![]; for event in parser { trace!("Parsing event: {:?}", event); match event { // We ignore these for now. They're not needed for codegen. Event::Html(_) | Event::FootnoteReference(_) | Event::SoftBreak | Event::HardBreak | Event::Rule | Event::TaskListMarker(_) => (), // Inline text of various kinds. Event::Text(text) => inlines.push(inline_text(&text)), Event::Code(text) => inlines.push(inline_code(&text)), // We only handle the end events. Event::Start(_) => (), // End of a block or inline. Event::End(tag) => match tag { // Collect inline elements for later inclusion in a block. Tag::Emphasis | Tag::Strong | Tag::Strikethrough => { inline_from_inlines(&tag, &mut inlines) } Tag::Paragraph => blocks.push(paragraph(&mut inlines)), Tag::Heading(level, _fragment, _classes) => { blocks.push(heading(level as i64, &mut inlines)) } Tag::CodeBlock(kind) => blocks.push(code_block(&kind, &mut inlines)), Tag::Image(_link, dest, title) => blocks.push(image_block(&dest, &title)), // We don't handle anything else yet. _ => (), }, } } // We MUST have emptied all inline elements. // assert!(inlines.is_empty()); trace!("Parsing blocks: OK"); blocks } fn inline_text(text: &str) -> Inline { Inline::Str(text.to_string()) } fn inline_code(text: &str) -> Inline { let attr = ("".to_string(), vec![], vec![]); Inline::Code(attr, text.to_string()) } fn paragraph(inlines: &mut Vec) -> Block { Block::Para(std::mem::take(inlines)) } fn heading(level: i64, inlines: &mut Vec) -> Block { let attr = ("".to_string(), vec![], vec![]); Block::Header(level, attr, std::mem::take(inlines)) } fn image_block(dest: &str, title: &str) -> Block { let attr = ("".to_string(), vec![], vec![]); Block::Para(vec![Inline::Image( attr, vec![], (dest.to_string(), title.to_string()), )]) } fn code_block(kind: &CodeBlockKind, inlines: &mut Vec) -> Block { trace!("code block: {:?}", kind); let attr = if let CodeBlockKind::Fenced(lang) = kind { trace!("fenced code block, lang={:?}", lang); parse_code_block_attrs(lang) } else { trace!("indented code block"); parse_code_block_attrs("") }; trace!("code block attrs: {:?}", attr); let mut code = String::new(); for inline in inlines.drain(0..) { let text = plain_text_inline(inline); code.push_str(&text); } // pulldown_cmark and pandoc differ in their codeblock handling, // pulldown_cmark has an extra newline which we trim for now to be // compatible with pandoc's parsing if !code.is_empty() { assert_eq!(code.pop(), Some('\n')); } Block::CodeBlock(attr, code) } fn plain_text_inline(inline: Inline) -> String { match inline { Inline::Str(text) => text, Inline::Code(_, text) => text, Inline::Emph(inlines) => { let mut text = String::new(); for inline in inlines { text.push_str(&plain_text_inline(inline)); } text } _ => panic!("not text in code block: {:?}", inline), } } fn parse_code_block_attrs(attrs: &str) -> Attr { trace!("parsing code block attrs: {:?}", attrs); let mut id = "".to_string(); let mut classes = vec![]; let mut keyvalues = vec![]; if attrs.starts_with('{') && attrs.ends_with('}') { let attrs = &attrs[1..attrs.len() - 1]; for word in attrs.split_ascii_whitespace() { if let Some(x) = word.strip_prefix('#') { id = x.to_string(); } else if let Some(x) = word.strip_prefix('.') { classes.push(x.to_string()); } else if let Some(i) = word.find('=') { let k = &word[..i]; let v = &word[i + 1..]; keyvalues.push((k.to_string(), v.to_string())); } } } else if !attrs.is_empty() { classes.push(attrs.to_string()); } (id, classes, keyvalues) } fn inline_from_inlines(tag: &Tag, inlines: &mut Vec) { let new_inlines = inlines.clone(); inlines.clear(); let inline = match tag { Tag::Emphasis => Inline::Emph(new_inlines), Tag::Strong => Inline::Strong(new_inlines), Tag::Strikethrough => Inline::Strikeout(new_inlines), _ => unreachable!(), }; inlines.push(inline); } /// Errors from Markdown parsing. #[derive(Debug, thiserror::Error)] pub enum Error { #[error(transparent)] Regex(#[from] regex::Error), #[error("Markdown doesn't contain a YAML block for document metadata")] NoMetadata, #[error(transparent)] Yaml(#[from] serde_yaml::Error), } /// Document metadata. /// /// This is expressed in the Markdown input file as an embedded YAML /// block. /// /// Note that this structure needs to be able to capture any metadata /// block we can work with, in any input file. By being strict here we /// make it easier to tell the user when a metadata block has, say, a /// misspelled field. #[derive(Debug, Default, Deserialize)] #[serde(deny_unknown_fields)] pub struct YamlMetadata { title: String, subtitle: Option, author: Option, date: Option, classes: Option>, bibliography: Option>, bindings: Option>, documentclass: Option, #[serde(default)] impls: BTreeMap>, } impl YamlMetadata { fn new(yaml_text: &str) -> Result { trace!("Parsing YAML"); let meta: Self = serde_yaml::from_str(yaml_text)?; Ok(meta) } fn to_map(&self) -> Map { trace!("Creating metadata map from parsed YAML"); let mut map: Map = Map::new(); map.insert("title".into(), meta_string(&self.title)); if let Some(v) = &self.subtitle { map.insert("subtitle".into(), meta_string(v)); } if let Some(v) = &self.author { map.insert("author".into(), meta_string(v)); } if let Some(v) = &self.date { map.insert("date".into(), meta_string(v)); } if let Some(v) = &self.classes { map.insert("classes".into(), meta_strings(v)); } if !self.impls.is_empty() { let impls = self .impls .iter() .map(|(k, v)| (k.to_owned(), Box::new(meta_path_bufs(v)))) .collect(); map.insert("impls".into(), MetaValue::MetaMap(impls)); } if let Some(v) = &self.bibliography { map.insert("bibliography".into(), meta_path_bufs(v)); } if let Some(v) = &self.bindings { map.insert("bindings".into(), meta_path_bufs(v)); } if let Some(v) = &self.documentclass { map.insert("documentclass".into(), meta_string(v)); } trace!("Created metadata map from parsed YAML"); map } } fn meta_string(s: &str) -> MetaValue { MetaValue::MetaString(s.to_string()) } fn meta_strings(v: &[String]) -> MetaValue { MetaValue::MetaList(v.iter().map(|s| meta_string(s)).collect()) } fn meta_path_buf(p: &Path) -> MetaValue { meta_string(&p.display().to_string()) } fn meta_path_bufs(v: &[PathBuf]) -> MetaValue { MetaValue::MetaList(v.iter().map(|p| meta_path_buf(p)).collect()) } #[cfg(test)] mod test { use super::{extract_metadata, parse_code_block_attrs, AbstractSyntaxTree, YamlMetadata}; use std::path::PathBuf; #[test] fn code_block_attrs() { assert_eq!(parse_code_block_attrs(""), ("".to_string(), vec![], vec![])); assert_eq!( parse_code_block_attrs("foo"), ("".to_string(), vec!["foo".to_string()], vec![]) ); assert_eq!( parse_code_block_attrs("{#foo}"), ("foo".to_string(), vec![], vec![]) ); assert_eq!( parse_code_block_attrs("{#foo .file bar=yo}"), ( "foo".to_string(), vec!["file".to_string()], vec![("bar".to_string(), "yo".to_string())] ) ); } #[test] fn parses_leading_meta() { let markdown = "\n\n---\ntitle: Foo Bar\n...\nfoobar\n"; let (meta, markdown) = extract_metadata(markdown).unwrap(); let ast = AbstractSyntaxTree::new(meta, markdown); let doc = ast.to_pandoc(); let keys: Vec = doc.meta.keys().cloned().collect(); assert_eq!(keys, ["title"]); } #[test] fn parses_trailing_meta() { let markdown = "foobar\n---\ntitle: Foo Bar\n...\n\n\n"; let (meta, markdown) = extract_metadata(markdown).unwrap(); let ast = AbstractSyntaxTree::new(meta, markdown); let doc = ast.to_pandoc(); let keys: Vec = doc.meta.keys().cloned().collect(); assert_eq!(keys, ["title"]); } #[test] fn full_meta() { let meta = YamlMetadata::new( "\ title: Foo Bar date: today classes: [json, text] impls: python: - foo.py - bar.py bibliography: - foo.bib - bar.bib bindings: - foo.yaml - bar.yaml ", ) .unwrap(); assert_eq!(meta.title, "Foo Bar"); assert_eq!(meta.date.unwrap(), "today"); assert_eq!(meta.classes.unwrap(), &["json", "text"]); assert_eq!( meta.bibliography.unwrap(), &[path("foo.bib"), path("bar.bib")] ); assert_eq!( meta.bindings.unwrap(), &[path("foo.yaml"), path("bar.yaml")] ); assert!(!meta.impls.is_empty()); for (k, v) in meta.impls.iter() { assert_eq!(k, "python"); assert_eq!(v, &[path("foo.py"), path("bar.py")]); } } fn path(s: &str) -> PathBuf { PathBuf::from(s) } }