diff options
author | Lars Wirzenius <liw@liw.fi> | 2023-01-22 11:32:12 +0200 |
---|---|---|
committer | Lars Wirzenius <liw@liw.fi> | 2023-01-22 11:35:58 +0200 |
commit | 27fac843cd0475026e96545b645c11f15a71d432 (patch) | |
tree | 073bad11a601df79c5ff1ddf24af1ad033094ad0 | |
parent | b940075214d31b632f62a0ba723e63b857f5e5f1 (diff) | |
download | subplot-27fac843cd0475026e96545b645c11f15a71d432.tar.gz |
refactor: drop abstract syntax tree built using pulldown_cmark
We'll want to use the new Markdown type instead.
Sponsored-by: author
-rw-r--r-- | src/ast.rs | 222 | ||||
-rw-r--r-- | src/lib.rs | 2 |
2 files changed, 3 insertions, 221 deletions
@@ -1,7 +1,6 @@ use lazy_static::lazy_static; use log::trace; -use pandoc_ast::{Attr, Block, Inline, Map, MetaValue, Pandoc}; -use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag}; +use pandoc_ast::{Map, MetaValue}; use regex::Regex; use serde::Deserialize; use serde_yaml::{Mapping, Value}; @@ -17,202 +16,6 @@ lazy_static! { static ref TRAILING_YAML_PATTERN: Regex = Regex::new(r"(?P<text>(.*\n)*)\n*(?P<yaml>-{3,}\n([^.].*\n)*\.{3,}\n)(?:\S*\n)*$").unwrap(); } -/// An abstract syntax tree representation of a Markdown file. -/// -/// This represents a Markdown file as an abstract syntax tree -/// compatible with Pandoc's AST. The document YAML metadata MUST be -/// at the top or bottom of the file, excluding leading or trailing -/// empty lines. -#[derive(Debug)] -pub struct AbstractSyntaxTree { - blocks: Vec<Block>, - meta: YamlMetadata, -} - -impl AbstractSyntaxTree { - /// Create a new AST. - pub fn new(meta: YamlMetadata, markdown: &str) -> Self { - let blocks = parse_blocks(markdown); - Self { blocks, meta } - } - - /// Return a Pandoc-compatible AST. - pub fn to_pandoc(&self) -> Pandoc { - Pandoc { - meta: self.meta.to_map(), - blocks: self.blocks.clone(), - pandoc_api_version: vec![1, 20], - } - } -} - -// Parse Markdown into a sequence of Blocks. -fn parse_blocks(markdown: &str) -> Vec<Block> { - trace!("Parsing blocks"); - - // Define the Markdown parser. - let mut options = Options::empty(); - options.insert(Options::ENABLE_TABLES); - options.insert(Options::ENABLE_FOOTNOTES); - options.insert(Options::ENABLE_STRIKETHROUGH); - options.insert(Options::ENABLE_TASKLISTS); - options.insert(Options::ENABLE_SMART_PUNCTUATION); - let parser = Parser::new_ext(markdown, options); - - // The sequence of blocks that represents the parsed document. - let mut blocks = vec![]; - - // The current set of inline elements we've collected. This gets - // emptied whenever we finish a block. - let mut inlines: Vec<Inline> = vec![]; - - for event in parser { - trace!("Parsing event: {:?}", event); - match event { - // We ignore these for now. They're not needed for codegen. - Event::Html(_) - | Event::FootnoteReference(_) - | Event::SoftBreak - | Event::HardBreak - | Event::Rule - | Event::TaskListMarker(_) => (), - - // Inline text of various kinds. - Event::Text(text) => inlines.push(inline_text(&text)), - Event::Code(text) => inlines.push(inline_code(&text)), - - // We only handle the end events. - Event::Start(_) => (), - - // End of a block or inline. - Event::End(tag) => match tag { - // Collect inline elements for later inclusion in a block. - Tag::Emphasis | Tag::Strong | Tag::Strikethrough => { - inline_from_inlines(&tag, &mut inlines) - } - Tag::Paragraph => blocks.push(paragraph(&mut inlines)), - Tag::Heading(level, _fragment, _classes) => { - blocks.push(heading(level as i64, &mut inlines)) - } - Tag::CodeBlock(kind) => blocks.push(code_block(&kind, &mut inlines)), - Tag::Image(_link, dest, title) => blocks.push(image_block(&dest, &title)), - // We don't handle anything else yet. - _ => (), - }, - } - } - - // We MUST have emptied all inline elements. - // assert!(inlines.is_empty()); - - trace!("Parsing blocks: OK"); - blocks -} - -fn inline_text(text: &str) -> Inline { - Inline::Str(text.to_string()) -} - -fn inline_code(text: &str) -> Inline { - let attr = ("".to_string(), vec![], vec![]); - Inline::Code(attr, text.to_string()) -} - -fn paragraph(inlines: &mut Vec<Inline>) -> Block { - Block::Para(std::mem::take(inlines)) -} - -fn heading(level: i64, inlines: &mut Vec<Inline>) -> Block { - let attr = ("".to_string(), vec![], vec![]); - Block::Header(level, attr, std::mem::take(inlines)) -} - -fn image_block(dest: &str, title: &str) -> Block { - let attr = ("".to_string(), vec![], vec![]); - Block::Para(vec![Inline::Image( - attr, - vec![], - (dest.to_string(), title.to_string()), - )]) -} - -fn code_block(kind: &CodeBlockKind, inlines: &mut Vec<Inline>) -> Block { - trace!("code block: {:?}", kind); - let attr = if let CodeBlockKind::Fenced(lang) = kind { - trace!("fenced code block, lang={:?}", lang); - parse_code_block_attrs(lang) - } else { - trace!("indented code block"); - parse_code_block_attrs("") - }; - trace!("code block attrs: {:?}", attr); - let mut code = String::new(); - for inline in inlines.drain(0..) { - let text = plain_text_inline(inline); - code.push_str(&text); - } - // pulldown_cmark and pandoc differ in their codeblock handling, - // pulldown_cmark has an extra newline which we trim for now to be - // compatible with pandoc's parsing - if !code.is_empty() { - assert_eq!(code.pop(), Some('\n')); - } - Block::CodeBlock(attr, code) -} - -fn plain_text_inline(inline: Inline) -> String { - match inline { - Inline::Str(text) => text, - Inline::Code(_, text) => text, - Inline::Emph(inlines) => { - let mut text = String::new(); - for inline in inlines { - text.push_str(&plain_text_inline(inline)); - } - text - } - _ => panic!("not text in code block: {:?}", inline), - } -} - -fn parse_code_block_attrs(attrs: &str) -> Attr { - trace!("parsing code block attrs: {:?}", attrs); - let mut id = "".to_string(); - let mut classes = vec![]; - let mut keyvalues = vec![]; - if attrs.starts_with('{') && attrs.ends_with('}') { - let attrs = &attrs[1..attrs.len() - 1]; - for word in attrs.split_ascii_whitespace() { - if let Some(x) = word.strip_prefix('#') { - id = x.to_string(); - } else if let Some(x) = word.strip_prefix('.') { - classes.push(x.to_string()); - } else if let Some(i) = word.find('=') { - let k = &word[..i]; - let v = &word[i + 1..]; - keyvalues.push((k.to_string(), v.to_string())); - } - } - } else if !attrs.is_empty() { - classes.push(attrs.to_string()); - } - (id, classes, keyvalues) -} - -fn inline_from_inlines(tag: &Tag, inlines: &mut Vec<Inline>) { - let new_inlines = inlines.clone(); - inlines.clear(); - - let inline = match tag { - Tag::Emphasis => Inline::Emph(new_inlines), - Tag::Strong => Inline::Strong(new_inlines), - Tag::Strikethrough => Inline::Strikeout(new_inlines), - _ => unreachable!(), - }; - - inlines.push(inline); -} - /// Errors from Markdown parsing. #[derive(Debug, thiserror::Error)] pub enum Error { @@ -368,31 +171,10 @@ fn meta_path_bufs(v: &[PathBuf]) -> MetaValue { #[cfg(test)] mod test { - use super::{parse_code_block_attrs, YamlMetadata}; + use super::YamlMetadata; use std::path::{Path, PathBuf}; #[test] - fn code_block_attrs() { - assert_eq!(parse_code_block_attrs(""), ("".to_string(), vec![], vec![])); - assert_eq!( - parse_code_block_attrs("foo"), - ("".to_string(), vec!["foo".to_string()], vec![]) - ); - assert_eq!( - parse_code_block_attrs("{#foo}"), - ("foo".to_string(), vec![], vec![]) - ); - assert_eq!( - parse_code_block_attrs("{#foo .file bar=yo}"), - ( - "foo".to_string(), - vec!["file".to_string()], - vec![("bar".to_string(), "yo".to_string())] - ) - ); - } - - #[test] fn full_meta() { let meta = YamlMetadata::new( "\ @@ -74,4 +74,4 @@ mod codegen; pub use codegen::generate_test_program; mod ast; -pub use ast::{AbstractSyntaxTree, YamlMetadata}; +pub use ast::YamlMetadata; |