summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Lars Wirzenius <liw@liw.fi> 2023-01-22 11:32:12 +0200
committer Lars Wirzenius <liw@liw.fi> 2023-01-22 11:35:58 +0200
commit 27fac843cd0475026e96545b645c11f15a71d432 (patch)
tree 073bad11a601df79c5ff1ddf24af1ad033094ad0
parent b940075214d31b632f62a0ba723e63b857f5e5f1 (diff)
download subplot-27fac843cd0475026e96545b645c11f15a71d432.tar.gz
refactor: drop abstract syntax tree built using pulldown_cmark
We'll want to use the new Markdown type instead.

Sponsored-by: author
-rw-r--r-- src/ast.rs 222
-rw-r--r-- src/lib.rs 2
2 files changed, 3 insertions, 221 deletions
diff --git a/src/ast.rs b/src/ast.rs
index 9e50af1..c48a1e7 100644
--- a/src/ast.rs
+++ b/src/ast.rs
@@ -1,7 +1,6 @@
use lazy_static::lazy_static;
use log::trace;
-use pandoc_ast::{Attr, Block, Inline, Map, MetaValue, Pandoc};
-use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag};
+use pandoc_ast::{Map, MetaValue};
use regex::Regex;
use serde::Deserialize;
use serde_yaml::{Mapping, Value};
@@ -17,202 +16,6 @@ lazy_static! {
static ref TRAILING_YAML_PATTERN: Regex = Regex::new(r"(?P<text>(.*\n)*)\n*(?P<yaml>-{3,}\n([^.].*\n)*\.{3,}\n)(?:\S*\n)*$").unwrap();
}
-/// An abstract syntax tree representation of a Markdown file.
-///
-/// This represents a Markdown file as an abstract syntax tree
-/// compatible with Pandoc's AST. The document YAML metadata MUST be
-/// at the top or bottom of the file, excluding leading or trailing
-/// empty lines.
-#[derive(Debug)]
-pub struct AbstractSyntaxTree {
- blocks: Vec<Block>,
- meta: YamlMetadata,
-}
-
-impl AbstractSyntaxTree {
- /// Create a new AST.
- pub fn new(meta: YamlMetadata, markdown: &str) -> Self {
- let blocks = parse_blocks(markdown);
- Self { blocks, meta }
- }
-
- /// Return a Pandoc-compatible AST.
- pub fn to_pandoc(&self) -> Pandoc {
- Pandoc {
- meta: self.meta.to_map(),
- blocks: self.blocks.clone(),
- pandoc_api_version: vec![1, 20],
- }
- }
-}
-
-// Parse Markdown into a sequence of Blocks.
-fn parse_blocks(markdown: &str) -> Vec<Block> {
- trace!("Parsing blocks");
-
- // Define the Markdown parser.
- let mut options = Options::empty();
- options.insert(Options::ENABLE_TABLES);
- options.insert(Options::ENABLE_FOOTNOTES);
- options.insert(Options::ENABLE_STRIKETHROUGH);
- options.insert(Options::ENABLE_TASKLISTS);
- options.insert(Options::ENABLE_SMART_PUNCTUATION);
- let parser = Parser::new_ext(markdown, options);
-
- // The sequence of blocks that represents the parsed document.
- let mut blocks = vec![];
-
- // The current set of inline elements we've collected. This gets
- // emptied whenever we finish a block.
- let mut inlines: Vec<Inline> = vec![];
-
- for event in parser {
- trace!("Parsing event: {:?}", event);
- match event {
- // We ignore these for now. They're not needed for codegen.
- Event::Html(_)
- | Event::FootnoteReference(_)
- | Event::SoftBreak
- | Event::HardBreak
- | Event::Rule
- | Event::TaskListMarker(_) => (),
-
- // Inline text of various kinds.
- Event::Text(text) => inlines.push(inline_text(&text)),
- Event::Code(text) => inlines.push(inline_code(&text)),
-
- // We only handle the end events.
- Event::Start(_) => (),
-
- // End of a block or inline.
- Event::End(tag) => match tag {
- // Collect inline elements for later inclusion in a block.
- Tag::Emphasis | Tag::Strong | Tag::Strikethrough => {
- inline_from_inlines(&tag, &mut inlines)
- }
- Tag::Paragraph => blocks.push(paragraph(&mut inlines)),
- Tag::Heading(level, _fragment, _classes) => {
- blocks.push(heading(level as i64, &mut inlines))
- }
- Tag::CodeBlock(kind) => blocks.push(code_block(&kind, &mut inlines)),
- Tag::Image(_link, dest, title) => blocks.push(image_block(&dest, &title)),
- // We don't handle anything else yet.
- _ => (),
- },
- }
- }
-
- // We MUST have emptied all inline elements.
- // assert!(inlines.is_empty());
-
- trace!("Parsing blocks: OK");
- blocks
-}
-
-fn inline_text(text: &str) -> Inline {
- Inline::Str(text.to_string())
-}
-
-fn inline_code(text: &str) -> Inline {
- let attr = ("".to_string(), vec![], vec![]);
- Inline::Code(attr, text.to_string())
-}
-
-fn paragraph(inlines: &mut Vec<Inline>) -> Block {
- Block::Para(std::mem::take(inlines))
-}
-
-fn heading(level: i64, inlines: &mut Vec<Inline>) -> Block {
- let attr = ("".to_string(), vec![], vec![]);
- Block::Header(level, attr, std::mem::take(inlines))
-}
-
-fn image_block(dest: &str, title: &str) -> Block {
- let attr = ("".to_string(), vec![], vec![]);
- Block::Para(vec![Inline::Image(
- attr,
- vec![],
- (dest.to_string(), title.to_string()),
- )])
-}
-
-fn code_block(kind: &CodeBlockKind, inlines: &mut Vec<Inline>) -> Block {
- trace!("code block: {:?}", kind);
- let attr = if let CodeBlockKind::Fenced(lang) = kind {
- trace!("fenced code block, lang={:?}", lang);
- parse_code_block_attrs(lang)
- } else {
- trace!("indented code block");
- parse_code_block_attrs("")
- };
- trace!("code block attrs: {:?}", attr);
- let mut code = String::new();
- for inline in inlines.drain(0..) {
- let text = plain_text_inline(inline);
- code.push_str(&text);
- }
- // pulldown_cmark and pandoc differ in their codeblock handling,
- // pulldown_cmark has an extra newline which we trim for now to be
- // compatible with pandoc's parsing
- if !code.is_empty() {
- assert_eq!(code.pop(), Some('\n'));
- }
- Block::CodeBlock(attr, code)
-}
-
-fn plain_text_inline(inline: Inline) -> String {
- match inline {
- Inline::Str(text) => text,
- Inline::Code(_, text) => text,
- Inline::Emph(inlines) => {
- let mut text = String::new();
- for inline in inlines {
- text.push_str(&plain_text_inline(inline));
- }
- text
- }
- _ => panic!("not text in code block: {:?}", inline),
- }
-}
-
-fn parse_code_block_attrs(attrs: &str) -> Attr {
- trace!("parsing code block attrs: {:?}", attrs);
- let mut id = "".to_string();
- let mut classes = vec![];
- let mut keyvalues = vec![];
- if attrs.starts_with('{') && attrs.ends_with('}') {
- let attrs = &attrs[1..attrs.len() - 1];
- for word in attrs.split_ascii_whitespace() {
- if let Some(x) = word.strip_prefix('#') {
- id = x.to_string();
- } else if let Some(x) = word.strip_prefix('.') {
- classes.push(x.to_string());
- } else if let Some(i) = word.find('=') {
- let k = &word[..i];
- let v = &word[i + 1..];
- keyvalues.push((k.to_string(), v.to_string()));
- }
- }
- } else if !attrs.is_empty() {
- classes.push(attrs.to_string());
- }
- (id, classes, keyvalues)
-}
-
-fn inline_from_inlines(tag: &Tag, inlines: &mut Vec<Inline>) {
- let new_inlines = inlines.clone();
- inlines.clear();
-
- let inline = match tag {
- Tag::Emphasis => Inline::Emph(new_inlines),
- Tag::Strong => Inline::Strong(new_inlines),
- Tag::Strikethrough => Inline::Strikeout(new_inlines),
- _ => unreachable!(),
- };
-
- inlines.push(inline);
-}
-
/// Errors from Markdown parsing.
#[derive(Debug, thiserror::Error)]
pub enum Error {
@@ -368,31 +171,10 @@ fn meta_path_bufs(v: &[PathBuf]) -> MetaValue {
#[cfg(test)]
mod test {
- use super::{parse_code_block_attrs, YamlMetadata};
+ use super::YamlMetadata;
use std::path::{Path, PathBuf};
#[test]
- fn code_block_attrs() {
- assert_eq!(parse_code_block_attrs(""), ("".to_string(), vec![], vec![]));
- assert_eq!(
- parse_code_block_attrs("foo"),
- ("".to_string(), vec!["foo".to_string()], vec![])
- );
- assert_eq!(
- parse_code_block_attrs("{#foo}"),
- ("foo".to_string(), vec![], vec![])
- );
- assert_eq!(
- parse_code_block_attrs("{#foo .file bar=yo}"),
- (
- "foo".to_string(),
- vec!["file".to_string()],
- vec![("bar".to_string(), "yo".to_string())]
- )
- );
- }
-
- #[test]
fn full_meta() {
let meta = YamlMetadata::new(
"\
diff --git a/src/lib.rs b/src/lib.rs
index 966118d..747b375 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -74,4 +74,4 @@ mod codegen;
pub use codegen::generate_test_program;
mod ast;
-pub use ast::{AbstractSyntaxTree, YamlMetadata};
+pub use ast::YamlMetadata;