diff options
author | Lars Wirzenius <liw@liw.fi> | 2023-04-05 18:44:35 +0300 |
---|---|---|
committer | Lars Wirzenius <liw@liw.fi> | 2023-04-05 19:10:16 +0300 |
commit | 14cc8a66d199cab7b9407ee27a4fd352ba3baf38 (patch) | |
tree | 896c355d12433636e1c5d014187bf162e4b2c8d6 | |
parent | 103e14b2da6aa0d2f5597beff128e630ab2fb292 (diff) | |
download | subplot-14cc8a66d199cab7b9407ee27a4fd352ba3baf38.tar.gz |
feat: add an HTML representation using the Rust type system
Sponsored-by: author
-rw-r--r-- | src/html.rs | 724 | ||||
-rw-r--r-- | src/lib.rs | 1 |
2 files changed, 725 insertions, 0 deletions
diff --git a/src/html.rs b/src/html.rs new file mode 100644 index 0000000..bff9c75 --- /dev/null +++ b/src/html.rs @@ -0,0 +1,724 @@ +//! A representation of HTML using Rust types. + +#![deny(missing_docs)] + +use html_escape::{encode_double_quoted_attribute, encode_text}; +use line_col::LineColLookup; +use log::{debug, trace}; +use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag}; +use std::fmt::Write as _; +use std::io::Write; +use std::path::{Path, PathBuf}; + +/// A HTML page, consisting of a head and a body. +#[derive(Debug)] +pub struct HtmlPage { + head: Element, + body: Element, +} + +impl Default for HtmlPage { + fn default() -> Self { + Self { + head: Element::new(ElementTag::Head), + body: Element::new(ElementTag::Body), + } + } +} + +impl HtmlPage { + /// Create a new HTML page from a head and a body element. + pub fn new(head: Element, body: Element) -> Self { + Self { head, body } + } + + /// Return the page's head element. + pub fn head(&self) -> &Element { + &self.head + } + + /// Return the page's body element. + pub fn body(&self) -> &Element { + &self.body + } + + /// Try to serialize an HTML page into HTML text. + pub fn serialize(&self) -> Result<String, HtmlError> { + let mut html = Element::new(ElementTag::Html); + html.push_child(Content::Elt(self.head.clone())); + html.push_child(Content::Elt(self.body.clone())); + html.serialize() + } + + /// Try to write an HTML page as text into a file. 
+ pub fn write(&self, filename: &Path) -> Result<(), HtmlError> { + if let Some(parent) = filename.parent() { + trace!("parent: {}", parent.display()); + if !parent.exists() { + debug!("creating directory {}", parent.display()); + std::fs::create_dir_all(parent) + .map_err(|e| HtmlError::CreateDir(parent.into(), e))?; + } + } + + trace!("writing HTML: {}", filename.display()); + let mut f = std::fs::File::create(filename) + .map_err(|e| HtmlError::CreateFile(filename.into(), e))?; + let html = self.serialize()?; + f.write_all(html.as_bytes()) + .map_err(|e| HtmlError::FileWrite(filename.into(), e))?; + Ok(()) + } +} + +/// Parse Markdown text into an HTML element. +pub fn parse(markdown: &str) -> Result<Element, HtmlError> { + let mut options = Options::empty(); + options.insert(Options::ENABLE_HEADING_ATTRIBUTES); + options.insert(Options::ENABLE_STRIKETHROUGH); + options.insert(Options::ENABLE_TABLES); + options.insert(Options::ENABLE_TASKLISTS); + let p = Parser::new_ext(markdown, options).into_offset_iter(); + let linecol = LineColLookup::new(markdown); + let mut stack = Stack::new(); + stack.push(Element::new(ElementTag::Body)); + for (event, loc) in p { + trace!("event {:?}", event); + let (line, col) = linecol.get(loc.start); + let loc = Location::new(line, col); + match event { + Event::Start(tag) => match tag { + Tag::Paragraph => stack.push_tag(ElementTag::P, loc), + Tag::Heading(level, id, classes) => { + let tag = match level { + HeadingLevel::H1 => ElementTag::H1, + HeadingLevel::H2 => ElementTag::H2, + HeadingLevel::H3 => ElementTag::H3, + HeadingLevel::H4 => ElementTag::H4, + HeadingLevel::H5 => ElementTag::H5, + HeadingLevel::H6 => ElementTag::H6, + }; + let mut h = Element::new(tag); + if let Some(id) = id { + h.push_attribute(Attribute::new("id", id)); + } + if !classes.is_empty() { + let mut names = String::new(); + for c in classes { + if !names.is_empty() { + names.push(' '); + } + names.push_str(c); + } + 
h.push_attribute(Attribute::new("class", &names)); + } + stack.push(h); + } + Tag::BlockQuote => stack.push_tag(ElementTag::Blockquote, loc), + Tag::CodeBlock(kind) => { + stack.push_tag(ElementTag::Pre, loc); + if let CodeBlockKind::Fenced(attrs) = kind { + let mut e = stack.pop(); + e.set_block_attributes(BlockAttr::parse(&attrs)); + stack.push(e); + } + } + Tag::List(None) => stack.push_tag(ElementTag::Ul, loc), + Tag::List(Some(start)) => { + let mut e = Element::new(ElementTag::Ol).with_location(loc); + e.push_attribute(Attribute::new("start", &format!("{}", start))); + stack.push(e); + } + Tag::Item => stack.push_tag(ElementTag::Li, loc), + Tag::FootnoteDefinition(_) => unreachable!("{:?}", tag), + Tag::Table(_) => stack.push_tag(ElementTag::Table, loc), + Tag::TableHead => stack.push_tag(ElementTag::Th, loc), + Tag::TableRow => stack.push_tag(ElementTag::Tr, loc), + Tag::TableCell => stack.push_tag(ElementTag::Td, loc), + Tag::Emphasis => stack.push_tag(ElementTag::Em, loc), + Tag::Strong => stack.push_tag(ElementTag::Strong, loc), + Tag::Strikethrough => stack.push_tag(ElementTag::Del, loc), + Tag::Link(_, url, title) => { + let mut link = Element::new(ElementTag::A); + link.push_attribute(Attribute::new("href", url.as_ref())); + if !title.is_empty() { + link.push_attribute(Attribute::new("title", title.as_ref())); + } + stack.push(link); + } + Tag::Image(_, url, title) => { + let mut e = Element::new(ElementTag::Img); + e.push_attribute(Attribute::new("src", url.as_ref())); + if !title.is_empty() { + e.push_attribute(Attribute::new("title", title.as_ref())); + } + stack.push(e); + } + }, + Event::End(tag) => match &tag { + Tag::Paragraph => { + trace!("at end of paragraph, looking for definition list use"); + let e = stack.pop(); + let s = as_plain_text(e.children()); + trace!("paragraph text: {:?}", s); + if s.starts_with(": ") || s.contains("\n: ") { + return Err(HtmlError::DefinitionList(loc.line, loc.col)); + } + stack.append_child(Content::Elt(e)); + 
} + Tag::Heading(_, _, _) + | Tag::List(_) + | Tag::Item + | Tag::Link(_, _, _) + | Tag::Image(_, _, _) + | Tag::Emphasis + | Tag::Table(_) + | Tag::TableHead + | Tag::TableRow + | Tag::TableCell + | Tag::Strong + | Tag::Strikethrough + | Tag::BlockQuote + | Tag::CodeBlock(_) => { + let e = stack.pop(); + stack.append_child(Content::Elt(e)); + } + Tag::FootnoteDefinition(_) => unreachable!("{:?}", tag), + }, + Event::Text(s) => stack.append_str(s.as_ref()), + Event::Code(s) => { + let mut code = Element::new(ElementTag::Code); + code.push_child(Content::Text(s.to_string())); + stack.append_element(code); + } + Event::Html(s) => stack.append_child(Content::Html(s.to_string())), + Event::FootnoteReference(s) => trace!("footnote ref {:?}", s), + Event::SoftBreak => stack.append_str("\n"), + Event::HardBreak => stack.append_element(Element::new(ElementTag::Br)), + Event::Rule => stack.append_element(Element::new(ElementTag::Hr)), + Event::TaskListMarker(done) => { + let marker = if done { + "\u{2612} " // Unicode for box with X + } else { + "\u{2610} " // Unicode for empty box + }; + stack.append_str(marker); + } + } + } + + let mut body = stack.pop(); + assert!(stack.is_empty()); + body.fix_up_img_alt(); + Ok(body) +} + +fn as_plain_text(content: &[Content]) -> String { + let mut buf = String::new(); + for c in content { + if let Content::Text(s) = c { + buf.push_str(s); + } + } + buf +} + +/// An HTML element. +#[derive(Debug, Clone)] +pub struct Element { + loc: Option<Location>, + tag: ElementTag, + attrs: Vec<Attribute>, + children: Vec<Content>, +} + +impl Element { + /// Create a new element. + pub fn new(tag: ElementTag) -> Self { + Self { + loc: None, + tag, + attrs: vec![], + children: vec![], + } + } + + fn with_location(mut self, loc: Location) -> Self { + self.loc = Some(loc); + self + } + + /// Set location. + pub fn set_location(&mut self, loc: Location) { + self.loc = Some(loc); + } + + /// Get location. 
+ pub fn location(&self) -> &Option<Location> { + &self.loc + } + + fn set_block_attributes(&mut self, block_attrs: Vec<BlockAttr>) { + for block_attr in block_attrs { + let attr = Attribute::from(block_attr); + self.attrs.push(attr); + } + } + + /// Add a new attribute. + pub fn push_attribute(&mut self, attr: Attribute) { + self.attrs.push(attr); + } + + /// Append a new child to the element. + pub fn push_child(&mut self, child: Content) { + self.children.push(child); + } + + /// Return an element's tag. + pub fn tag(&self) -> ElementTag { + self.tag + } + + /// All attributes. + pub fn all_attrs(&self) -> &[Attribute] { + &self.attrs + } + + /// Return value of a named attribute, if any. + pub fn attr(&self, name: &str) -> Option<&Attribute> { + self.attrs.iter().find(|a| a.name() == name) + } + + /// Has an attribute with a specific value? + pub fn has_attr(&self, name: &str, wanted: &str) -> bool { + self.attrs + .iter() + .filter(|a| a.name() == name && a.value() == Some(wanted)) + .count() + > 0 + } + + /// Return the concatenated text content of direct children, + /// ignoring any elements. + pub fn content(&self) -> String { + let mut buf = String::new(); + for child in self.children() { + if let Content::Text(s) = child { + buf.push_str(s) + } + } + buf + } + + /// Return all the children of an element. + pub fn children(&self) -> &[Content] { + &self.children + } + + fn fix_up_img_alt(&mut self) { + if self.tag == ElementTag::Img { + let alt = as_plain_text(self.children()); + self.push_attribute(Attribute::new("alt", &alt)); + self.children.clear(); + } else { + for child in self.children.iter_mut() { + if let Content::Elt(kid) = child { + kid.fix_up_img_alt(); + } + } + } + } + + /// Serialize an element into HTML text. 
+ pub fn serialize(&self) -> Result<String, HtmlError> { + let mut buf = String::new(); + self.serialize_to_buf_without_added_newlines(&mut buf) + .map_err(HtmlError::Format)?; + Ok(buf) + } + + fn serialize_to_buf_without_added_newlines( + &self, + buf: &mut String, + ) -> Result<(), std::fmt::Error> { + if self.children.is_empty() { + write!(buf, "<{}", self.tag.name())?; + self.serialize_attrs_to_buf(buf)?; + write!(buf, "/>")?; + } else { + write!(buf, "<{}", self.tag.name())?; + self.serialize_attrs_to_buf(buf)?; + write!(buf, ">")?; + for c in self.children() { + match c { + Content::Text(s) => buf.push_str(&encode_text(s)), + Content::Elt(e) => e.serialize_to_buf_adding_block_newline(buf)?, + Content::Html(s) => buf.push_str(s), + } + } + write!(buf, "</{}>", self.tag.name())?; + } + Ok(()) + } + + fn serialize_to_buf_adding_block_newline( + &self, + buf: &mut String, + ) -> Result<(), std::fmt::Error> { + if self.tag.is_block() { + writeln!(buf)?; + } + self.serialize_to_buf_without_added_newlines(buf) + } + + fn serialize_attrs_to_buf(&self, buf: &mut String) -> Result<(), std::fmt::Error> { + for attr in self.attrs.iter() { + write!(buf, " {}", attr.name())?; + if let Some(value) = attr.value() { + write!(buf, "=\"{}\"", encode_double_quoted_attribute(value))?; + } + } + Ok(()) + } +} + +/// The tag of an HTML element. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[allow(missing_docs)] +pub enum ElementTag { + Html, + Head, + Meta, + Body, + Div, + H1, + H2, + H3, + H4, + H5, + H6, + P, + Ol, + Ul, + Li, + Blockquote, + Pre, + Em, + Strong, + Del, + A, + Img, + Table, + Title, + Th, + Tr, + Td, + Br, + Hr, + Code, +} + +impl ElementTag { + /// Name of the tag. 
+ pub fn name(&self) -> &str { + match self { + Self::Html => "html", + Self::Head => "head", + Self::Meta => "meta", + Self::Body => "body", + Self::Div => "div", + Self::H1 => "h1", + Self::H2 => "h2", + Self::H3 => "h3", + Self::H4 => "h4", + Self::H5 => "h5", + Self::H6 => "h6", + Self::P => "p", + Self::Ol => "ol", + Self::Ul => "ul", + Self::Li => "li", + Self::Blockquote => "blockquote", + Self::Pre => "pre", + Self::Em => "em", + Self::Strong => "strong", + Self::Del => "del", + Self::A => "a", + Self::Img => "img", + Self::Table => "table", + Self::Th => "th", + Self::Title => "title", + Self::Tr => "tr", + Self::Td => "td", + Self::Br => "br", + Self::Hr => "hr", + Self::Code => "code", + } + } + + fn is_block(&self) -> bool { + matches!( + self, + Self::Html + | Self::Head + | Self::Meta + | Self::Body + | Self::Div + | Self::H1 + | Self::H2 + | Self::H3 + | Self::H4 + | Self::H5 + | Self::H6 + | Self::P + | Self::Ol + | Self::Ul + | Self::Li + | Self::Blockquote + | Self::Table + | Self::Th + | Self::Tr + | Self::Br + | Self::Hr + ) + } +} + +/// An attribute of an HTML element. +#[derive(Clone, Debug)] +pub struct Attribute { + name: String, + value: Option<String>, +} + +impl Attribute { + /// Create a new element attribute. + pub fn new(name: &str, value: &str) -> Self { + Self { + name: name.into(), + value: Some(value.into()), + } + } + + /// Return the name of the attribute. + pub fn name(&self) -> &str { + &self.name + } + + /// Return the value of the attribute, if any. + pub fn value(&self) -> Option<&str> { + self.value.as_deref() + } +} + +impl From<BlockAttr> for Attribute { + fn from(block_attr: BlockAttr) -> Self { + match block_attr { + BlockAttr::Id(v) => Self::new("id", &v), + BlockAttr::Class(v) => Self::new("class", &v), + BlockAttr::KeyValue(k, v) => Self::new(&k, &v), + } + } +} + +/// Content in HTML. +#[derive(Clone, Debug)] +pub enum Content { + /// Arbitrary text. + Text(String), + + /// An HTML element. 
+ Elt(Element), + + /// Arbitrary HTML text. + Html(String), +} + +/// Location of element in source file. +#[derive(Debug, Clone, Copy)] +pub struct Location { + line: usize, + col: usize, +} + +impl Location { + fn new(line: usize, col: usize) -> Self { + Self { line, col } + } +} + +struct Stack { + stack: Vec<Element>, +} + +impl Stack { + fn new() -> Self { + Self { stack: vec![] } + } + + fn is_empty(&self) -> bool { + self.stack.is_empty() + } + + fn push(&mut self, e: Element) { + trace!("pushed {:?}", e); + self.stack.push(e); + } + + fn push_tag(&mut self, tag: ElementTag, loc: Location) { + self.push(Element::new(tag).with_location(loc)); + } + + fn pop(&mut self) -> Element { + let e = self.stack.pop().unwrap(); + trace!("popped {:?}", e); + e + } + + fn append_child(&mut self, child: Content) { + trace!("appended {:?}", child); + let mut parent = self.stack.pop().unwrap(); + parent.push_child(child); + self.stack.push(parent); + } + + fn append_str(&mut self, text: &str) { + self.append_child(Content::Text(text.into())); + } + + fn append_element(&mut self, e: Element) { + self.append_child(Content::Elt(e)); + } +} + +/// Errors from the `html` module. +#[derive(Debug, thiserror::Error)] +pub enum HtmlError { + /// Failed to create a directory. + #[error("failed to create directory {0}")] + CreateDir(PathBuf, #[source] std::io::Error), + + /// Failed to create a file. + #[error("failed to create file {0}")] + CreateFile(PathBuf, #[source] std::io::Error), + + /// Failed to write to a file. + #[error("failed to write to file {0}")] + FileWrite(PathBuf, #[source] std::io::Error), + + /// Input contains an attempt to use a definition list in + /// Markdown. + #[error("attempt to use definition lists in Markdown: line {0}, column {1}")] + DefinitionList(usize, usize), + + /// String formatting error. This is likely a programming error. + #[error("string formatting error: {0}")] + Format(#[source] std::fmt::Error), +} + +/// Code block attribute. 
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum BlockAttr {
    /// An identifier.
    Id(String),
    /// A class.
    Class(String),
    /// A key/value pair.
    KeyValue(String, String),
}

impl BlockAttr {
    /// Build an identifier attribute.
    fn id(s: &str) -> Self {
        Self::Id(s.into())
    }

    /// Build a class attribute.
    fn class(s: &str) -> Self {
        Self::Class(s.into())
    }

    /// Build a key/value attribute.
    fn key_value(k: &str, v: &str) -> Self {
        Self::KeyValue(k.into(), v.into())
    }

    /// Parse a fenced code block tag.
    ///
    /// The tag is split into whitespace-separated words, optionally
    /// wrapped in one pair of braces. A word starting with `#` is an
    /// identifier, a word starting with `.` is a class, a word
    /// containing `=` is a key/value pair, and any other word is a
    /// class. An empty tag yields an empty vector.
    pub fn parse(attrs: &str) -> Vec<Self> {
        Self::parse_words(attrs).map(Self::parse_word).collect()
    }

    /// Split a tag into words, dropping one enclosing pair of braces.
    fn parse_words(attrs: &str) -> impl Iterator<Item = &str> {
        // Trim first, so that whitespace around the braces does not
        // prevent them from being recognised and removed.
        let trimmed = attrs.trim();
        let inner = trimmed
            .strip_prefix('{')
            .and_then(|rest| rest.strip_suffix('}'))
            .unwrap_or(trimmed);
        inner.split_ascii_whitespace()
    }

    /// Classify a single word of a tag.
    fn parse_word(word: &str) -> Self {
        if let Some(id) = word.strip_prefix('#') {
            Self::id(id)
        } else if let Some(class) = word.strip_prefix('.') {
            Self::class(class)
        } else if let Some((key, value)) = word.split_once('=') {
            Self::key_value(key, value)
        } else {
            Self::class(word)
        }
    }
}

#[cfg(test)]
mod test_block_attr {
    use super::BlockAttr;

    #[test]
    fn empty_string() {
        assert_eq!(BlockAttr::parse(""), vec![]);
    }

    #[test]
    fn plain_word() {
        assert_eq!(
            BlockAttr::parse("foo"),
            vec![BlockAttr::Class("foo".into())]
        );
    }

    #[test]
    fn dot_word() {
        assert_eq!(
            BlockAttr::parse(".foo"),
            vec![BlockAttr::Class("foo".into())]
        );
    }

    #[test]
    fn hash_word() {
        assert_eq!(BlockAttr::parse("#foo"), vec![BlockAttr::Id("foo".into())]);
    }

    #[test]
    fn key_value() {
        assert_eq!(
            BlockAttr::parse("foo=bar"),
            vec![BlockAttr::KeyValue("foo".into(), "bar".into())]
        );
    }

    #[test]
    fn several() {
        assert_eq!(
            BlockAttr::parse("{#foo .bar foobar yo=yoyo}"),
            vec![
                BlockAttr::Id("foo".into()),
                BlockAttr::Class("bar".into()),
                BlockAttr::Class("foobar".into()),
                BlockAttr::KeyValue("yo".into(), "yoyo".into()),
            ]
        );
    }

    #[test]
    fn braces_with_surrounding_whitespace() {
        assert_eq!(
            BlockAttr::parse("  {#foo .bar}\n"),
            vec![BlockAttr::Id("foo".into()), BlockAttr::Class("bar".into())]
        );
    }
}

// Remainder of the commit diff (hunk for src/lib.rs), kept from the page:
//
// @@ -34,6 +34,7 @@
//  mod metadata;
//  pub use metadata::{Metadata, YamlMetadata};
//  mod doc;
// +pub mod html;
//  pub mod md;
//  pub use doc::Document;
//  pub use doc::{codegen, load_document, load_document_with_pullmark};