summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Lars Wirzenius <liw@liw.fi> 2023-04-05 18:44:35 +0300
committer Lars Wirzenius <liw@liw.fi> 2023-04-05 19:10:16 +0300
commit 14cc8a66d199cab7b9407ee27a4fd352ba3baf38 (patch)
tree 896c355d12433636e1c5d014187bf162e4b2c8d6
parent 103e14b2da6aa0d2f5597beff128e630ab2fb292 (diff)
download subplot-14cc8a66d199cab7b9407ee27a4fd352ba3baf38.tar.gz
feat: add an HTML representation using the Rust type system
Sponsored-by: author
-rw-r--r-- src/html.rs 724
-rw-r--r-- src/lib.rs 1
2 files changed, 725 insertions, 0 deletions
diff --git a/src/html.rs b/src/html.rs
new file mode 100644
index 0000000..bff9c75
--- /dev/null
+++ b/src/html.rs
@@ -0,0 +1,724 @@
+//! A representation of HTML using Rust types.
+
+#![deny(missing_docs)]
+
+use html_escape::{encode_double_quoted_attribute, encode_text};
+use line_col::LineColLookup;
+use log::{debug, trace};
+use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag};
+use std::fmt::Write as _;
+use std::io::Write;
+use std::path::{Path, PathBuf};
+
+/// A complete HTML document, held as its two top-level parts.
+///
+/// The head and the body are stored as separate elements; they are
+/// only placed under a common `html` root when the page is serialized.
+#[derive(Debug)]
+pub struct HtmlPage {
+    /// The `head` element of the page.
+    head: Element,
+    /// The `body` element of the page.
+    body: Element,
+}
+
+impl Default for HtmlPage {
+    /// Build a page whose head and body are both freshly created
+    /// elements without attributes or children.
+    fn default() -> Self {
+        let head = Element::new(ElementTag::Head);
+        let body = Element::new(ElementTag::Body);
+        Self { head, body }
+    }
+}
+
+impl HtmlPage {
+    /// Create a new HTML page from a head and a body element.
+    pub fn new(head: Element, body: Element) -> Self {
+        Self { head, body }
+    }
+
+    /// Return the page's head element.
+    pub fn head(&self) -> &Element {
+        &self.head
+    }
+
+    /// Return the page's body element.
+    pub fn body(&self) -> &Element {
+        &self.body
+    }
+
+    /// Try to serialize an HTML page into HTML text.
+    ///
+    /// The head and body are cloned into a temporary `html` root
+    /// element, which is then serialized; the page itself is not
+    /// modified.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`HtmlError::Format`] if formatting into the output
+    /// string fails.
+    pub fn serialize(&self) -> Result<String, HtmlError> {
+        let mut html = Element::new(ElementTag::Html);
+        html.push_child(Content::Elt(self.head.clone()));
+        html.push_child(Content::Elt(self.body.clone()));
+        html.serialize()
+    }
+
+    /// Try to write an HTML page as text into a file.
+    ///
+    /// The parent directory of `filename` is created first if it does
+    /// not exist yet. An existing file is replaced.
+    ///
+    /// # Errors
+    ///
+    /// * [`HtmlError::Format`] if the page cannot be serialized.
+    /// * [`HtmlError::CreateDir`] if the parent directory cannot be
+    ///   created.
+    /// * [`HtmlError::CreateFile`] if the file cannot be created.
+    /// * [`HtmlError::FileWrite`] if writing to the file fails.
+    pub fn write(&self, filename: &Path) -> Result<(), HtmlError> {
+        // Serialize before touching the file system, so that a failed
+        // serialization cannot leave behind a new directory or an
+        // empty, truncated output file.
+        let html = self.serialize()?;
+
+        if let Some(parent) = filename.parent() {
+            trace!("parent: {}", parent.display());
+            if !parent.exists() {
+                debug!("creating directory {}", parent.display());
+                std::fs::create_dir_all(parent)
+                    .map_err(|e| HtmlError::CreateDir(parent.into(), e))?;
+            }
+        }
+
+        trace!("writing HTML: {}", filename.display());
+        let mut f = std::fs::File::create(filename)
+            .map_err(|e| HtmlError::CreateFile(filename.into(), e))?;
+        f.write_all(html.as_bytes())
+            .map_err(|e| HtmlError::FileWrite(filename.into(), e))?;
+        Ok(())
+    }
+}
+
+/// Parse Markdown text into an HTML element.
+///
+/// The Markdown is parsed with heading attributes, strikethrough,
+/// tables, and task lists enabled. The result is a `body` element
+/// whose children mirror the structure of the document. Every
+/// element created for a Markdown start tag records the line and
+/// column at which that tag starts in `markdown`.
+///
+/// Footnote references are only logged and produce no output. Task
+/// list markers are turned into a Unicode ballot box character
+/// followed by a space. After parsing, the text children of every
+/// `img` element are moved into its `alt` attribute.
+///
+/// # Errors
+///
+/// Returns [`HtmlError::DefinitionList`] if the direct text of a
+/// paragraph starts with `": "`, or contains a line that does, as
+/// that is taken to be an attempt to use a definition list.
+///
+/// # Panics
+///
+/// Panics if the parser reports a footnote definition, which is not
+/// expected as footnotes are not enabled in the parser options, or if
+/// the parser's start and end events do not balance.
+pub fn parse(markdown: &str) -> Result<Element, HtmlError> {
+    let mut options = Options::empty();
+    options.insert(Options::ENABLE_HEADING_ATTRIBUTES);
+    options.insert(Options::ENABLE_STRIKETHROUGH);
+    options.insert(Options::ENABLE_TABLES);
+    options.insert(Options::ENABLE_TASKLISTS);
+    let p = Parser::new_ext(markdown, options).into_offset_iter();
+    let linecol = LineColLookup::new(markdown);
+
+    // The bottom of the stack is the body element that collects
+    // everything; each start tag pushes a new element, each end tag
+    // pops it and attaches it to the element below.
+    let mut stack = Stack::new();
+    stack.push(Element::new(ElementTag::Body));
+    for (event, loc) in p {
+        trace!("event {:?}", event);
+        let (line, col) = linecol.get(loc.start);
+        let loc = Location::new(line, col);
+        match event {
+            Event::Start(tag) => match tag {
+                Tag::Paragraph => stack.push_tag(ElementTag::P, loc),
+                Tag::Heading(level, id, classes) => {
+                    let heading = match level {
+                        HeadingLevel::H1 => ElementTag::H1,
+                        HeadingLevel::H2 => ElementTag::H2,
+                        HeadingLevel::H3 => ElementTag::H3,
+                        HeadingLevel::H4 => ElementTag::H4,
+                        HeadingLevel::H5 => ElementTag::H5,
+                        HeadingLevel::H6 => ElementTag::H6,
+                    };
+                    // Record the location here too, like for every
+                    // other element created from a start tag.
+                    let mut h = Element::new(heading).with_location(loc);
+                    if let Some(id) = id {
+                        h.push_attribute(Attribute::new("id", id));
+                    }
+                    if !classes.is_empty() {
+                        h.push_attribute(Attribute::new("class", &classes.join(" ")));
+                    }
+                    stack.push(h);
+                }
+                Tag::BlockQuote => stack.push_tag(ElementTag::Blockquote, loc),
+                Tag::CodeBlock(kind) => {
+                    let mut pre = Element::new(ElementTag::Pre).with_location(loc);
+                    // Only fenced code blocks carry an info string
+                    // from which attributes can be parsed.
+                    if let CodeBlockKind::Fenced(attrs) = kind {
+                        pre.set_block_attributes(BlockAttr::parse(&attrs));
+                    }
+                    stack.push(pre);
+                }
+                Tag::List(None) => stack.push_tag(ElementTag::Ul, loc),
+                Tag::List(Some(start)) => {
+                    let mut e = Element::new(ElementTag::Ol).with_location(loc);
+                    e.push_attribute(Attribute::new("start", &start.to_string()));
+                    stack.push(e);
+                }
+                Tag::Item => stack.push_tag(ElementTag::Li, loc),
+                Tag::FootnoteDefinition(_) => unreachable!("{:?}", tag),
+                Tag::Table(_) => stack.push_tag(ElementTag::Table, loc),
+                Tag::TableHead => stack.push_tag(ElementTag::Th, loc),
+                Tag::TableRow => stack.push_tag(ElementTag::Tr, loc),
+                Tag::TableCell => stack.push_tag(ElementTag::Td, loc),
+                Tag::Emphasis => stack.push_tag(ElementTag::Em, loc),
+                Tag::Strong => stack.push_tag(ElementTag::Strong, loc),
+                Tag::Strikethrough => stack.push_tag(ElementTag::Del, loc),
+                Tag::Link(_, url, title) => {
+                    let mut link = Element::new(ElementTag::A).with_location(loc);
+                    link.push_attribute(Attribute::new("href", url.as_ref()));
+                    if !title.is_empty() {
+                        link.push_attribute(Attribute::new("title", title.as_ref()));
+                    }
+                    stack.push(link);
+                }
+                Tag::Image(_, url, title) => {
+                    let mut e = Element::new(ElementTag::Img).with_location(loc);
+                    e.push_attribute(Attribute::new("src", url.as_ref()));
+                    if !title.is_empty() {
+                        e.push_attribute(Attribute::new("title", title.as_ref()));
+                    }
+                    stack.push(e);
+                }
+            },
+            Event::End(tag) => match &tag {
+                Tag::Paragraph => {
+                    trace!("at end of paragraph, looking for definition list use");
+                    let e = stack.pop();
+                    let s = as_plain_text(e.children());
+                    trace!("paragraph text: {:?}", s);
+                    if s.starts_with(": ") || s.contains("\n: ") {
+                        return Err(HtmlError::DefinitionList(loc.line, loc.col));
+                    }
+                    stack.append_child(Content::Elt(e));
+                }
+                Tag::Heading(_, _, _)
+                | Tag::List(_)
+                | Tag::Item
+                | Tag::Link(_, _, _)
+                | Tag::Image(_, _, _)
+                | Tag::Emphasis
+                | Tag::Table(_)
+                | Tag::TableHead
+                | Tag::TableRow
+                | Tag::TableCell
+                | Tag::Strong
+                | Tag::Strikethrough
+                | Tag::BlockQuote
+                | Tag::CodeBlock(_) => {
+                    // The finished element becomes a child of the
+                    // element that was open before it.
+                    let e = stack.pop();
+                    stack.append_child(Content::Elt(e));
+                }
+                Tag::FootnoteDefinition(_) => unreachable!("{:?}", tag),
+            },
+            Event::Text(s) => stack.append_str(s.as_ref()),
+            Event::Code(s) => {
+                let mut code = Element::new(ElementTag::Code);
+                code.push_child(Content::Text(s.to_string()));
+                stack.append_element(code);
+            }
+            Event::Html(s) => stack.append_child(Content::Html(s.to_string())),
+            Event::FootnoteReference(s) => trace!("footnote ref {:?}", s),
+            Event::SoftBreak => stack.append_str("\n"),
+            Event::HardBreak => stack.append_element(Element::new(ElementTag::Br)),
+            Event::Rule => stack.append_element(Element::new(ElementTag::Hr)),
+            Event::TaskListMarker(done) => {
+                let marker = if done {
+                    "\u{2612} " // Unicode for box with X
+                } else {
+                    "\u{2610} " // Unicode for empty box
+                };
+                stack.append_str(marker);
+            }
+        }
+    }
+
+    // Only the body pushed at the start may remain at this point.
+    let mut body = stack.pop();
+    assert!(stack.is_empty());
+    body.fix_up_img_alt();
+    Ok(body)
+}
+
+/// Concatenate the text items of `content`, in order.
+///
+/// Only `Content::Text` items contribute. Elements and raw HTML are
+/// skipped, and elements are not descended into.
+fn as_plain_text(content: &[Content]) -> String {
+    content
+        .iter()
+        .filter_map(|item| match item {
+            Content::Text(text) => Some(text.as_str()),
+            Content::Elt(_) | Content::Html(_) => None,
+        })
+        .collect()
+}
+
+/// An HTML element: a tag with attributes and child content.
+///
+/// An element may also remember the location in the source text that
+/// it was created from.
+#[derive(Clone, Debug)]
+pub struct Element {
+    /// Where in the source the element comes from, if known.
+    loc: Option<Location>,
+    /// Which HTML element this is.
+    tag: ElementTag,
+    /// Attributes, in the order they were added.
+    attrs: Vec<Attribute>,
+    /// Child content, in document order.
+    children: Vec<Content>,
+}
+
+impl Element {
+    /// Create a new element.
+    ///
+    /// The element starts out without a location, attributes, or
+    /// children.
+    pub fn new(tag: ElementTag) -> Self {
+        Self {
+            loc: None,
+            tag,
+            attrs: vec![],
+            children: vec![],
+        }
+    }
+
+    /// Set the location, consuming and returning the element so the
+    /// call can be chained onto [`Element::new`].
+    fn with_location(mut self, loc: Location) -> Self {
+        self.loc = Some(loc);
+        self
+    }
+
+    /// Set location.
+    pub fn set_location(&mut self, loc: Location) {
+        self.loc = Some(loc);
+    }
+
+    /// Get location.
+    pub fn location(&self) -> &Option<Location> {
+        &self.loc
+    }
+
+    /// Append the attributes parsed from a fenced code block's info
+    /// string, converting each into an HTML attribute.
+    fn set_block_attributes(&mut self, block_attrs: Vec<BlockAttr>) {
+        self.attrs
+            .extend(block_attrs.into_iter().map(Attribute::from));
+    }
+
+    /// Add a new attribute.
+    ///
+    /// Attributes are kept in insertion order; an existing attribute
+    /// with the same name is not replaced.
+    pub fn push_attribute(&mut self, attr: Attribute) {
+        self.attrs.push(attr);
+    }
+
+    /// Append a new child to the element.
+    pub fn push_child(&mut self, child: Content) {
+        self.children.push(child);
+    }
+
+    /// Return an element's tag.
+    pub fn tag(&self) -> ElementTag {
+        self.tag
+    }
+
+    /// All attributes.
+    pub fn all_attrs(&self) -> &[Attribute] {
+        &self.attrs
+    }
+
+    /// Return the first attribute with the given name, if any.
+    pub fn attr(&self, name: &str) -> Option<&Attribute> {
+        self.attrs.iter().find(|a| a.name() == name)
+    }
+
+    /// Has an attribute with a specific value?
+    pub fn has_attr(&self, name: &str, wanted: &str) -> bool {
+        self.attrs
+            .iter()
+            .any(|a| a.name() == name && a.value() == Some(wanted))
+    }
+
+    /// Return the concatenated text content of direct children,
+    /// ignoring any elements.
+    pub fn content(&self) -> String {
+        as_plain_text(&self.children)
+    }
+
+    /// Return all the children of an element.
+    pub fn children(&self) -> &[Content] {
+        &self.children
+    }
+
+    /// Turn the text children of every `img` element in this subtree
+    /// into an `alt` attribute on that element, and drop the children.
+    fn fix_up_img_alt(&mut self) {
+        if self.tag == ElementTag::Img {
+            let alt = as_plain_text(self.children());
+            self.push_attribute(Attribute::new("alt", &alt));
+            self.children.clear();
+        } else {
+            for child in self.children.iter_mut() {
+                if let Content::Elt(kid) = child {
+                    kid.fix_up_img_alt();
+                }
+            }
+        }
+    }
+
+    /// Serialize an element into HTML text.
+    ///
+    /// Text children are escaped, raw HTML children are emitted as
+    /// they are, and a newline is emitted before every child element
+    /// whose tag is a block-level one.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`HtmlError::Format`] if formatting into the output
+    /// string fails.
+    pub fn serialize(&self) -> Result<String, HtmlError> {
+        let mut buf = String::new();
+        self.serialize_to_buf_without_added_newlines(&mut buf)
+            .map_err(HtmlError::Format)?;
+        Ok(buf)
+    }
+
+    /// Is this a void element, one that can have neither content nor
+    /// an end tag in HTML?
+    fn is_void(&self) -> bool {
+        matches!(
+            self.tag,
+            ElementTag::Meta | ElementTag::Img | ElementTag::Br | ElementTag::Hr
+        )
+    }
+
+    /// Serialize the element and its children to the end of `buf`,
+    /// without a newline before the element itself.
+    fn serialize_to_buf_without_added_newlines(
+        &self,
+        buf: &mut String,
+    ) -> Result<(), std::fmt::Error> {
+        write!(buf, "<{}", self.tag.name())?;
+        self.serialize_attrs_to_buf(buf)?;
+
+        // Only void elements may use the self-closing form. In HTML a
+        // trailing slash on any other start tag is ignored, so an
+        // empty `<title/>` or `<div/>` would open an element that is
+        // never closed. Those get an explicit end tag instead.
+        if self.children.is_empty() && self.is_void() {
+            write!(buf, "/>")?;
+            return Ok(());
+        }
+
+        write!(buf, ">")?;
+        for c in self.children() {
+            match c {
+                Content::Text(s) => buf.push_str(&encode_text(s)),
+                Content::Elt(e) => e.serialize_to_buf_adding_block_newline(buf)?,
+                Content::Html(s) => buf.push_str(s),
+            }
+        }
+        write!(buf, "</{}>", self.tag.name())?;
+        Ok(())
+    }
+
+    /// Serialize the element to the end of `buf`, starting a new line
+    /// first if the element is a block-level one.
+    fn serialize_to_buf_adding_block_newline(
+        &self,
+        buf: &mut String,
+    ) -> Result<(), std::fmt::Error> {
+        if self.tag.is_block() {
+            writeln!(buf)?;
+        }
+        self.serialize_to_buf_without_added_newlines(buf)
+    }
+
+    /// Append all attributes to `buf`, each preceded by a space.
+    ///
+    /// Values are escaped for use inside double quotes; an attribute
+    /// without a value is written as just its name.
+    fn serialize_attrs_to_buf(&self, buf: &mut String) -> Result<(), std::fmt::Error> {
+        for attr in self.attrs.iter() {
+            write!(buf, " {}", attr.name())?;
+            if let Some(value) = attr.value() {
+                write!(buf, "=\"{}\"", encode_double_quoted_attribute(value))?;
+            }
+        }
+        Ok(())
+    }
+}
+
+/// The tag of an HTML element.
+///
+/// Only the elements this module needs are covered.
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub enum ElementTag {
+    /// The `html` element.
+    Html,
+    /// The `head` element.
+    Head,
+    /// The `meta` element.
+    Meta,
+    /// The `body` element.
+    Body,
+    /// The `div` element.
+    Div,
+    /// The `h1` element, a level 1 heading.
+    H1,
+    /// The `h2` element, a level 2 heading.
+    H2,
+    /// The `h3` element, a level 3 heading.
+    H3,
+    /// The `h4` element, a level 4 heading.
+    H4,
+    /// The `h5` element, a level 5 heading.
+    H5,
+    /// The `h6` element, a level 6 heading.
+    H6,
+    /// The `p` element, a paragraph.
+    P,
+    /// The `ol` element, an ordered list.
+    Ol,
+    /// The `ul` element, an unordered list.
+    Ul,
+    /// The `li` element, a list item.
+    Li,
+    /// The `blockquote` element.
+    Blockquote,
+    /// The `pre` element.
+    Pre,
+    /// The `em` element, for emphasis.
+    Em,
+    /// The `strong` element, for strong emphasis.
+    Strong,
+    /// The `del` element, for struck-through text.
+    Del,
+    /// The `a` element, a link.
+    A,
+    /// The `img` element, an image.
+    Img,
+    /// The `table` element.
+    Table,
+    /// The `title` element.
+    Title,
+    /// The `th` element.
+    Th,
+    /// The `tr` element, a table row.
+    Tr,
+    /// The `td` element, a table cell.
+    Td,
+    /// The `br` element, a line break.
+    Br,
+    /// The `hr` element, a horizontal rule.
+    Hr,
+    /// The `code` element.
+    Code,
+}
+
+impl ElementTag {
+    /// Name of the tag.
+    ///
+    /// This is the name as it is written in serialized HTML.
+    pub fn name(&self) -> &str {
+        match *self {
+            ElementTag::Html => "html",
+            ElementTag::Head => "head",
+            ElementTag::Meta => "meta",
+            ElementTag::Title => "title",
+            ElementTag::Body => "body",
+            ElementTag::Div => "div",
+            ElementTag::H1 => "h1",
+            ElementTag::H2 => "h2",
+            ElementTag::H3 => "h3",
+            ElementTag::H4 => "h4",
+            ElementTag::H5 => "h5",
+            ElementTag::H6 => "h6",
+            ElementTag::P => "p",
+            ElementTag::Blockquote => "blockquote",
+            ElementTag::Pre => "pre",
+            ElementTag::Ol => "ol",
+            ElementTag::Ul => "ul",
+            ElementTag::Li => "li",
+            ElementTag::Table => "table",
+            ElementTag::Th => "th",
+            ElementTag::Tr => "tr",
+            ElementTag::Td => "td",
+            ElementTag::Em => "em",
+            ElementTag::Strong => "strong",
+            ElementTag::Del => "del",
+            ElementTag::Code => "code",
+            ElementTag::A => "a",
+            ElementTag::Img => "img",
+            ElementTag::Br => "br",
+            ElementTag::Hr => "hr",
+        }
+    }
+
+    /// Is this tag treated as block-level, meaning that a newline is
+    /// written before the element when it is serialized as a child?
+    fn is_block(&self) -> bool {
+        match *self {
+            ElementTag::Html
+            | ElementTag::Head
+            | ElementTag::Meta
+            | ElementTag::Body
+            | ElementTag::Div
+            | ElementTag::H1
+            | ElementTag::H2
+            | ElementTag::H3
+            | ElementTag::H4
+            | ElementTag::H5
+            | ElementTag::H6
+            | ElementTag::P
+            | ElementTag::Ol
+            | ElementTag::Ul
+            | ElementTag::Li
+            | ElementTag::Blockquote
+            | ElementTag::Table
+            | ElementTag::Th
+            | ElementTag::Tr
+            | ElementTag::Br
+            | ElementTag::Hr => true,
+            ElementTag::Pre
+            | ElementTag::Em
+            | ElementTag::Strong
+            | ElementTag::Del
+            | ElementTag::A
+            | ElementTag::Img
+            | ElementTag::Title
+            | ElementTag::Td
+            | ElementTag::Code => false,
+        }
+    }
+}
+
+/// A single attribute on an HTML element: a name and an optional
+/// value.
+#[derive(Debug, Clone)]
+pub struct Attribute {
+    /// The attribute name.
+    name: String,
+    /// The attribute value; `None` for an attribute that consists of
+    /// a name only.
+    value: Option<String>,
+}
+
+impl Attribute {
+    /// Create a new element attribute with the given name and value.
+    ///
+    /// Both strings are copied into the attribute.
+    pub fn new(name: &str, value: &str) -> Self {
+        let name = String::from(name);
+        let value = Some(String::from(value));
+        Self { name, value }
+    }
+
+    /// Return the name of the attribute.
+    pub fn name(&self) -> &str {
+        self.name.as_str()
+    }
+
+    /// Return the value of the attribute, if it has one.
+    pub fn value(&self) -> Option<&str> {
+        self.value.as_deref()
+    }
+}
+
+impl From<BlockAttr> for Attribute {
+    /// Convert a code block attribute into an HTML attribute.
+    ///
+    /// Identifiers become `id`, classes become `class`, and key/value
+    /// pairs keep their own key as the attribute name.
+    fn from(block_attr: BlockAttr) -> Self {
+        let (name, value) = match block_attr {
+            BlockAttr::Id(id) => (String::from("id"), id),
+            BlockAttr::Class(class) => (String::from("class"), class),
+            BlockAttr::KeyValue(key, value) => (key, value),
+        };
+        Self {
+            name,
+            value: Some(value),
+        }
+    }
+}
+
+/// One piece of content inside an HTML element.
+#[derive(Debug, Clone)]
+pub enum Content {
+    /// Plain text. It is escaped when the content is serialized.
+    Text(String),
+
+    /// A nested HTML element.
+    Elt(Element),
+
+    /// Literal HTML markup. It is copied into the output as it is
+    /// when the content is serialized.
+    Html(String),
+}
+
+/// Where in the source text an element comes from, as a line and a
+/// column number.
+#[derive(Clone, Copy, Debug)]
+pub struct Location {
+    /// Line number in the source text.
+    line: usize,
+    /// Column number on that line.
+    col: usize,
+}
+
+impl Location {
+    /// Create a location from a line and a column number, which are
+    /// stored as given.
+    fn new(line: usize, col: usize) -> Self {
+        Self { line, col }
+    }
+
+    /// Return the line number of the location.
+    pub fn line(&self) -> usize {
+        self.line
+    }
+
+    /// Return the column number of the location.
+    pub fn col(&self) -> usize {
+        self.col
+    }
+}
+
+/// Stack of the elements that are currently open while Markdown is
+/// being converted. The last item is the innermost open element.
+struct Stack {
+    /// Open elements, outermost first.
+    stack: Vec<Element>,
+}
+
+impl Stack {
+    /// Create an empty stack.
+    fn new() -> Self {
+        Self { stack: Vec::new() }
+    }
+
+    /// Is the stack empty?
+    fn is_empty(&self) -> bool {
+        self.stack.is_empty()
+    }
+
+    /// Make `e` the innermost open element.
+    fn push(&mut self, e: Element) {
+        trace!("pushed {:?}", e);
+        self.stack.push(e);
+    }
+
+    /// Push a new, empty element with the given tag and location.
+    fn push_tag(&mut self, tag: ElementTag, loc: Location) {
+        let element = Element::new(tag).with_location(loc);
+        self.push(element);
+    }
+
+    /// Remove and return the innermost open element.
+    ///
+    /// Panics if the stack is empty.
+    fn pop(&mut self) -> Element {
+        let top = self.stack.pop().unwrap();
+        trace!("popped {:?}", top);
+        top
+    }
+
+    /// Add `child` as the last child of the innermost open element.
+    ///
+    /// Panics if the stack is empty.
+    fn append_child(&mut self, child: Content) {
+        trace!("appended {:?}", child);
+        self.stack.last_mut().unwrap().push_child(child);
+    }
+
+    /// Add text as the last child of the innermost open element.
+    fn append_str(&mut self, text: &str) {
+        let text = Content::Text(String::from(text));
+        self.append_child(text);
+    }
+
+    /// Add an element as the last child of the innermost open
+    /// element.
+    fn append_element(&mut self, e: Element) {
+        let element = Content::Elt(e);
+        self.append_child(element);
+    }
+}
+
+/// Everything that can go wrong in the `html` module.
+#[derive(Debug, thiserror::Error)]
+pub enum HtmlError {
+    /// A directory could not be created. Carries the directory name
+    /// and the underlying I/O error.
+    #[error("failed to create directory {0}")]
+    CreateDir(PathBuf, #[source] std::io::Error),
+
+    /// A file could not be created. Carries the file name and the
+    /// underlying I/O error.
+    #[error("failed to create file {0}")]
+    CreateFile(PathBuf, #[source] std::io::Error),
+
+    /// Writing to a file failed. Carries the file name and the
+    /// underlying I/O error.
+    #[error("failed to write to file {0}")]
+    FileWrite(PathBuf, #[source] std::io::Error),
+
+    /// The Markdown input looks like it tries to use a definition
+    /// list. Carries the line and column of the offending paragraph.
+    #[error("attempt to use definition lists in Markdown: line {0}, column {1}")]
+    DefinitionList(usize, usize),
+
+    /// Formatting a string failed. This most likely means there is a
+    /// programming error.
+    #[error("string formatting error: {0}")]
+    Format(#[source] std::fmt::Error),
+}
+
+/// One attribute from the info string of a fenced code block.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub enum BlockAttr {
+    /// An identifier, written as `#name`.
+    Id(String),
+    /// A class, written as `.name` or as a bare word.
+    Class(String),
+    /// A key/value pair, written as `key=value`.
+    KeyValue(String, String),
+}
+
+impl BlockAttr {
+    /// Create an identifier attribute.
+    fn id(s: &str) -> Self {
+        Self::Id(s.into())
+    }
+
+    /// Create a class attribute.
+    fn class(s: &str) -> Self {
+        Self::Class(s.into())
+    }
+
+    /// Create a key/value attribute.
+    fn key_value(k: &str, v: &str) -> Self {
+        Self::KeyValue(k.into(), v.into())
+    }
+
+    /// Parse a fenced code block tag.
+    ///
+    /// The tag is a list of words separated by ASCII whitespace,
+    /// optionally enclosed in one pair of curly braces. A word
+    /// starting with `#` is an identifier, a word starting with `.`
+    /// is a class, a word containing `=` is a key/value pair split at
+    /// the first `=`, and any other word is a class.
+    ///
+    /// Parsing never fails: an empty tag gives an empty list.
+    pub fn parse(attrs: &str) -> Vec<Self> {
+        Self::parse_words(attrs).map(Self::parse_word).collect()
+    }
+
+    /// Split a tag into words, after removing one enclosing pair of
+    /// curly braces, if there is one.
+    fn parse_words(attrs: &str) -> impl Iterator<Item = &str> {
+        // The closing brace is only looked for in what follows the
+        // opening one. A lone "{" is thus not taken to be both the
+        // opening and the closing brace, which would make for an
+        // invalid slice and a panic.
+        let words = attrs
+            .strip_prefix('{')
+            .and_then(|rest| rest.strip_suffix('}'))
+            .unwrap_or(attrs);
+        words.split_ascii_whitespace()
+    }
+
+    /// Classify a single word of a tag.
+    fn parse_word(word: &str) -> Self {
+        if let Some(id) = word.strip_prefix('#') {
+            Self::id(id)
+        } else if let Some(class) = word.strip_prefix('.') {
+            Self::class(class)
+        } else if let Some((key, value)) = word.split_once('=') {
+            Self::key_value(key, value)
+        } else {
+            Self::class(word)
+        }
+    }
+}
+
+#[cfg(test)]
+mod test_block_attr {
+    use super::BlockAttr;
+
+    // An empty tag has no attributes at all.
+    #[test]
+    fn empty_string() {
+        let expected: Vec<BlockAttr> = Vec::new();
+        assert_eq!(BlockAttr::parse(""), expected);
+    }
+
+    // A bare word is a class.
+    #[test]
+    fn plain_word() {
+        let expected = vec![BlockAttr::Class(String::from("foo"))];
+        assert_eq!(BlockAttr::parse("foo"), expected);
+    }
+
+    // A leading dot marks a class, and is not part of its name.
+    #[test]
+    fn dot_word() {
+        let expected = vec![BlockAttr::Class(String::from("foo"))];
+        assert_eq!(BlockAttr::parse(".foo"), expected);
+    }
+
+    // A leading hash marks an identifier, and is not part of it.
+    #[test]
+    fn hash_word() {
+        let expected = vec![BlockAttr::Id(String::from("foo"))];
+        assert_eq!(BlockAttr::parse("#foo"), expected);
+    }
+
+    // A word with an equals sign is a key/value pair.
+    #[test]
+    fn key_value() {
+        let expected = vec![BlockAttr::KeyValue(
+            String::from("foo"),
+            String::from("bar"),
+        )];
+        assert_eq!(BlockAttr::parse("foo=bar"), expected);
+    }
+
+    // All kinds of words together, enclosed in curly braces.
+    #[test]
+    fn several() {
+        let expected = vec![
+            BlockAttr::Id(String::from("foo")),
+            BlockAttr::Class(String::from("bar")),
+            BlockAttr::Class(String::from("foobar")),
+            BlockAttr::KeyValue(String::from("yo"), String::from("yoyo")),
+        ];
+        assert_eq!(BlockAttr::parse("{#foo .bar foobar yo=yoyo}"), expected);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 90c183f..194db38 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -34,6 +34,7 @@ mod metadata;
pub use metadata::{Metadata, YamlMetadata};
mod doc;
+pub mod html;
pub mod md;
pub use doc::Document;
pub use doc::{codegen, load_document, load_document_with_pullmark};