//! A representation of HTML using Rust types. #![deny(missing_docs)] use html_escape::{encode_double_quoted_attribute, encode_text}; use line_col::LineColLookup; use log::{debug, trace}; use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::fmt::Write as _; use std::io::Write; use std::path::{Path, PathBuf}; const DOCTYPE: &str = ""; /// A HTML page, consisting of a head and a body. #[derive(Debug)] pub struct HtmlPage { head: Element, body: Element, } impl Default for HtmlPage { fn default() -> Self { Self { head: Element::new(ElementTag::Head), body: Element::new(ElementTag::Body), } } } impl HtmlPage { /// Create a new HTML page from a head and a body element. pub fn new(head: Element, body: Element) -> Self { Self { head, body } } /// Return the page's head element. pub fn head(&self) -> &Element { &self.head } /// Return the page's body element. pub fn body(&self) -> &Element { &self.body } /// Try to serialize an HTML page into HTML text. pub fn serialize(&self) -> Result { let mut html = Element::new(ElementTag::Html); html.push_child(Content::Elt(self.head.clone())); let mut body = Element::new(ElementTag::Body); body.push_child(Content::Elt(self.body.clone())); html.push_child(Content::Elt(body)); let html = html.serialize()?; Ok(format!("{}\n{}", DOCTYPE, html)) } /// Try to write an HTML page as text into a file. 
pub fn write(&self, filename: &Path) -> Result<(), HtmlError> {
    // Serialize before touching the file system, so that a failure here
    // does not leave behind a freshly truncated, empty file.
    let html = self.serialize()?;

    if let Some(parent) = filename.parent() {
        trace!("parent: {}", parent.display());
        if !parent.exists() {
            debug!("creating directory {}", parent.display());
            std::fs::create_dir_all(parent)
                .map_err(|e| HtmlError::CreateDir(parent.into(), e))?;
        }
    }

    trace!("writing HTML: {}", filename.display());
    let mut f = std::fs::File::create(filename)
        .map_err(|e| HtmlError::CreateFile(filename.into(), e))?;
    f.write_all(html.as_bytes())
        .map_err(|e| HtmlError::FileWrite(filename.into(), e))?;
    Ok(())
}
} // end of `impl HtmlPage`

/// Parse Markdown text into an HTML element.
///
/// The Markdown is parsed with heading attributes, strikethrough, tables
/// and task lists enabled. The result is a `div` element containing
/// everything that was parsed. `filename` is only used to label the
/// source location recorded on elements and in errors.
///
/// Headings without an explicit `id` get a generated one that is unique
/// among the ids seen so far in this document.
///
/// # Errors
///
/// Returns [`HtmlError::DefinitionList`] if the plain text of a paragraph
/// starts with `": "` or contains a line starting with `": "`.
///
/// # Panics
///
/// Panics if the parser emits a footnote definition (footnotes are not
/// enabled above), or if start and end events are not balanced.
pub fn parse(filename: &Path, markdown: &str) -> Result<Element, HtmlError> {
    let mut options = Options::empty();
    options.insert(Options::ENABLE_HEADING_ATTRIBUTES);
    options.insert(Options::ENABLE_STRIKETHROUGH);
    options.insert(Options::ENABLE_TABLES);
    options.insert(Options::ENABLE_TASKLISTS);
    let p = Parser::new_ext(markdown, options).into_offset_iter();
    let linecol = LineColLookup::new(markdown);

    // Elements that have been opened but not yet closed. The bottom of
    // the stack is the `div` that is eventually returned.
    let mut stack = Stack::new();
    stack.push(Element::new(ElementTag::Div));
    let mut slugs = Slugs::default();

    for (event, loc) in p {
        trace!("event {:?}", event);
        // Turn the byte offset of the event into a file/line/column.
        let (line, col) = linecol.get(loc.start);
        let loc = Location::new(filename, line, col);
        match event {
            Event::Start(tag) => match tag {
                Tag::Paragraph => stack.push_tag(ElementTag::P, loc),
                Tag::Heading(level, id, classes) => {
                    let tag = match level {
                        HeadingLevel::H1 => ElementTag::H1,
                        HeadingLevel::H2 => ElementTag::H2,
                        HeadingLevel::H3 => ElementTag::H3,
                        HeadingLevel::H4 => ElementTag::H4,
                        HeadingLevel::H5 => ElementTag::H5,
                        HeadingLevel::H6 => ElementTag::H6,
                    };
                    let mut h = Element::new(tag).with_location(loc);
                    if let Some(id) = id {
                        h.push_attribute(Attribute::new("id", id));
                        // Reserve the explicit id so generated ones
                        // do not collide with it.
                        slugs.remember(id);
                    }
                    if !classes.is_empty() {
                        h.push_attribute(Attribute::new("class", &classes.join(" ")));
                    }
                    stack.push(h);
                }
                Tag::BlockQuote => stack.push_tag(ElementTag::Blockquote, loc),
                Tag::CodeBlock(kind) => {
                    let mut pre = Element::new(ElementTag::Pre).with_location(loc);
                    if let CodeBlockKind::Fenced(attrs) = kind {
                        pre.set_block_attributes(BlockAttr::parse(&attrs));
                    }
                    stack.push(pre);
                }
                Tag::List(None) => stack.push_tag(ElementTag::Ul, loc),
                Tag::List(Some(start)) => {
                    let mut e = Element::new(ElementTag::Ol).with_location(loc);
                    e.push_attribute(Attribute::new("start", &start.to_string()));
                    stack.push(e);
                }
                Tag::Item => stack.push_tag(ElementTag::Li, loc),
                Tag::FootnoteDefinition(_) => unreachable!("{:?}", tag),
                Tag::Table(_) => stack.push_tag(ElementTag::Table, loc),
                Tag::TableHead => stack.push_tag(ElementTag::Th, loc),
                Tag::TableRow => stack.push_tag(ElementTag::Tr, loc),
                Tag::TableCell => stack.push_tag(ElementTag::Td, loc),
                Tag::Emphasis => stack.push_tag(ElementTag::Em, loc),
                Tag::Strong => stack.push_tag(ElementTag::Strong, loc),
                Tag::Strikethrough => stack.push_tag(ElementTag::Del, loc),
                Tag::Link(_, url, title) => {
                    let mut link = Element::new(ElementTag::A);
                    link.push_attribute(Attribute::new("href", url.as_ref()));
                    if !title.is_empty() {
                        link.push_attribute(Attribute::new("title", title.as_ref()));
                    }
                    stack.push(link);
                }
                Tag::Image(_, url, title) => {
                    let mut e = Element::new(ElementTag::Img);
                    e.push_attribute(Attribute::new("src", url.as_ref()));
                    // NOTE(review): `alt` is always set here (to the
                    // title, possibly empty), so the "no alt attribute"
                    // branch of `fix_up_img_alt` never fires for images
                    // created by this function. Confirm that using the
                    // title, rather than the image text, is intended.
                    e.push_attribute(Attribute::new("alt", title.as_ref()));
                    if !title.is_empty() {
                        e.push_attribute(Attribute::new("title", title.as_ref()));
                    }
                    stack.push(e);
                }
            },
            Event::End(tag) => match &tag {
                Tag::Paragraph => {
                    trace!("at end of paragraph, looking for definition list use");
                    let e = stack.pop();
                    let s = as_plain_text(e.children());
                    trace!("paragraph text: {:?}", s);
                    if s.starts_with(": ") || s.contains("\n: ") {
                        return Err(HtmlError::DefinitionList(loc));
                    }
                    stack.append_child(Content::Elt(e));
                }
                Tag::Heading(_, _, _) => {
                    let mut e = stack.pop();
                    if e.attr("id").is_none() {
                        // No explicit id: derive one from the heading
                        // text now that all of it has been collected.
                        let slug = slugs.unique(&e.heading_slug());
                        let id = Attribute::new("id", &slug);
                        e.push_attribute(id);
                    }
                    stack.append_child(Content::Elt(e));
                }
                Tag::List(_)
                | Tag::Item
                | Tag::Link(_, _, _)
                | Tag::Image(_, _, _)
                | Tag::Emphasis
                | Tag::Table(_)
                | Tag::TableHead
                | Tag::TableRow
                | Tag::TableCell
                | Tag::Strong
                | Tag::Strikethrough
                | Tag::BlockQuote
                | Tag::CodeBlock(_) => {
                    // The element is complete: attach it to its parent.
                    let e = stack.pop();
                    stack.append_child(Content::Elt(e));
                }
                Tag::FootnoteDefinition(_) => unreachable!("{:?}", tag),
            },
            Event::Text(s) => stack.append_str(s.as_ref()),
            Event::Code(s) => {
                let mut code = Element::new(ElementTag::Code);
                code.push_child(Content::Text(s.to_string()));
                stack.append_element(code);
            }
            Event::Html(s) => stack.append_child(Content::Html(s.to_string())),
            // Footnote references are logged and otherwise dropped.
            Event::FootnoteReference(s) => trace!("footnote ref {:?}", s),
            Event::SoftBreak => stack.append_str("\n"),
            Event::HardBreak => stack.append_element(Element::new(ElementTag::Br)),
            Event::Rule => stack.append_element(Element::new(ElementTag::Hr)),
            Event::TaskListMarker(done) => {
                let marker = if done {
                    "\u{2612} " // Unicode for box with X
                } else {
                    "\u{2610} " // Unicode for empty box
                };
                stack.append_str(marker);
            }
        }
    }

    let mut body = stack.pop();
    assert!(stack.is_empty());
    body.fix_up_img_alt();
    Ok(body)
}

/// Concatenate the text of the `Content::Text` items in `content`.
///
/// Elements and raw HTML are skipped; nested elements are not visited.
fn as_plain_text(content: &[Content]) -> String {
    content
        .iter()
        .filter_map(|c| match c {
            Content::Text(s) => Some(s.as_str()),
            _ => None,
        })
        .collect()
}

/// An HTML element.
#[derive(Debug, Clone)]
pub struct Element {
    loc: Option<Location>,
    tag: ElementTag,
    attrs: Vec<Attribute>,
    children: Vec<Content>,
}

impl Element {
    /// Create a new element.
    ///
    /// The element has no location, no attributes and no children.
    pub fn new(tag: ElementTag) -> Self {
        Self {
            loc: None,
            tag,
            attrs: vec![],
            children: vec![],
        }
    }

    /// Builder-style variant of `set_location`.
    fn with_location(mut self, loc: Location) -> Self {
        self.loc = Some(loc);
        self
    }

    /// Set location.
    pub fn set_location(&mut self, loc: Location) {
        self.loc = Some(loc);
    }

    /// Get location.
pub fn location(&self) -> Location {
    // Hand out a copy; elements without a location report "unknown".
    self.loc.clone().unwrap_or_else(Location::unknown)
}

/// Append the attributes of a fenced code block to this element.
fn set_block_attributes(&mut self, block_attrs: Vec<BlockAttr>) {
    self.attrs
        .extend(block_attrs.into_iter().map(Attribute::from));
}

/// Add a new attribute.
///
/// Existing attributes with the same name are kept.
pub fn push_attribute(&mut self, attr: Attribute) {
    self.attrs.push(attr);
}

/// Drop all attributes with a given name.
///
/// Every attribute whose name equals any entry of `unwanted` is
/// removed.
pub fn drop_attributes(&mut self, unwanted: &[&str]) {
    self.attrs.retain(|a| !unwanted.contains(&a.name()));
}

/// Append a new child to the element.
pub fn push_child(&mut self, child: Content) {
    self.children.push(child);
}

/// Return an element's tag.
pub fn tag(&self) -> ElementTag {
    self.tag
}

/// All attributes.
pub fn all_attrs(&self) -> &[Attribute] {
    &self.attrs
}

/// Return value of a named attribute, if any.
///
/// If several attributes share the name, the first one is returned.
pub fn attr(&self, name: &str) -> Option<&Attribute> {
    self.attrs.iter().find(|a| a.name() == name)
}

/// Has an attribute with a specific value?
pub fn has_attr(&self, name: &str, wanted: &str) -> bool {
    self.attrs
        .iter()
        .any(|a| a.name() == name && a.value() == Some(wanted))
}

/// Build a slug from the element's text content.
///
/// The content is lowercased and everything except the ASCII letters
/// `a` to `z` is dropped, so the result may be empty.
fn heading_slug(&self) -> String {
    self.content()
        .to_lowercase()
        .chars()
        .filter(char::is_ascii_lowercase)
        .collect()
}

/// Return the concatenated content of all children.
///
/// Text and raw HTML children contribute their text as is; element
/// children contribute their own content, recursively.
pub fn content(&self) -> String {
    let mut buf = String::new();
    for child in self.children() {
        buf.push_str(&child.content());
    }
    buf
}

/// Return all the children of an element.
pub fn children(&self) -> &[Content] { &self.children } fn fix_up_img_alt(&mut self) { if self.tag == ElementTag::Img { if !self.attrs.iter().any(|a| a.name() == "alt") { let alt = as_plain_text(self.children()); self.push_attribute(Attribute::new("alt", &alt)); self.children.clear(); } } else { for child in self.children.iter_mut() { if let Content::Elt(kid) = child { kid.fix_up_img_alt(); } } } } /// Serialize an element into HTML text. pub fn serialize(&self) -> Result { let mut buf = String::new(); self.serialize_to_buf_without_added_newlines(&mut buf) .map_err(HtmlError::Format)?; Ok(buf) } fn serialize_to_buf_without_added_newlines( &self, buf: &mut String, ) -> Result<(), std::fmt::Error> { if self.children.is_empty() { write!(buf, "<{}", self.tag.name())?; self.serialize_attrs_to_buf(buf)?; write!(buf, "/>")?; } else { write!(buf, "<{}", self.tag.name())?; self.serialize_attrs_to_buf(buf)?; write!(buf, ">")?; for c in self.children() { match c { Content::Text(s) => buf.push_str(&encode_text(s)), Content::Elt(e) => e.serialize_to_buf_adding_block_newline(buf)?, Content::Html(s) => buf.push_str(s), } } write!(buf, "", self.tag.name())?; } Ok(()) } fn serialize_to_buf_adding_block_newline( &self, buf: &mut String, ) -> Result<(), std::fmt::Error> { if self.tag.is_block() { writeln!(buf)?; } self.serialize_to_buf_without_added_newlines(buf) } fn serialize_attrs_to_buf(&self, buf: &mut String) -> Result<(), std::fmt::Error> { let mut attrs = Attributes::default(); for attr in self.attrs.iter() { attrs.push(attr); } for (name, value) in attrs.iter() { write!(buf, " {}", name)?; if !value.is_empty() { write!(buf, "=\"{}\"", encode_double_quoted_attribute(value))?; } } Ok(()) } } /// The tag of an HTML element. 
#[derive(Copy, Clone, Debug, Eq, PartialEq)] #[allow(missing_docs)] pub enum ElementTag { Html, Head, Meta, Body, Div, H1, H2, H3, H4, H5, H6, P, Ol, Ul, Li, Link, Blockquote, Pre, Em, Strong, Del, A, Img, Table, Title, Th, Tr, Td, Br, Hr, Code, Span, Style, } impl ElementTag { /// Name of the tag. pub fn name(&self) -> &str { match self { Self::Html => "html", Self::Head => "head", Self::Meta => "meta", Self::Body => "body", Self::Div => "div", Self::H1 => "h1", Self::H2 => "h2", Self::H3 => "h3", Self::H4 => "h4", Self::H5 => "h5", Self::H6 => "h6", Self::P => "p", Self::Ol => "ol", Self::Ul => "ul", Self::Li => "li", Self::Link => "link", Self::Blockquote => "blockquote", Self::Pre => "pre", Self::Em => "em", Self::Strong => "strong", Self::Del => "del", Self::A => "a", Self::Img => "img", Self::Table => "table", Self::Th => "th", Self::Title => "title", Self::Tr => "tr", Self::Td => "td", Self::Br => "br", Self::Hr => "hr", Self::Code => "code", Self::Span => "span", Self::Style => "style", } } fn is_block(&self) -> bool { matches!( self, Self::Html | Self::Head | Self::Meta | Self::Body | Self::Div | Self::H1 | Self::H2 | Self::H3 | Self::H4 | Self::H5 | Self::H6 | Self::P | Self::Ol | Self::Ul | Self::Li | Self::Blockquote | Self::Table | Self::Th | Self::Tr | Self::Br | Self::Hr ) } } #[derive(Debug, Default, Clone)] struct Attributes { attrs: HashMap, } impl Attributes { fn push(&mut self, attr: &Attribute) { if let Some(new_value) = attr.value() { if let Some(old_value) = self.attrs.get_mut(attr.name()) { assert!(!old_value.is_empty()); old_value.push(' '); old_value.push_str(new_value); } else { self.attrs.insert(attr.name().into(), new_value.into()); } } else { assert!(!self.attrs.contains_key(attr.name())); self.attrs.insert(attr.name().into(), "".into()); } } fn iter(&self) -> impl Iterator { self.attrs.iter() } } /// An attribute of an HTML element. 
#[derive(Clone, Debug)]
pub struct Attribute {
    name: String,
    value: Option<String>,
}

impl Attribute {
    /// Create a new element attribute.
    pub fn new(name: &str, value: &str) -> Self {
        Self {
            name: name.into(),
            value: Some(value.into()),
        }
    }

    /// Return the name of the attribute.
    pub fn name(&self) -> &str {
        &self.name
    }

    /// Return the value of the attribute, if any.
    pub fn value(&self) -> Option<&str> {
        self.value.as_deref()
    }
}

impl From<BlockAttr> for Attribute {
    /// Convert a code block attribute into an element attribute.
    ///
    /// Identifiers become `id` attributes, classes become `class`
    /// attributes, and key/value pairs keep their key as the name.
    fn from(block_attr: BlockAttr) -> Self {
        match block_attr {
            BlockAttr::Id(v) => Self::new("id", &v),
            BlockAttr::Class(v) => Self::new("class", &v),
            BlockAttr::KeyValue(k, v) => Self::new(&k, &v),
        }
    }
}

/// Content in HTML.
#[derive(Clone, Debug)]
pub enum Content {
    /// Arbitrary text.
    Text(String),

    /// An HTML element.
    Elt(Element),

    /// Arbitrary HTML text.
    Html(String),
}

impl Content {
    /// Return the content as a string.
    ///
    /// Text and raw HTML are returned as is; for an element, the
    /// element's own content is returned.
    fn content(&self) -> String {
        match self {
            Self::Text(s) => s.clone(),
            Self::Elt(e) => e.content(),
            Self::Html(h) => h.clone(),
        }
    }
}

/// Location of element in source file.
#[derive(Debug, Clone, Eq, Serialize, Deserialize, PartialEq)]
#[serde(untagged)]
pub enum Location {
    /// A known location.
    Known {
        /// Name of file.
        filename: PathBuf,
        /// Line in file.
        line: usize,
        /// Column in line.
        col: usize,
    },
    /// An unknown location.
    Unknown,
}

impl Location {
    /// Create a new location.
    pub fn new(filename: &Path, line: usize, col: usize) -> Self {
        Self::Known {
            filename: filename.into(),
            line,
            col,
        }
    }

    /// Create an unknown location.
    pub fn unknown() -> Self {
        Self::Unknown
    }

    /// Report name of source file from where this element comes from.
    ///
    /// For an unknown location this is the empty path.
    pub fn filename(&self) -> &Path {
        match self {
            Self::Known { filename, .. } => filename,
            Self::Unknown => Path::new(""),
        }
    }

    /// Report row and column in source where this element comes from.
    ///
    /// For an unknown location this is `(0, 0)`.
    pub fn rowcol(&self) -> (usize, usize) {
        match self {
            Self::Known { line, col, .. } => (*line, *col),
            Self::Unknown => (0, 0),
        }
    }
}

impl std::fmt::Display for Location {
    /// Format a known location as `filename:line:col`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        match self {
            Self::Known {
                filename,
                line,
                col,
            } => write!(f, "{}:{}:{}", filename.display(), line, col),
            Self::Unknown => write!(f, "(unknown location)"),
        }
    }
}

/// Stack of elements that are still being built.
///
/// The last element is the innermost open one; new content is appended
/// to it.
struct Stack {
    stack: Vec<Element>,
}

impl Stack {
    /// Create an empty stack.
    fn new() -> Self {
        Self { stack: vec![] }
    }

    /// Is the stack empty?
    fn is_empty(&self) -> bool {
        self.stack.is_empty()
    }

    /// Push an element so that it becomes the innermost open element.
    fn push(&mut self, e: Element) {
        trace!("pushed {:?}", e);
        self.stack.push(e);
    }

    /// Push a new, empty element with the given tag and location.
    fn push_tag(&mut self, tag: ElementTag, loc: Location) {
        self.push(Element::new(tag).with_location(loc));
    }

    /// Remove and return the innermost open element.
    ///
    /// # Panics
    ///
    /// Panics if the stack is empty.
    fn pop(&mut self) -> Element {
        let e = self
            .stack
            .pop()
            .expect("element stack must not be empty when popping");
        trace!("popped {:?}", e);
        e
    }

    /// Append content to the innermost open element.
    ///
    /// # Panics
    ///
    /// Panics if the stack is empty.
    fn append_child(&mut self, child: Content) {
        trace!("appended {:?}", child);
        self.stack
            .last_mut()
            .expect("element stack must not be empty when appending")
            .push_child(child);
    }

    /// Append text to the innermost open element.
    fn append_str(&mut self, text: &str) {
        self.append_child(Content::Text(text.into()));
    }

    /// Append an element to the innermost open element.
    fn append_element(&mut self, e: Element) {
        self.append_child(Content::Elt(e));
    }
}

/// Errors from the `html` module.
#[derive(Debug, thiserror::Error)]
pub enum HtmlError {
    /// Failed to create a directory.
    #[error("failed to create directory {0}")]
    CreateDir(PathBuf, #[source] std::io::Error),

    /// Failed to create a file.
    #[error("failed to create file {0}")]
    CreateFile(PathBuf, #[source] std::io::Error),

    /// Failed to write to a file.
    #[error("failed to write to file {0}")]
    FileWrite(PathBuf, #[source] std::io::Error),

    /// Input contains an attempt to use a definition list in
    /// Markdown.
    #[error("{0}: attempt to use definition lists in Markdown")]
    DefinitionList(Location),

    /// String formatting error. This is likely a programming error.
    #[error("string formatting error: {0}")]
    Format(#[source] std::fmt::Error),
}

/// Code block attribute.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum BlockAttr {
    /// An identifier.
    Id(String),
    /// A class.
    Class(String),
    /// A key/value pair.
    KeyValue(String, String),
}

impl BlockAttr {
    /// Create an identifier attribute.
    fn id(s: &str) -> Self {
        Self::Id(s.into())
    }

    /// Create a class attribute.
    fn class(s: &str) -> Self {
        Self::Class(s.into())
    }

    /// Create a key/value attribute.
    fn key_value(k: &str, v: &str) -> Self {
        Self::KeyValue(k.into(), v.into())
    }

    /// Parse a fenced code block tag.
    ///
    /// The tag is split into words at ASCII whitespace, after removing
    /// one pair of surrounding braces if present. A word starting with
    /// `#` is an identifier, a word starting with `.` is a class, a word
    /// containing `=` is a key/value pair, and any other word is a
    /// class. The attributes are returned in the order of the words.
    pub fn parse(attrs: &str) -> Vec<Self> {
        Self::parse_words(attrs).map(Self::parse_word).collect()
    }

    /// Split a code block tag into words, ignoring surrounding braces.
    fn parse_words(attrs: &str) -> impl Iterator<Item = &str> {
        // Braces are only removed as a pair: a tag with just one of
        // them is split as it is.
        attrs
            .strip_prefix('{')
            .and_then(|rest| rest.strip_suffix('}'))
            .unwrap_or(attrs)
            .split_ascii_whitespace()
    }

    /// Turn a single word into an attribute.
    fn parse_word(word: &str) -> Self {
        if let Some(id) = word.strip_prefix('#') {
            Self::id(id)
        } else if let Some(class) = word.strip_prefix('.') {
            Self::class(class)
        } else if let Some((key, value)) = word.split_once('=') {
            Self::key_value(key, value)
        } else {
            Self::class(word)
        }
    }
}

#[cfg(test)]
mod test_block_attr {
    use super::BlockAttr;

    #[test]
    fn empty_string() {
        assert_eq!(BlockAttr::parse(""), vec![]);
    }

    #[test]
    fn plain_word() {
        assert_eq!(
            BlockAttr::parse("foo"),
            vec![BlockAttr::Class("foo".into())]
        );
    }

    #[test]
    fn dot_word() {
        assert_eq!(
            BlockAttr::parse(".foo"),
            vec![BlockAttr::Class("foo".into())]
        );
    }

    #[test]
    fn hash_word() {
        assert_eq!(BlockAttr::parse("#foo"), vec![BlockAttr::Id("foo".into())]);
    }

    #[test]
    fn key_value() {
        assert_eq!(
            BlockAttr::parse("foo=bar"),
            vec![BlockAttr::KeyValue("foo".into(), "bar".into())]
        );
    }

    #[test]
    fn several() {
        assert_eq!(
            BlockAttr::parse("{#foo .bar foobar yo=yoyo}"),
            vec![
                BlockAttr::Id("foo".into()),
                BlockAttr::Class("bar".into()),
                BlockAttr::Class("foobar".into()),
                BlockAttr::KeyValue("yo".into(), "yoyo".into()),
            ]
        );
    }
}

/// The set of slugs already in use, for generating unique new ones.
#[derive(Debug, Default)]
struct Slugs {
    slugs: HashSet<String>,
}

impl Slugs {
    /// Characters are taken from a candidate until the slug is at least
    /// this many bytes long.
    const MAX: usize = 8;

    /// Record a slug as being in use.
    fn remember(&mut self, slug: &str) {
        self.slugs.insert(slug.into());
    }

    /// Return a slug based on `candidate` that is not yet in use, and
    /// record it as being in use.
    fn unique(&mut self, candidate: &str) -> String {
        let slug = self.helper(candidate);
        self.remember(&slug);
        slug
    }

    /// Find an unused slug for `candidate` without recording it.
    ///
    /// The candidate is shortened first. If the shortened form is
    /// taken, the numbers 1, 2, 3, ... are appended in turn until an
    /// unused slug is found.
    fn helper(&self, candidate: &str) -> String {
        let mut base = String::new();
        for c in candidate.chars() {
            // The limit is checked in bytes before each character is
            // added.
            if base.len() >= Self::MAX {
                break;
            }
            base.push(c);
        }

        if !self.slugs.contains(&base) {
            return base;
        }

        (1usize..)
            .map(|i| format!("{}{}", base, i))
            .find(|slug| !self.slugs.contains(slug))
            .expect("an unused numbered slug always exists")
    }
}