//! A representation of HTML using Rust types.
#![deny(missing_docs)]
use html_escape::{encode_double_quoted_attribute, encode_text};
use line_col::LineColLookup;
use log::{debug, trace};
use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Options, Parser, Tag};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::fmt::Write as _;
use std::io::Write;
use std::path::{Path, PathBuf};
/// Document type declaration written as the first line of every
/// serialized page. It must not be empty: without it the output starts
/// with a blank line and browsers fall back to quirks mode.
const DOCTYPE: &str = "<!DOCTYPE html>";
/// An HTML page, consisting of a head and a body.
///
/// The two parts are kept as separate elements and are only combined
/// into a single `html` element when the page is serialized.
#[derive(Debug)]
pub struct HtmlPage {
// Element added as-is as the first child of the serialized `html` element.
head: Element,
// Page content; `serialize` wraps a copy of it in a fresh `body` element.
body: Element,
}
impl Default for HtmlPage {
    /// Build an empty page: a `head` element and a `body` element,
    /// neither of which has attributes or children.
    fn default() -> Self {
        let head = Element::new(ElementTag::Head);
        let body = Element::new(ElementTag::Body);
        Self { head, body }
    }
}
impl HtmlPage {
/// Create a new HTML page from a head and a body element.
pub fn new(head: Element, body: Element) -> Self {
Self { head, body }
}
/// Return the page's head element.
pub fn head(&self) -> &Element {
&self.head
}
/// Return the page's body element.
pub fn body(&self) -> &Element {
&self.body
}
/// Try to serialize an HTML page into HTML text.
pub fn serialize(&self) -> Result {
let mut html = Element::new(ElementTag::Html);
html.push_child(Content::Elt(self.head.clone()));
let mut body = Element::new(ElementTag::Body);
body.push_child(Content::Elt(self.body.clone()));
html.push_child(Content::Elt(body));
let html = html.serialize()?;
Ok(format!("{}\n{}", DOCTYPE, html))
}
/// Try to write an HTML page as text into a file.
pub fn write(&self, filename: &Path) -> Result<(), HtmlError> {
if let Some(parent) = filename.parent() {
trace!("parent: {}", parent.display());
if !parent.exists() {
debug!("creating directory {}", parent.display());
std::fs::create_dir_all(parent)
.map_err(|e| HtmlError::CreateDir(parent.into(), e))?;
}
}
trace!("writing HTML: {}", filename.display());
let mut f = std::fs::File::create(filename)
.map_err(|e| HtmlError::CreateFile(filename.into(), e))?;
let html = self.serialize()?;
f.write_all(html.as_bytes())
.map_err(|e| HtmlError::FileWrite(filename.into(), e))?;
Ok(())
}
}
/// Parse Markdown text into an HTML element.
pub fn parse(filename: &Path, markdown: &str) -> Result {
let mut options = Options::empty();
options.insert(Options::ENABLE_HEADING_ATTRIBUTES);
options.insert(Options::ENABLE_STRIKETHROUGH);
options.insert(Options::ENABLE_TABLES);
options.insert(Options::ENABLE_TASKLISTS);
let p = Parser::new_ext(markdown, options).into_offset_iter();
let linecol = LineColLookup::new(markdown);
let mut stack = Stack::new();
stack.push(Element::new(ElementTag::Div));
let mut slugs = Slugs::default();
for (event, loc) in p {
trace!("event {:?}", event);
let (line, col) = linecol.get(loc.start);
let loc = Location::new(filename, line, col);
match event {
Event::Start(tag) => match tag {
Tag::Paragraph => stack.push_tag(ElementTag::P, loc),
Tag::Heading(level, id, classes) => {
let tag = match level {
HeadingLevel::H1 => ElementTag::H1,
HeadingLevel::H2 => ElementTag::H2,
HeadingLevel::H3 => ElementTag::H3,
HeadingLevel::H4 => ElementTag::H4,
HeadingLevel::H5 => ElementTag::H5,
HeadingLevel::H6 => ElementTag::H6,
};
let mut h = Element::new(tag).with_location(loc);
if let Some(id) = id {
h.push_attribute(Attribute::new("id", id));
slugs.remember(id);
}
if !classes.is_empty() {
let mut names = String::new();
for c in classes {
if !names.is_empty() {
names.push(' ');
}
names.push_str(c);
}
h.push_attribute(Attribute::new("class", &names));
}
stack.push(h);
}
Tag::BlockQuote => stack.push_tag(ElementTag::Blockquote, loc),
Tag::CodeBlock(kind) => {
stack.push_tag(ElementTag::Pre, loc);
if let CodeBlockKind::Fenced(attrs) = kind {
let mut e = stack.pop();
e.set_block_attributes(BlockAttr::parse(&attrs));
stack.push(e);
}
}
Tag::List(None) => stack.push_tag(ElementTag::Ul, loc),
Tag::List(Some(start)) => {
let mut e = Element::new(ElementTag::Ol).with_location(loc);
e.push_attribute(Attribute::new("start", &format!("{}", start)));
stack.push(e);
}
Tag::Item => stack.push_tag(ElementTag::Li, loc),
Tag::FootnoteDefinition(_) => unreachable!("{:?}", tag),
Tag::Table(_) => stack.push_tag(ElementTag::Table, loc),
Tag::TableHead => stack.push_tag(ElementTag::Th, loc),
Tag::TableRow => stack.push_tag(ElementTag::Tr, loc),
Tag::TableCell => stack.push_tag(ElementTag::Td, loc),
Tag::Emphasis => stack.push_tag(ElementTag::Em, loc),
Tag::Strong => stack.push_tag(ElementTag::Strong, loc),
Tag::Strikethrough => stack.push_tag(ElementTag::Del, loc),
Tag::Link(_, url, title) => {
let mut link = Element::new(ElementTag::A);
link.push_attribute(Attribute::new("href", url.as_ref()));
if !title.is_empty() {
link.push_attribute(Attribute::new("title", title.as_ref()));
}
stack.push(link);
}
Tag::Image(_, url, title) => {
let mut e = Element::new(ElementTag::Img);
e.push_attribute(Attribute::new("src", url.as_ref()));
e.push_attribute(Attribute::new("alt", title.as_ref()));
if !title.is_empty() {
e.push_attribute(Attribute::new("title", title.as_ref()));
}
stack.push(e);
}
},
Event::End(tag) => match &tag {
Tag::Paragraph => {
trace!("at end of paragraph, looking for definition list use");
let e = stack.pop();
let s = as_plain_text(e.children());
trace!("paragraph text: {:?}", s);
if s.starts_with(": ") || s.contains("\n: ") {
return Err(HtmlError::DefinitionList(loc));
}
stack.append_child(Content::Elt(e));
}
Tag::Heading(_, _, _) => {
let mut e = stack.pop();
if e.attr("id").is_none() {
let slug = slugs.unique(&e.heading_slug());
let id = Attribute::new("id", &slug);
e.push_attribute(id);
}
stack.append_child(Content::Elt(e));
}
Tag::List(_)
| Tag::Item
| Tag::Link(_, _, _)
| Tag::Image(_, _, _)
| Tag::Emphasis
| Tag::Table(_)
| Tag::TableHead
| Tag::TableRow
| Tag::TableCell
| Tag::Strong
| Tag::Strikethrough
| Tag::BlockQuote
| Tag::CodeBlock(_) => {
let e = stack.pop();
stack.append_child(Content::Elt(e));
}
Tag::FootnoteDefinition(_) => unreachable!("{:?}", tag),
},
Event::Text(s) => stack.append_str(s.as_ref()),
Event::Code(s) => {
let mut code = Element::new(ElementTag::Code);
code.push_child(Content::Text(s.to_string()));
stack.append_element(code);
}
Event::Html(s) => stack.append_child(Content::Html(s.to_string())),
Event::FootnoteReference(s) => trace!("footnote ref {:?}", s),
Event::SoftBreak => stack.append_str("\n"),
Event::HardBreak => stack.append_element(Element::new(ElementTag::Br)),
Event::Rule => stack.append_element(Element::new(ElementTag::Hr)),
Event::TaskListMarker(done) => {
let marker = if done {
"\u{2612} " // Unicode for box with X
} else {
"\u{2610} " // Unicode for empty box
};
stack.append_str(marker);
}
}
}
let mut body = stack.pop();
assert!(stack.is_empty());
body.fix_up_img_alt();
Ok(body)
}
// Concatenate the text of the `Content::Text` items in `content`, in
// order. Elements and raw HTML are skipped; nested text is not collected.
fn as_plain_text(content: &[Content]) -> String {
    content
        .iter()
        .filter_map(|item| match item {
            Content::Text(text) => Some(text.as_str()),
            _ => None,
        })
        .collect()
}
/// An HTML element.
#[derive(Debug, Clone)]
pub struct Element {
loc: Option,
tag: ElementTag,
attrs: Vec,
children: Vec,
}
impl Element {
/// Create a new element.
pub fn new(tag: ElementTag) -> Self {
Self {
loc: None,
tag,
attrs: vec![],
children: vec![],
}
}
fn with_location(mut self, loc: Location) -> Self {
self.loc = Some(loc);
self
}
/// Set location.
pub fn set_location(&mut self, loc: Location) {
self.loc = Some(loc);
}
/// Get location.
pub fn location(&self) -> Location {
if let Some(loc) = &self.loc {
loc.clone()
} else {
Location::unknown()
}
}
fn set_block_attributes(&mut self, block_attrs: Vec) {
for block_attr in block_attrs {
let attr = Attribute::from(block_attr);
self.attrs.push(attr);
}
}
/// Add a new attribute.
pub fn push_attribute(&mut self, attr: Attribute) {
self.attrs.push(attr);
}
/// Drop all attributes with a given name.
pub fn drop_attributes(&mut self, unwanted: &[&str]) {
for uw in unwanted {
self.attrs.retain(|a| a.name() != *uw);
}
}
/// Append a new child to the element.
pub fn push_child(&mut self, child: Content) {
self.children.push(child);
}
/// Return an element's tag.
pub fn tag(&self) -> ElementTag {
self.tag
}
/// All attributes.
pub fn all_attrs(&self) -> &[Attribute] {
&self.attrs
}
/// Return value of a named attribute, if any.
pub fn attr(&self, name: &str) -> Option<&Attribute> {
self.attrs.iter().find(|a| a.name() == name)
}
/// Has an attribute with a specific value?
pub fn has_attr(&self, name: &str, wanted: &str) -> bool {
self.attrs
.iter()
.filter(|a| a.name() == name && a.value() == Some(wanted))
.count()
> 0
}
fn heading_slug(&self) -> String {
const SAFE: &str = "abcdefghijklmnopqrstuvwxyz";
let mut slug = String::new();
for s in self.content().to_lowercase().split_whitespace() {
for c in s.chars() {
if SAFE.contains(c) {
slug.push(c);
}
}
}
slug
}
/// Return the concatenated text content of direct children,
/// ignoring any elements.
pub fn content(&self) -> String {
let mut buf = String::new();
for child in self.children() {
buf.push_str(&child.content());
}
buf
}
/// Return all the children of an element.
pub fn children(&self) -> &[Content] {
&self.children
}
fn fix_up_img_alt(&mut self) {
if self.tag == ElementTag::Img {
if !self.attrs.iter().any(|a| a.name() == "alt") {
let alt = as_plain_text(self.children());
self.push_attribute(Attribute::new("alt", &alt));
self.children.clear();
}
} else {
for child in self.children.iter_mut() {
if let Content::Elt(kid) = child {
kid.fix_up_img_alt();
}
}
}
}
/// Serialize an element into HTML text.
pub fn serialize(&self) -> Result {
let mut buf = String::new();
self.serialize_to_buf_without_added_newlines(&mut buf)
.map_err(HtmlError::Format)?;
Ok(buf)
}
fn serialize_to_buf_without_added_newlines(
&self,
buf: &mut String,
) -> Result<(), std::fmt::Error> {
if self.children.is_empty() {
write!(buf, "<{}", self.tag.name())?;
self.serialize_attrs_to_buf(buf)?;
write!(buf, "/>")?;
} else {
write!(buf, "<{}", self.tag.name())?;
self.serialize_attrs_to_buf(buf)?;
write!(buf, ">")?;
for c in self.children() {
match c {
Content::Text(s) => buf.push_str(&encode_text(s)),
Content::Elt(e) => e.serialize_to_buf_adding_block_newline(buf)?,
Content::Html(s) => buf.push_str(s),
}
}
write!(buf, "{}>", self.tag.name())?;
}
Ok(())
}
fn serialize_to_buf_adding_block_newline(
&self,
buf: &mut String,
) -> Result<(), std::fmt::Error> {
if self.tag.is_block() {
writeln!(buf)?;
}
self.serialize_to_buf_without_added_newlines(buf)
}
fn serialize_attrs_to_buf(&self, buf: &mut String) -> Result<(), std::fmt::Error> {
let mut attrs = Attributes::default();
for attr in self.attrs.iter() {
attrs.push(attr);
}
for (name, value) in attrs.iter() {
write!(buf, " {}", name)?;
if !value.is_empty() {
write!(buf, "=\"{}\"", encode_double_quoted_attribute(value))?;
}
}
Ok(())
}
}
/// The tag of an HTML element.
///
/// Only the tags listed here can be represented.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
// Each variant is named after the HTML tag it stands for, so
// per-variant documentation would add nothing.
#[allow(missing_docs)]
pub enum ElementTag {
    Html,
    Head,
    Meta,
    Body,
    Div,
    H1,
    H2,
    H3,
    H4,
    H5,
    H6,
    P,
    Ol,
    Ul,
    Li,
    Link,
    Blockquote,
    Pre,
    Em,
    Strong,
    Del,
    A,
    Img,
    Table,
    Title,
    Th,
    Tr,
    Td,
    Br,
    Hr,
    Code,
    Span,
    Style,
}

impl ElementTag {
    /// Name of the tag, as it is written in HTML text.
    pub fn name(&self) -> &str {
        match *self {
            // Document structure and metadata.
            Self::Html => "html",
            Self::Head => "head",
            Self::Meta => "meta",
            Self::Link => "link",
            Self::Title => "title",
            Self::Style => "style",
            Self::Body => "body",
            // Sections and headings.
            Self::Div => "div",
            Self::H1 => "h1",
            Self::H2 => "h2",
            Self::H3 => "h3",
            Self::H4 => "h4",
            Self::H5 => "h5",
            Self::H6 => "h6",
            // Paragraph-level content.
            Self::P => "p",
            Self::Blockquote => "blockquote",
            Self::Pre => "pre",
            Self::Hr => "hr",
            // Lists.
            Self::Ol => "ol",
            Self::Ul => "ul",
            Self::Li => "li",
            // Tables.
            Self::Table => "table",
            Self::Th => "th",
            Self::Tr => "tr",
            Self::Td => "td",
            // Inline content.
            Self::Em => "em",
            Self::Strong => "strong",
            Self::Del => "del",
            Self::A => "a",
            Self::Img => "img",
            Self::Br => "br",
            Self::Code => "code",
            Self::Span => "span",
        }
    }

    // Should this element start on a new line when it is serialized as
    // the child of another element?
    fn is_block(&self) -> bool {
        match *self {
            Self::Html
            | Self::Head
            | Self::Meta
            | Self::Body
            | Self::Div
            | Self::H1
            | Self::H2
            | Self::H3
            | Self::H4
            | Self::H5
            | Self::H6
            | Self::P
            | Self::Ol
            | Self::Ul
            | Self::Li
            | Self::Blockquote
            | Self::Table
            | Self::Th
            | Self::Tr
            | Self::Br
            | Self::Hr => true,
            Self::Link
            | Self::Pre
            | Self::Em
            | Self::Strong
            | Self::Del
            | Self::A
            | Self::Img
            | Self::Title
            | Self::Td
            | Self::Code
            | Self::Span
            | Self::Style => false,
        }
    }
}
// The attributes of one element, merged by name for serialization: the
// values of attributes sharing a name are joined with single spaces.
#[derive(Debug, Default, Clone)]
struct Attributes {
    // Merged value per attribute name; an empty string stands for an
    // attribute without a value.
    attrs: HashMap<String, String>,
}

impl Attributes {
    // Merge one attribute into the collection.
    //
    // A missing or empty value only makes sure the name is present.
    // Attribute names and values can come straight from the document
    // (e.g. a code block info string), so repeated or empty attributes
    // must never cause a panic.
    fn push(&mut self, attr: &Attribute) {
        let merged = self.attrs.entry(attr.name().into()).or_default();
        match attr.value() {
            Some(new_value) if !new_value.is_empty() => {
                if !merged.is_empty() {
                    merged.push(' ');
                }
                merged.push_str(new_value);
            }
            _ => {}
        }
    }

    // Iterate over (name, merged value) pairs, sorted by name. Hash map
    // iteration order varies between runs; sorting keeps serialized
    // output reproducible.
    fn iter(&self) -> impl Iterator<Item = (&String, &String)> {
        let mut pairs: Vec<_> = self.attrs.iter().collect();
        pairs.sort_unstable();
        pairs.into_iter()
    }
}

/// An attribute of an HTML element.
#[derive(Clone, Debug)]
pub struct Attribute {
    /// Name of the attribute.
    name: String,
    /// Value of the attribute, if it has one.
    value: Option<String>,
}

impl Attribute {
    /// Create a new element attribute with a name and a value.
    pub fn new(name: &str, value: &str) -> Self {
        Self {
            name: name.into(),
            value: Some(value.into()),
        }
    }

    /// Return the name of the attribute.
    pub fn name(&self) -> &str {
        &self.name
    }

    /// Return the value of the attribute, if any.
    pub fn value(&self) -> Option<&str> {
        self.value.as_deref()
    }
}
impl From for Attribute {
fn from(block_attr: BlockAttr) -> Self {
match block_attr {
BlockAttr::Id(v) => Self::new("id", &v),
BlockAttr::Class(v) => Self::new("class", &v),
BlockAttr::KeyValue(k, v) => Self::new(&k, &v),
}
}
}
/// Content in HTML.
///
/// This is what an element can have as a child.
#[derive(Clone, Debug)]
pub enum Content {
/// Arbitrary text. It is escaped when the parent element is
/// serialized.
Text(String),
/// An HTML element.
Elt(Element),
/// Arbitrary HTML text. It is copied into serialized output as is,
/// without any escaping.
Html(String),
}
impl Content {
    // Textual content of this item: the string itself for text and raw
    // HTML, the (recursive) content of the element otherwise.
    fn content(&self) -> String {
        match self {
            Self::Elt(element) => element.content(),
            Self::Text(text) | Self::Html(text) => text.clone(),
        }
    }
}
/// Location of element in source file.
///
/// A known location is displayed as `filename:line:col`, an unknown one
/// as `(unknown location)`.
#[derive(Debug, Clone, Eq, Serialize, Deserialize, PartialEq)]
#[serde(untagged)]
pub enum Location {
/// A known location.
Known {
/// Name of file.
filename: PathBuf,
/// Line in file.
line: usize,
/// Column in line.
col: usize,
},
/// An unknown location.
Unknown,
}
impl Location {
    /// Create a new, known location from a file name, a line and a column.
    pub fn new(filename: &Path, line: usize, col: usize) -> Self {
        let filename = filename.to_path_buf();
        Self::Known {
            filename,
            line,
            col,
        }
    }

    /// Create an unknown location.
    pub fn unknown() -> Self {
        Self::Unknown
    }

    /// Report name of the source file this element comes from. For an
    /// unknown location this is the empty path.
    pub fn filename(&self) -> &Path {
        match self {
            Self::Known { filename, .. } => filename.as_path(),
            Self::Unknown => Path::new(""),
        }
    }

    /// Report row and column in the source where this element comes
    /// from. For an unknown location this is `(0, 0)`.
    pub fn rowcol(&self) -> (usize, usize) {
        match self {
            Self::Known { line, col, .. } => (*line, *col),
            Self::Unknown => (0, 0),
        }
    }
}
// Human-readable form, as used in error messages: "filename:line:col"
// for a known location, a fixed placeholder otherwise.
impl std::fmt::Display for Location {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        match self {
            Self::Unknown => write!(f, "(unknown location)"),
            Self::Known {
                filename,
                line,
                col,
            } => write!(f, "{}:{}:{}", filename.display(), line, col),
        }
    }
}
struct Stack {
stack: Vec,
}
impl Stack {
    // Create an empty stack.
    fn new() -> Self {
        Self { stack: Vec::new() }
    }

    // Is there no open element at all?
    fn is_empty(&self) -> bool {
        self.stack.is_empty()
    }

    // Open an element: it becomes the innermost one.
    fn push(&mut self, e: Element) {
        trace!("pushed {:?}", e);
        self.stack.push(e);
    }

    // Open a new, empty element with the given tag and location.
    fn push_tag(&mut self, tag: ElementTag, loc: Location) {
        let element = Element::new(tag).with_location(loc);
        self.push(element);
    }

    // Close the innermost element and return it. Panics if the stack is
    // empty.
    fn pop(&mut self) -> Element {
        let top = self.stack.pop().unwrap();
        trace!("popped {:?}", top);
        top
    }

    // Add a child to the innermost open element. Panics if the stack is
    // empty.
    fn append_child(&mut self, child: Content) {
        trace!("appended {:?}", child);
        self.stack.last_mut().unwrap().push_child(child);
    }

    // Add text to the innermost open element.
    fn append_str(&mut self, text: &str) {
        self.append_child(Content::Text(String::from(text)));
    }

    // Add a complete element to the innermost open element.
    fn append_element(&mut self, e: Element) {
        self.append_child(Content::Elt(e));
    }
}
/// Errors from the `html` module.
///
/// The file system variants carry the path that was being operated on
/// and, as the error source, the underlying I/O error.
#[derive(Debug, thiserror::Error)]
pub enum HtmlError {
/// Failed to create a directory.
#[error("failed to create directory {0}")]
CreateDir(PathBuf, #[source] std::io::Error),
/// Failed to create a file.
#[error("failed to create file {0}")]
CreateFile(PathBuf, #[source] std::io::Error),
/// Failed to write to a file.
#[error("failed to write to file {0}")]
FileWrite(PathBuf, #[source] std::io::Error),
/// Input contains an attempt to use a definition list in
/// Markdown. The location is included in the error message.
#[error("{0}: attempt to use definition lists in Markdown")]
DefinitionList(Location),
/// String formatting error. This is likely a programming error.
#[error("string formatting error: {0}")]
Format(#[source] std::fmt::Error),
}
/// Code block attribute.
///
/// These are parsed from the text after the opening fence of a fenced
/// code block.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum BlockAttr {
    /// An identifier.
    Id(String),
    /// A class.
    Class(String),
    /// A key/value pair.
    KeyValue(String, String),
}

impl BlockAttr {
    fn id(s: &str) -> Self {
        Self::Id(s.into())
    }

    fn class(s: &str) -> Self {
        Self::Class(s.into())
    }

    fn key_value(k: &str, v: &str) -> Self {
        Self::KeyValue(k.into(), v.into())
    }

    /// Parse a fenced code block tag.
    ///
    /// The tag is a list of words separated by ASCII whitespace,
    /// optionally enclosed in a pair of braces. Each word becomes one
    /// attribute, in order:
    ///
    /// * `#foo` is the identifier `foo`;
    /// * `.foo` is the class `foo`;
    /// * `key=value` is a key/value pair, split at the first `=`;
    /// * any other word is a class.
    ///
    /// An empty tag results in an empty list.
    pub fn parse(attrs: &str) -> Vec<Self> {
        Self::parse_words(attrs).map(Self::parse_word).collect()
    }

    // Split the tag into words. The enclosing braces are removed only if
    // both are present.
    fn parse_words(attrs: &str) -> impl Iterator<Item = &str> {
        attrs
            .strip_prefix('{')
            .and_then(|inner| inner.strip_suffix('}'))
            .unwrap_or(attrs)
            .split_ascii_whitespace()
    }

    // Classify a single word.
    fn parse_word(word: &str) -> Self {
        if let Some(id) = word.strip_prefix('#') {
            Self::id(id)
        } else if let Some(class) = word.strip_prefix('.') {
            Self::class(class)
        } else if let Some((key, value)) = word.split_once('=') {
            Self::key_value(key, value)
        } else {
            Self::class(word)
        }
    }
}
// Tests for parsing the tag of a fenced code block.
#[cfg(test)]
mod test_block_attr {
    use super::BlockAttr;

    // Shorthands for the expected values.
    fn id(name: &str) -> BlockAttr {
        BlockAttr::Id(name.to_string())
    }

    fn class(name: &str) -> BlockAttr {
        BlockAttr::Class(name.to_string())
    }

    fn pair(key: &str, value: &str) -> BlockAttr {
        BlockAttr::KeyValue(key.to_string(), value.to_string())
    }

    #[test]
    fn empty_string() {
        assert!(BlockAttr::parse("").is_empty());
    }

    #[test]
    fn plain_word() {
        assert_eq!(BlockAttr::parse("foo"), [class("foo")]);
    }

    #[test]
    fn dot_word() {
        assert_eq!(BlockAttr::parse(".foo"), [class("foo")]);
    }

    #[test]
    fn hash_word() {
        assert_eq!(BlockAttr::parse("#foo"), [id("foo")]);
    }

    #[test]
    fn key_value() {
        assert_eq!(BlockAttr::parse("foo=bar"), [pair("foo", "bar")]);
    }

    #[test]
    fn several() {
        let parsed = BlockAttr::parse("{#foo .bar foobar yo=yoyo}");
        let expected = [id("foo"), class("bar"), class("foobar"), pair("yo", "yoyo")];
        assert_eq!(parsed, expected);
    }
}
// The set of slugs that are already in use, for generating heading
// identifiers that don't clash with each other.
#[derive(Debug, Default)]
struct Slugs {
    slugs: HashSet<String>,
}

impl Slugs {
    // Length limit, in bytes, for the part of a slug that is taken from
    // the candidate.
    const MAX: usize = 8;

    // Mark a slug as being in use.
    fn remember(&mut self, slug: &str) {
        self.slugs.insert(slug.into());
    }

    // Return a slug based on `candidate` that is not yet in use, and
    // mark it as being in use.
    fn unique(&mut self, candidate: &str) -> String {
        let slug = self.helper(candidate);
        self.remember(&slug);
        slug
    }

    // Find an unused slug: the candidate cut down to the length limit
    // or, if that is taken, the same followed by the smallest positive
    // number that makes it unused.
    fn helper(&self, candidate: &str) -> String {
        let mut base = String::with_capacity(Self::MAX);
        for c in candidate.chars() {
            if base.len() >= Self::MAX {
                break;
            }
            base.push(c);
        }
        if !self.slugs.contains(&base) {
            return base;
        }
        let mut i = 0;
        loop {
            i += 1;
            let slug = format!("{}{}", base, i);
            if !self.slugs.contains(&slug) {
                return slug;
            }
        }
    }
}