use log::trace; use regex::Regex; #[derive(Debug, Clone, Eq, PartialEq)] pub struct Token { pub token: TokenKind, pub pos: usize, } impl Token { fn new(token: TokenKind, pos: usize) -> Self { trace!("Token: token={:?} pos={}", token, pos); Self { token, pos } } } #[derive(Debug, Clone, Eq, PartialEq)] pub enum TokenKind { End, Markdown(String), OpenParens, ClosedParens, OpenBracket, ClosedBracket, OpenBrackets, CloseBrackets, Bang, Pipe, Word(String), PageName(String), Spaces(String), Equals, QuotedValue(String), } #[derive(Debug, Clone)] pub struct TokenPatterns { plain: Regex, word: Regex, spaces: Regex, single_quoted: Regex, single_quoted2: Regex, triple_quoted: Regex, triple_quoted2: Regex, } impl Default for TokenPatterns { fn default() -> Self { Self { plain: Regex::new(r#"(?P[^|\[]+)"#).unwrap(), word: Regex::new(r#"[-.:_/[:alpha:][:digit:]]+"#).unwrap(), spaces: Regex::new(r#"([[:space:]]|\n)+"#).unwrap(), single_quoted: Regex::new(r#""(?P(.|\n)*?)""#).unwrap(), single_quoted2: Regex::new(r#"'(?P(.|\n)*?)'"#).unwrap(), triple_quoted: Regex::new(r#""""(?P(.|\n)*?)""""#).unwrap(), triple_quoted2: Regex::new(r#"'''(?P(.|\n)*?)'''"#).unwrap(), } } } #[derive(Debug, Clone)] pub struct TokenParser<'a> { pos: usize, input: &'a str, patterns: &'a TokenPatterns, } impl<'a> TokenParser<'a> { pub fn new(input: &'a str, patterns: &'a TokenPatterns) -> Self { Self { pos: 0, input, patterns, } } pub fn parse(&mut self) -> Token { let pos = self.pos; let token = if self.input.is_empty() { TokenKind::End } else if self.literal("(") { TokenKind::OpenParens } else if self.literal(")") { TokenKind::ClosedParens } else if self.literal("[[") { TokenKind::OpenBrackets } else if self.literal("]]") { TokenKind::CloseBrackets } else if self.literal("[") { TokenKind::OpenBracket } else if self.literal("]") { TokenKind::ClosedBracket } else if self.literal("!") { TokenKind::Bang } else if self.literal("|") { TokenKind::Pipe } else if self.literal("=") { TokenKind::Equals } else if let Some(m) = self.regex(&self.patterns.spaces.clone()) { TokenKind::Spaces(m.as_str().into()) } else if let Some(m) = self.regex(&self.patterns.triple_quoted.clone()) { TokenKind::QuotedValue(m.as_str().into()) } else if let Some(m) = self.regex(&self.patterns.triple_quoted2.clone()) { TokenKind::QuotedValue(m.as_str().into()) } else if let Some(m) = self.regex(&self.patterns.single_quoted.clone()) { TokenKind::QuotedValue(m.as_str().into()) } else if let Some(m) = self.regex(&self.patterns.single_quoted2.clone()) { TokenKind::QuotedValue(m.as_str().into()) } else if let Some(m) = self.regex(&self.patterns.word.clone()) { TokenKind::Word(m.as_str().into()) } else if let Some(m) = self.regex(&self.patterns.plain.clone()) { TokenKind::Markdown(m.as_str().into()) } else { panic!("can't handle input: {:?}", self.input); }; Token::new(token, pos) } fn literal(&mut self, pattern: &str) -> bool { if let Some(rest) = self.input.strip_prefix(pattern) { self.pos += pattern.len(); self.input = rest; true } else { false } } fn regex(&mut self, pattern: &Regex) -> Option { // trace!("matching regex {}", pattern.as_str()); if let Some(m) = pattern.find(self.input) { if m.start() == 0 { // trace!("match at beginning"); let captures = pattern.captures(self.input).unwrap(); let m = if let Some(value) = captures.name("value") { self.input = &self.input[m.end()..]; value } else { self.input = &self.input[m.end()..]; captures.get(0).unwrap() }; self.pos += m.end(); return Some(m.as_str().to_string()); } } // trace!("no match at beginning"); None } } #[cfg(test)] mod test { use super::{TokenKind, TokenParser, TokenPatterns}; fn parser<'a>(input: &'a str, patterns: &'a TokenPatterns) -> TokenParser<'a> { TokenParser::new(input, patterns) } #[test] fn empty_string() { let patterns = TokenPatterns::default(); let mut p = parser("", &patterns); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn plain_markdown() { let patterns = TokenPatterns::default(); let mut p = parser("** hello, world", &patterns); assert_eq!( p.parse().token, TokenKind::Markdown("** hello, world".into()) ); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn single_open_parens() { let patterns = TokenPatterns::default(); let mut p = parser("(", &patterns); assert_eq!(p.parse().token, TokenKind::OpenParens); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn single_close_parens() { let patterns = TokenPatterns::default(); let mut p = parser(")", &patterns); assert_eq!(p.parse().token, TokenKind::ClosedParens); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn single_open_bracket() { let patterns = TokenPatterns::default(); let mut p = parser("[", &patterns); assert_eq!(p.parse().token, TokenKind::OpenBracket); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn single_close_bracket() { let patterns = TokenPatterns::default(); let mut p = parser("]", &patterns); assert_eq!(p.parse().token, TokenKind::ClosedBracket); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn double_open_bracket() { let patterns = TokenPatterns::default(); let mut p = parser("[[", &patterns); assert_eq!(p.parse().token, TokenKind::OpenBrackets); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn double_close_bracket() { let patterns = TokenPatterns::default(); let mut p = parser("]]", &patterns); assert_eq!(p.parse().token, TokenKind::CloseBrackets); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn bang() { let patterns = TokenPatterns::default(); let mut p = parser("!", &patterns); assert_eq!(p.parse().token, TokenKind::Bang); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn pipe() { let patterns = TokenPatterns::default(); let mut p = parser("|", &patterns); assert_eq!(p.parse().token, TokenKind::Pipe); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn equals() { let patterns = TokenPatterns::default(); let mut p = parser("=", &patterns); assert_eq!(p.parse().token, TokenKind::Equals); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn simple_word() { let patterns = TokenPatterns::default(); let mut p = parser("foo bar", &patterns); assert_eq!(p.parse().token, TokenKind::Word("foo".into())); assert_eq!(p.parse().token, TokenKind::Spaces(" ".into())); assert_eq!(p.parse().token, TokenKind::Word("bar".into())); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn number_word() { let patterns = TokenPatterns::default(); let mut p = parser("123", &patterns); assert_eq!(p.parse().token, TokenKind::Word("123".into())); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn complex_word() { let patterns = TokenPatterns::default(); let mut p = parser("foo-1.2_3[[bar/subpage]]", &patterns); assert_eq!(p.parse().token, TokenKind::Word("foo-1.2_3".into())); assert_eq!(p.parse().token, TokenKind::OpenBrackets); assert_eq!(p.parse().token, TokenKind::Word("bar/subpage".into())); assert_eq!(p.parse().token, TokenKind::CloseBrackets); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn spaces() { let patterns = TokenPatterns::default(); let mut p = parser("\n", &patterns); assert_eq!(p.parse().token, TokenKind::Spaces("\n".into())); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn single_quoted() { let patterns = TokenPatterns::default(); let mut p = parser(r#""hello there""#, &patterns); assert_eq!( p.parse().token, TokenKind::QuotedValue(r#"hello there"#.into()) ); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn triple_quoted() { let patterns = TokenPatterns::default(); let mut p = parser(r#""""hello\nthere""""#, &patterns); assert_eq!( p.parse().token, TokenKind::QuotedValue(r#"hello\nthere"#.into()) ); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn simple_directive() { let patterns = TokenPatterns::default(); let mut p = parser(r#"[[!if test="enabled(sidebar)"]]"#, &patterns); assert_eq!(p.parse().token, TokenKind::OpenBrackets); assert_eq!(p.parse().token, TokenKind::Bang); assert_eq!(p.parse().token, TokenKind::Word("if".into())); assert_eq!(p.parse().token, TokenKind::Spaces(" ".into())); assert_eq!(p.parse().token, TokenKind::Word("test".into())); assert_eq!(p.parse().token, TokenKind::Equals); assert_eq!( p.parse().token, TokenKind::QuotedValue(r#"enabled(sidebar)"#.into()) ); assert_eq!(p.parse().token, TokenKind::CloseBrackets); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn complex_directive() { let patterns = TokenPatterns::default(); let mut p = parser( r#"[[!if test="enabled(sidebar)" then=""" [[!sidebar]] """ else=""" [[!inline pages=sidebar raw=yes]] """]]"#, &patterns, ); assert_eq!(p.parse().token, TokenKind::OpenBrackets); assert_eq!(p.parse().token, TokenKind::Bang); assert_eq!(p.parse().token, TokenKind::Word("if".into())); assert_eq!(p.parse().token, TokenKind::Spaces(" ".into())); assert_eq!(p.parse().token, TokenKind::Word("test".into())); assert_eq!(p.parse().token, TokenKind::Equals); assert_eq!( p.parse().token, TokenKind::QuotedValue(r#"enabled(sidebar)"#.into()) ); assert_eq!(p.parse().token, TokenKind::Spaces(" ".into())); assert_eq!(p.parse().token, TokenKind::Word("then".into())); assert_eq!(p.parse().token, TokenKind::Equals); assert_eq!( p.parse().token, TokenKind::QuotedValue("\n[[!sidebar]]\n".into()) ); assert_eq!(p.parse().token, TokenKind::Spaces(" ".into())); assert_eq!(p.parse().token, TokenKind::Word("else".into())); assert_eq!(p.parse().token, TokenKind::Equals); assert_eq!( p.parse().token, TokenKind::QuotedValue("\n[[!inline pages=sidebar raw=yes]]\n".into()) ); assert_eq!(p.parse().token, TokenKind::CloseBrackets); assert_eq!(p.parse().token, TokenKind::End); } #[test] fn complex_directive_with_lookahead() { let patterns = TokenPatterns::default(); let orig = parser( r#"[[!if test="enabled(sidebar)" then=""" [[!sidebar]] """ else=""" [[!inline pages=sidebar raw=yes]] """]]"#, &patterns, ); let mut p = orig.clone(); assert_eq!(p.parse().token, TokenKind::OpenBrackets); assert_eq!(p.parse().token, TokenKind::Bang); assert_eq!(p.parse().token, TokenKind::Word("if".into())); assert_eq!(p.parse().token, TokenKind::Spaces(" ".into())); assert_eq!(p.parse().token, TokenKind::Word("test".into())); assert_eq!(p.parse().token, TokenKind::Equals); assert_eq!( p.parse().token, TokenKind::QuotedValue(r#"enabled(sidebar)"#.into()) ); assert_eq!(p.parse().token, TokenKind::Spaces(" ".into())); assert_eq!(p.parse().token, TokenKind::Word("then".into())); assert_eq!(p.parse().token, TokenKind::Equals); assert_eq!( p.parse().token, TokenKind::QuotedValue("\n[[!sidebar]]\n".into()) ); assert_eq!(p.parse().token, TokenKind::Spaces(" ".into())); assert_eq!(p.parse().token, TokenKind::Word("else".into())); assert_eq!(p.parse().token, TokenKind::Equals); assert_eq!( p.parse().token, TokenKind::QuotedValue("\n[[!inline pages=sidebar raw=yes]]\n".into()) ); assert_eq!(p.parse().token, TokenKind::CloseBrackets); assert_eq!(p.parse().token, TokenKind::End); } }