diff --git a/tools/grammar/src/lib.rs b/tools/grammar/src/lib.rs index 1d64e45143..b73aeac8d8 100644 --- a/tools/grammar/src/lib.rs +++ b/tools/grammar/src/lib.rs @@ -81,7 +81,14 @@ pub enum ExpressionKind { /// `// Single line comment.` Comment(String), /// ``[`A`-`Z` `_` LF]`` - Charset(Vec), + /// + /// This should only contain expressions that are valid inside brackets + /// (`Terminal`, `Nt`, and `CharacterRange`). + Charset(Vec), + /// `` `A`-`Z` `` used in a character set. + /// + /// This should only appear inside a `Charset`. + CharacterRange(Character, Character), /// ``~[` ` LF]`` NegExpression(Box), /// `^ A B C` @@ -108,16 +115,6 @@ impl Display for RangeLimit { } } -#[derive(Clone, Debug)] -pub enum Characters { - /// `LF` - Named(String), - /// `` `_` `` - Terminal(String), - /// `` `A`-`Z` `` - Range(Character, Character), -} - #[derive(Clone, Debug)] pub enum Character { Char(char), @@ -176,11 +173,14 @@ impl Expression { | ExpressionKind::Cut(e) => { e.visit_nt(callback); } - ExpressionKind::Alt(es) | ExpressionKind::Sequence(es) => { + ExpressionKind::Alt(es) + | ExpressionKind::Sequence(es) + | ExpressionKind::Charset(es) => { for e in es { e.visit_nt(callback); } } + ExpressionKind::Nt(nt) => { callback(&nt); } @@ -188,15 +188,8 @@ impl Expression { | ExpressionKind::Prose(_) | ExpressionKind::Break(_) | ExpressionKind::Comment(_) - | ExpressionKind::Unicode(_) => {} - ExpressionKind::Charset(set) => { - for ch in set { - match ch { - Characters::Named(s) => callback(s), - Characters::Terminal(_) | Characters::Range(_, _) => {} - } - } - } + | ExpressionKind::Unicode(_) + | ExpressionKind::CharacterRange(..) => {} } } diff --git a/tools/grammar/src/parser.rs b/tools/grammar/src/parser.rs index 828af0fb7c..4b363f6f8c 100644 --- a/tools/grammar/src/parser.rs +++ b/tools/grammar/src/parser.rs @@ -1,6 +1,6 @@ //! A parser of the ENBF-like grammar. -use super::{Character, Characters, Expression, ExpressionKind, Grammar, Production, RangeLimit}; +use super::{Character, Expression, ExpressionKind, Grammar, Production, RangeLimit}; use std::fmt; use std::fmt::Display; use std::path::Path; @@ -309,7 +309,7 @@ impl Parser<'_> { let Some(ch) = self.parse_characters()? else { break; }; - characters.push(ch); + characters.push(Expression::new_kind(ch)); } if characters.is_empty() { bail!(self, "expected at least one character in character group"); @@ -321,24 +321,24 @@ impl Parser<'_> { /// Parse an element of a character class, e.g. /// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``. - fn parse_characters(&mut self) -> Result> { + fn parse_characters(&mut self) -> Result> { if let Some(a) = self.parse_character()? { if self.take_str("-") { let Some(b) = self.parse_character()? else { bail!(self, "expected character in range"); }; - Ok(Some(Characters::Range(a, b))) + Ok(Some(ExpressionKind::CharacterRange(a, b))) } else { //~^ Parse terminal in backticks. let t = match a { Character::Char(ch) => ch.to_string(), Character::Unicode(_) => bail!(self, "unicode not supported"), }; - Ok(Some(Characters::Terminal(t))) + Ok(Some(ExpressionKind::Terminal(t))) } } else if let Some(name) = self.parse_name() { //~^ Parse nonterminal identifier. - Ok(Some(Characters::Named(name))) + Ok(Some(ExpressionKind::Nt(name))) } else { Ok(None) } @@ -573,7 +573,7 @@ fn translate_position(input: &str, index: usize) -> (&str, usize, usize) { #[cfg(test)] mod tests { use crate::parser::{parse_grammar, translate_position}; - use crate::{Character, Characters, ExpressionKind, Grammar, RangeLimit}; + use crate::{Character, ExpressionKind, Grammar, RangeLimit}; use std::path::Path; #[test] @@ -824,8 +824,8 @@ mod tests { panic!("expected Charset inside lookahead, got {:?}", inner.kind); }; assert_eq!(chars.len(), 2); - assert!(matches!(&chars[0], Characters::Terminal(t) if t == "e")); - assert!(matches!(&chars[1], Characters::Terminal(t) if t == "E")); + assert!(matches!(&chars[0].kind, ExpressionKind::Terminal(t) if t == "e")); + assert!(matches!(&chars[1].kind, ExpressionKind::Terminal(t) if t == "E")); } #[test] @@ -977,7 +977,7 @@ mod tests { panic!("expected Charset, got {:?}", rule.expression.kind); }; assert_eq!(chars.len(), 1); - let Characters::Range(a, b) = &chars[0] else { + let ExpressionKind::CharacterRange(a, b) = &chars[0].kind else { panic!("expected Range, got {:?}", chars[0]); }; assert!(matches!(a, Character::Unicode((ch, _)) if *ch == '\0')); @@ -996,7 +996,7 @@ mod tests { panic!("expected Charset, got {:?}", rule.expression.kind); }; assert_eq!(chars.len(), 1); - let Characters::Range(a, b) = &chars[0] else { + let ExpressionKind::CharacterRange(a, b) = &chars[0].kind else { panic!("expected Range, got {:?}", chars[0]); }; assert!(matches!(a, Character::Char(ch) if *ch == 'a')); @@ -1012,7 +1012,7 @@ mod tests { panic!("expected Charset, got {:?}", rule.expression.kind); }; assert_eq!(chars.len(), 1); - let Characters::Range(a, b) = &chars[0] else { + let ExpressionKind::CharacterRange(a, b) = &chars[0].kind else { panic!("expected Range, got {:?}", chars[0]); }; assert!(matches!(a, Character::Char(ch) if *ch == 'a')); @@ -1031,12 +1031,12 @@ mod tests { panic!("expected Charset, got {:?}", rule.expression.kind); }; assert_eq!(chars.len(), 2); - let Characters::Range(a1, b1) = &chars[0] else { + let ExpressionKind::CharacterRange(a1, b1) = &chars[0].kind else { panic!("expected Range, got {:?}", chars[0]); }; assert!(matches!(a1, Character::Unicode((ch, _)) if *ch == '\0')); assert!(matches!(b1, Character::Unicode((ch, _)) if *ch == '\u{D7FF}')); - let Characters::Range(a2, b2) = &chars[1] else { + let ExpressionKind::CharacterRange(a2, b2) = &chars[1].kind else { panic!("expected Range, got {:?}", chars[1]); }; assert!(matches!(a2, Character::Unicode((ch, _)) if *ch == '\u{E000}')); @@ -1052,9 +1052,9 @@ mod tests { panic!("expected Charset, got {:?}", rule.expression.kind); }; assert_eq!(chars.len(), 3); - assert!(matches!(&chars[0], Characters::Terminal(t) if t == "a")); - assert!(matches!(&chars[1], Characters::Terminal(t) if t == "b")); - assert!(matches!(&chars[2], Characters::Named(n) if n == "Foo")); + assert!(matches!(&chars[0].kind, ExpressionKind::Terminal(t) if t == "a")); + assert!(matches!(&chars[1].kind, ExpressionKind::Terminal(t) if t == "b")); + assert!(matches!(&chars[2].kind, ExpressionKind::Nt(n) if n == "Foo")); } // --- Negative lookahead combined with charset --- @@ -1076,9 +1076,9 @@ mod tests { panic!("expected Charset, got {:?}", inner.kind); }; assert_eq!(chars.len(), 3); - assert!(matches!(&chars[0], Characters::Terminal(t) if t == "x")); - assert!(matches!(&chars[1], Characters::Terminal(t) if t == "y")); - assert!(matches!(&chars[2], Characters::Named(n) if n == "LF")); + assert!(matches!(&chars[0].kind, ExpressionKind::Terminal(t) if t == "x")); + assert!(matches!(&chars[1].kind, ExpressionKind::Terminal(t) if t == "y")); + assert!(matches!(&chars[2].kind, ExpressionKind::Nt(n) if n == "LF")); } // --- Negative lookahead combined with Unicode --- @@ -1098,7 +1098,7 @@ mod tests { panic!("expected Charset, got {:?}", inner.kind); }; assert_eq!(chars.len(), 1); - let Characters::Range(a, b) = &chars[0] else { + let ExpressionKind::CharacterRange(a, b) = &chars[0].kind else { panic!("expected Range, got {:?}", chars[0]); }; assert!(matches!(a, Character::Unicode((ch, _)) if *ch == '\0')); diff --git a/tools/mdbook-spec/src/grammar/render_markdown.rs b/tools/mdbook-spec/src/grammar/render_markdown.rs index 316eb9aaf3..42475bf8ce 100644 --- a/tools/mdbook-spec/src/grammar/render_markdown.rs +++ b/tools/mdbook-spec/src/grammar/render_markdown.rs @@ -3,7 +3,7 @@ use super::RenderCtx; use crate::grammar::Grammar; use anyhow::bail; -use grammar::{Character, Characters, Expression, ExpressionKind, Production}; +use grammar::{Character, Expression, ExpressionKind, Production}; use regex::Regex; use std::borrow::Cow; use std::fmt::Write; @@ -79,6 +79,7 @@ fn last_expr(expr: &Expression) -> &ExpressionKind { | ExpressionKind::Break(_) | ExpressionKind::Comment(_) | ExpressionKind::Charset(_) + | ExpressionKind::CharacterRange(..) | ExpressionKind::NegExpression(_) | ExpressionKind::Cut(_) | ExpressionKind::Unicode(_) => &expr.kind, @@ -178,6 +179,20 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) { write!(output, "// {s}").unwrap(); } ExpressionKind::Charset(set) => charset_render_markdown(cx, set, output), + ExpressionKind::CharacterRange(start, end) => { + let write_ch = |ch: &Character, output: &mut String| match ch { + Character::Char(ch) => write!( + output, + "{}", + markdown_escape(&ch.to_string()) + ) + .unwrap(), + Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), + }; + write_ch(start, output); + output.push('-'); + write_ch(end, output); + } ExpressionKind::NegExpression(e) => { output.push('~'); render_expression(e, cx, output); @@ -203,11 +218,11 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) { } } -fn charset_render_markdown(cx: &RenderCtx, set: &[Characters], output: &mut String) { +fn charset_render_markdown(cx: &RenderCtx, set: &[Expression], output: &mut String) { output.push_str("\\["); let mut iter = set.iter().peekable(); - while let Some(chars) = iter.next() { - render_characters(chars, cx, output); + while let Some(expr) = iter.next() { + render_expression(expr, cx, output); if iter.peek().is_some() { output.push(' '); } @@ -215,35 +230,6 @@ fn charset_render_markdown(cx: &RenderCtx, set: &[Characters], output: &mut Stri output.push(']'); } -fn render_characters(chars: &Characters, cx: &RenderCtx, output: &mut String) { - match chars { - Characters::Named(s) => { - let dest = cx.md_link_map.get(s).map_or("missing", |d| d.as_str()); - write!(output, "[{s}]({dest})").unwrap(); - } - Characters::Terminal(s) => write!( - output, - "{}", - markdown_escape(s) - ) - .unwrap(), - Characters::Range(a, b) => { - let write_ch = |ch: &Character, output: &mut String| match ch { - Character::Char(ch) => write!( - output, - "{}", - markdown_escape(&ch.to_string()) - ) - .unwrap(), - Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), - }; - write_ch(a, output); - output.push('-'); - write_ch(b, output); - } - } -} - /// Escapes characters that markdown would otherwise interpret. fn markdown_escape(s: &str) -> Cow<'_, str> { static ESC_RE: LazyLock = @@ -304,8 +290,8 @@ mod tests { fn lookahead_charset() { let result = render(ExpressionKind::NegativeLookahead(Box::new( Expression::new_kind(ExpressionKind::Charset(vec![ - Characters::Terminal("e".to_string()), - Characters::Terminal("E".to_string()), + Expression::new_kind(ExpressionKind::Terminal("e".to_string())), + Expression::new_kind(ExpressionKind::Terminal("E".to_string())), ])), ))); assert!(result.starts_with("!"), "should start with `!`"); @@ -351,9 +337,11 @@ mod tests { #[test] fn charset_unicode_range() { - let result = render(ExpressionKind::Charset(vec![Characters::Range( - Character::Unicode(('\0', "0000".to_string())), - Character::Unicode(('\u{007F}', "007F".to_string())), + let result = render(ExpressionKind::Charset(vec![Expression::new_kind( + ExpressionKind::CharacterRange( + Character::Unicode(('\0', "0000".to_string())), + Character::Unicode(('\u{007F}', "007F".to_string())), + ), )])); assert!(result.contains("\\[")); assert!(result.contains("U+0000")); @@ -363,9 +351,8 @@ mod tests { #[test] fn charset_char_range() { - let result = render(ExpressionKind::Charset(vec![Characters::Range( - Character::Char('a'), - Character::Char('z'), + let result = render(ExpressionKind::Charset(vec![Expression::new_kind( + ExpressionKind::CharacterRange(Character::Char('a'), Character::Char('z')), )])); assert!(result.contains("\\[")); assert!(result.contains("grammar-literal")); @@ -375,9 +362,11 @@ mod tests { #[test] fn charset_mixed_range() { // [`a`-U+007A] - let result = render(ExpressionKind::Charset(vec![Characters::Range( - Character::Char('a'), - Character::Unicode(('\u{007A}', "007A".to_string())), + let result = render(ExpressionKind::Charset(vec![Expression::new_kind( + ExpressionKind::CharacterRange( + Character::Char('a'), + Character::Unicode(('\u{007A}', "007A".to_string())), + ), )])); assert!(result.contains("grammar-literal")); assert!(result.contains("U+007A")); @@ -400,8 +389,8 @@ mod tests { #[test] fn neg_expression_rendering() { let result = render(ExpressionKind::NegExpression(Box::new( - Expression::new_kind(ExpressionKind::Charset(vec![Characters::Terminal( - "a".to_string(), + Expression::new_kind(ExpressionKind::Charset(vec![Expression::new_kind( + ExpressionKind::Terminal("a".to_string()), )])), ))); assert!( diff --git a/tools/mdbook-spec/src/grammar/render_railroad.rs b/tools/mdbook-spec/src/grammar/render_railroad.rs index ad7b291e57..df4933353a 100644 --- a/tools/mdbook-spec/src/grammar/render_railroad.rs +++ b/tools/mdbook-spec/src/grammar/render_railroad.rs @@ -3,7 +3,7 @@ use super::RenderCtx; use crate::grammar::Grammar; use anyhow::bail; -use grammar::{Character, Characters, Expression, ExpressionKind, Production, RangeLimit}; +use grammar::{Character, Expression, ExpressionKind, Production, RangeLimit}; use railroad::*; use regex::Regex; use std::fmt::Write; @@ -285,8 +285,23 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option return None, ExpressionKind::Comment(_) => return None, ExpressionKind::Charset(set) => { - let ns: Vec<_> = set.iter().map(|c| render_characters(c, cx)).collect(); - Box::new(Choice::>::new(ns)) + let choices: Vec<_> = set + .iter() + .map(|e| render_expression(e, cx, stack)) + .filter_map(|n| n) + .collect(); + Box::new(Choice::>::new(choices)) + } + ExpressionKind::CharacterRange(start, end) => { + let mut s = String::new(); + let write_ch = |ch: &Character, output: &mut String| match ch { + Character::Char(ch) => output.push(*ch), + Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), + }; + write_ch(start, &mut s); + s.push('-'); + write_ch(end, &mut s); + Box::new(Terminal::new(s)) } ExpressionKind::NegExpression(e) => { let n = render_expression(e, cx, stack)?; @@ -313,24 +328,6 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option Box { - match chars { - Characters::Named(s) => node_for_nt(cx, s), - Characters::Terminal(s) => Box::new(Terminal::new(s.clone())), - Characters::Range(a, b) => { - let mut s = String::new(); - let write_ch = |ch: &Character, output: &mut String| match ch { - Character::Char(ch) => output.push(*ch), - Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), - }; - write_ch(a, &mut s); - s.push('-'); - write_ch(b, &mut s); - Box::new(Terminal::new(s)) - } - } -} - fn node_for_nt(cx: &RenderCtx, name: &str) -> Box { let dest = cx .rr_link_map @@ -391,7 +388,7 @@ impl Node for Except { #[cfg(test)] mod tests { use super::*; - use grammar::{Character, Characters, Expression, ExpressionKind, RangeLimit}; + use grammar::{Character, Expression, ExpressionKind, RangeLimit}; /// Render an expression to an SVG string fragment. fn render_to_svg(expr: &Expression) -> Option { @@ -508,8 +505,8 @@ mod tests { fn lookahead_charset() { let expr = Expression::new_kind(ExpressionKind::NegativeLookahead(Box::new( Expression::new_kind(ExpressionKind::Charset(vec![ - Characters::Terminal("e".to_string()), - Characters::Terminal("E".to_string()), + Expression::new_kind(ExpressionKind::Terminal("e".to_string())), + Expression::new_kind(ExpressionKind::Terminal("E".to_string())), ])), ))); let svg = render_to_svg(&expr).unwrap(); @@ -541,9 +538,11 @@ mod tests { #[test] fn charset_unicode_range() { - let expr = Expression::new_kind(ExpressionKind::Charset(vec![Characters::Range( - Character::Unicode(('\0', "0000".to_string())), - Character::Unicode(('\u{007F}', "007F".to_string())), + let expr = Expression::new_kind(ExpressionKind::Charset(vec![Expression::new_kind( + ExpressionKind::CharacterRange( + Character::Unicode(('\0', "0000".to_string())), + Character::Unicode(('\u{007F}', "007F".to_string())), + ), )])); let svg = render_to_svg(&expr).unwrap(); assert!(svg.contains("U+0000")); @@ -552,9 +551,8 @@ mod tests { #[test] fn charset_char_range() { - let expr = Expression::new_kind(ExpressionKind::Charset(vec![Characters::Range( - Character::Char('a'), - Character::Char('z'), + let expr = Expression::new_kind(ExpressionKind::Charset(vec![Expression::new_kind( + ExpressionKind::CharacterRange(Character::Char('a'), Character::Char('z')), )])); let svg = render_to_svg(&expr).unwrap(); assert!(svg.contains("a")); @@ -581,8 +579,8 @@ mod tests { #[test] fn neg_expression_rendering() { let expr = Expression::new_kind(ExpressionKind::NegExpression(Box::new( - Expression::new_kind(ExpressionKind::Charset(vec![Characters::Terminal( - "a".to_string(), + Expression::new_kind(ExpressionKind::Charset(vec![Expression::new_kind( + ExpressionKind::Terminal("a".to_string()), )])), ))); let svg = render_to_svg(&expr).unwrap();