From 7f71c6e1101f82b363ea6b2dddec30f1d5b0b9ef Mon Sep 17 00:00:00 2001 From: Eric Huss Date: Tue, 24 Feb 2026 12:36:12 -0800 Subject: [PATCH] Rewrite Charset so that it uses Expression This removes the `Characters` type, and instead uses `Expression`. There was a lot of overlap with `Expression`, and this simplifies things a little bit and removes some duplication. This also helps with some future changes I am working on that do coverage analysis, and this makes it easier to track coverage data for the different character values. The downside is that without the enum it isn't clear from the code what limitations there are with `ExpressionKind::Charset`, or that `ExpressionKind::CharacterRange` should only be in a `Charset`. These are implicit based on how the parser works. I'm willing to make that tradeoff. The `Characters` map as: - `Characters::Named` to `ExpressionKind::Nt` - `Characters::Terminal` to `ExpressionKind::Terminal` - `Characters::Range` to the NEW `ExpressionKind::CharacterRange` This incidentally fixes a small oversight where `Characters::Named` was missing span with the `grammar-text` class. --- tools/grammar/src/lib.rs | 35 ++++---- tools/grammar/src/parser.rs | 42 +++++----- .../src/grammar/render_markdown.rs | 81 ++++++++----------- .../src/grammar/render_railroad.rs | 62 +++++++------- 4 files changed, 100 insertions(+), 120 deletions(-) diff --git a/tools/grammar/src/lib.rs b/tools/grammar/src/lib.rs index 1d64e45143..b73aeac8d8 100644 --- a/tools/grammar/src/lib.rs +++ b/tools/grammar/src/lib.rs @@ -81,7 +81,14 @@ pub enum ExpressionKind { /// `// Single line comment.` Comment(String), /// ``[`A`-`Z` `_` LF]`` - Charset(Vec), + /// + /// This should only contain expressions that are valid inside brackets + /// (`Terminal`, `Nt`, and `CharacterRange`). + Charset(Vec), + /// `` `A`-`Z` `` used in a character set. + /// + /// This should only appear inside a `Charset`. + CharacterRange(Character, Character), /// ``~[` ` LF]`` NegExpression(Box), /// `^ A B C` @@ -108,16 +115,6 @@ impl Display for RangeLimit { } } -#[derive(Clone, Debug)] -pub enum Characters { - /// `LF` - Named(String), - /// `` `_` `` - Terminal(String), - /// `` `A`-`Z` `` - Range(Character, Character), -} - #[derive(Clone, Debug)] pub enum Character { Char(char), @@ -176,11 +173,14 @@ impl Expression { | ExpressionKind::Cut(e) => { e.visit_nt(callback); } - ExpressionKind::Alt(es) | ExpressionKind::Sequence(es) => { + ExpressionKind::Alt(es) + | ExpressionKind::Sequence(es) + | ExpressionKind::Charset(es) => { for e in es { e.visit_nt(callback); } } + ExpressionKind::Nt(nt) => { callback(&nt); } @@ -188,15 +188,8 @@ impl Expression { | ExpressionKind::Prose(_) | ExpressionKind::Break(_) | ExpressionKind::Comment(_) - | ExpressionKind::Unicode(_) => {} - ExpressionKind::Charset(set) => { - for ch in set { - match ch { - Characters::Named(s) => callback(s), - Characters::Terminal(_) | Characters::Range(_, _) => {} - } - } - } + | ExpressionKind::Unicode(_) + | ExpressionKind::CharacterRange(..) => {} } } diff --git a/tools/grammar/src/parser.rs b/tools/grammar/src/parser.rs index 828af0fb7c..4b363f6f8c 100644 --- a/tools/grammar/src/parser.rs +++ b/tools/grammar/src/parser.rs @@ -1,6 +1,6 @@ //! A parser of the ENBF-like grammar. -use super::{Character, Characters, Expression, ExpressionKind, Grammar, Production, RangeLimit}; +use super::{Character, Expression, ExpressionKind, Grammar, Production, RangeLimit}; use std::fmt; use std::fmt::Display; use std::path::Path; @@ -309,7 +309,7 @@ impl Parser<'_> { let Some(ch) = self.parse_characters()? else { break; }; - characters.push(ch); + characters.push(Expression::new_kind(ch)); } if characters.is_empty() { bail!(self, "expected at least one character in character group"); @@ -321,24 +321,24 @@ impl Parser<'_> { /// Parse an element of a character class, e.g. /// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``. - fn parse_characters(&mut self) -> Result> { + fn parse_characters(&mut self) -> Result> { if let Some(a) = self.parse_character()? { if self.take_str("-") { let Some(b) = self.parse_character()? else { bail!(self, "expected character in range"); }; - Ok(Some(Characters::Range(a, b))) + Ok(Some(ExpressionKind::CharacterRange(a, b))) } else { //~^ Parse terminal in backticks. let t = match a { Character::Char(ch) => ch.to_string(), Character::Unicode(_) => bail!(self, "unicode not supported"), }; - Ok(Some(Characters::Terminal(t))) + Ok(Some(ExpressionKind::Terminal(t))) } } else if let Some(name) = self.parse_name() { //~^ Parse nonterminal identifier. - Ok(Some(Characters::Named(name))) + Ok(Some(ExpressionKind::Nt(name))) } else { Ok(None) } @@ -573,7 +573,7 @@ fn translate_position(input: &str, index: usize) -> (&str, usize, usize) { #[cfg(test)] mod tests { use crate::parser::{parse_grammar, translate_position}; - use crate::{Character, Characters, ExpressionKind, Grammar, RangeLimit}; + use crate::{Character, ExpressionKind, Grammar, RangeLimit}; use std::path::Path; #[test] @@ -824,8 +824,8 @@ mod tests { panic!("expected Charset inside lookahead, got {:?}", inner.kind); }; assert_eq!(chars.len(), 2); - assert!(matches!(&chars[0], Characters::Terminal(t) if t == "e")); - assert!(matches!(&chars[1], Characters::Terminal(t) if t == "E")); + assert!(matches!(&chars[0].kind, ExpressionKind::Terminal(t) if t == "e")); + assert!(matches!(&chars[1].kind, ExpressionKind::Terminal(t) if t == "E")); } #[test] @@ -977,7 +977,7 @@ mod tests { panic!("expected Charset, got {:?}", rule.expression.kind); }; assert_eq!(chars.len(), 1); - let Characters::Range(a, b) = &chars[0] else { + let ExpressionKind::CharacterRange(a, b) = &chars[0].kind else { panic!("expected Range, got {:?}", chars[0]); }; assert!(matches!(a, Character::Unicode((ch, _)) if *ch == '\0')); @@ -996,7 +996,7 @@ mod tests { panic!("expected Charset, got {:?}", rule.expression.kind); }; assert_eq!(chars.len(), 1); - let Characters::Range(a, b) = &chars[0] else { + let ExpressionKind::CharacterRange(a, b) = &chars[0].kind else { panic!("expected Range, got {:?}", chars[0]); }; assert!(matches!(a, Character::Char(ch) if *ch == 'a')); @@ -1012,7 +1012,7 @@ mod tests { panic!("expected Charset, got {:?}", rule.expression.kind); }; assert_eq!(chars.len(), 1); - let Characters::Range(a, b) = &chars[0] else { + let ExpressionKind::CharacterRange(a, b) = &chars[0].kind else { panic!("expected Range, got {:?}", chars[0]); }; assert!(matches!(a, Character::Char(ch) if *ch == 'a')); @@ -1031,12 +1031,12 @@ mod tests { panic!("expected Charset, got {:?}", rule.expression.kind); }; assert_eq!(chars.len(), 2); - let Characters::Range(a1, b1) = &chars[0] else { + let ExpressionKind::CharacterRange(a1, b1) = &chars[0].kind else { panic!("expected Range, got {:?}", chars[0]); }; assert!(matches!(a1, Character::Unicode((ch, _)) if *ch == '\0')); assert!(matches!(b1, Character::Unicode((ch, _)) if *ch == '\u{D7FF}')); - let Characters::Range(a2, b2) = &chars[1] else { + let ExpressionKind::CharacterRange(a2, b2) = &chars[1].kind else { panic!("expected Range, got {:?}", chars[1]); }; assert!(matches!(a2, Character::Unicode((ch, _)) if *ch == '\u{E000}')); @@ -1052,9 +1052,9 @@ mod tests { panic!("expected Charset, got {:?}", rule.expression.kind); }; assert_eq!(chars.len(), 3); - assert!(matches!(&chars[0], Characters::Terminal(t) if t == "a")); - assert!(matches!(&chars[1], Characters::Terminal(t) if t == "b")); - assert!(matches!(&chars[2], Characters::Named(n) if n == "Foo")); + assert!(matches!(&chars[0].kind, ExpressionKind::Terminal(t) if t == "a")); + assert!(matches!(&chars[1].kind, ExpressionKind::Terminal(t) if t == "b")); + assert!(matches!(&chars[2].kind, ExpressionKind::Nt(n) if n == "Foo")); } // --- Negative lookahead combined with charset --- @@ -1076,9 +1076,9 @@ mod tests { panic!("expected Charset, got {:?}", inner.kind); }; assert_eq!(chars.len(), 3); - assert!(matches!(&chars[0], Characters::Terminal(t) if t == "x")); - assert!(matches!(&chars[1], Characters::Terminal(t) if t == "y")); - assert!(matches!(&chars[2], Characters::Named(n) if n == "LF")); + assert!(matches!(&chars[0].kind, ExpressionKind::Terminal(t) if t == "x")); + assert!(matches!(&chars[1].kind, ExpressionKind::Terminal(t) if t == "y")); + assert!(matches!(&chars[2].kind, ExpressionKind::Nt(n) if n == "LF")); } // --- Negative lookahead combined with Unicode --- @@ -1098,7 +1098,7 @@ mod tests { panic!("expected Charset, got {:?}", inner.kind); }; assert_eq!(chars.len(), 1); - let Characters::Range(a, b) = &chars[0] else { + let ExpressionKind::CharacterRange(a, b) = &chars[0].kind else { panic!("expected Range, got {:?}", chars[0]); }; assert!(matches!(a, Character::Unicode((ch, _)) if *ch == '\0')); diff --git a/tools/mdbook-spec/src/grammar/render_markdown.rs b/tools/mdbook-spec/src/grammar/render_markdown.rs index 316eb9aaf3..42475bf8ce 100644 --- a/tools/mdbook-spec/src/grammar/render_markdown.rs +++ b/tools/mdbook-spec/src/grammar/render_markdown.rs @@ -3,7 +3,7 @@ use super::RenderCtx; use crate::grammar::Grammar; use anyhow::bail; -use grammar::{Character, Characters, Expression, ExpressionKind, Production}; +use grammar::{Character, Expression, ExpressionKind, Production}; use regex::Regex; use std::borrow::Cow; use std::fmt::Write; @@ -79,6 +79,7 @@ fn last_expr(expr: &Expression) -> &ExpressionKind { | ExpressionKind::Break(_) | ExpressionKind::Comment(_) | ExpressionKind::Charset(_) + | ExpressionKind::CharacterRange(..) | ExpressionKind::NegExpression(_) | ExpressionKind::Cut(_) | ExpressionKind::Unicode(_) => &expr.kind, @@ -178,6 +179,20 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) { write!(output, "// {s}").unwrap(); } ExpressionKind::Charset(set) => charset_render_markdown(cx, set, output), + ExpressionKind::CharacterRange(start, end) => { + let write_ch = |ch: &Character, output: &mut String| match ch { + Character::Char(ch) => write!( + output, + "{}", + markdown_escape(&ch.to_string()) + ) + .unwrap(), + Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), + }; + write_ch(start, output); + output.push('-'); + write_ch(end, output); + } ExpressionKind::NegExpression(e) => { output.push('~'); render_expression(e, cx, output); @@ -203,11 +218,11 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, output: &mut String) { } } -fn charset_render_markdown(cx: &RenderCtx, set: &[Characters], output: &mut String) { +fn charset_render_markdown(cx: &RenderCtx, set: &[Expression], output: &mut String) { output.push_str("\\["); let mut iter = set.iter().peekable(); - while let Some(chars) = iter.next() { - render_characters(chars, cx, output); + while let Some(expr) = iter.next() { + render_expression(expr, cx, output); if iter.peek().is_some() { output.push(' '); } @@ -215,35 +230,6 @@ fn charset_render_markdown(cx: &RenderCtx, set: &[Characters], output: &mut Stri output.push(']'); } -fn render_characters(chars: &Characters, cx: &RenderCtx, output: &mut String) { - match chars { - Characters::Named(s) => { - let dest = cx.md_link_map.get(s).map_or("missing", |d| d.as_str()); - write!(output, "[{s}]({dest})").unwrap(); - } - Characters::Terminal(s) => write!( - output, - "{}", - markdown_escape(s) - ) - .unwrap(), - Characters::Range(a, b) => { - let write_ch = |ch: &Character, output: &mut String| match ch { - Character::Char(ch) => write!( - output, - "{}", - markdown_escape(&ch.to_string()) - ) - .unwrap(), - Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), - }; - write_ch(a, output); - output.push('-'); - write_ch(b, output); - } - } -} - /// Escapes characters that markdown would otherwise interpret. fn markdown_escape(s: &str) -> Cow<'_, str> { static ESC_RE: LazyLock = @@ -304,8 +290,8 @@ mod tests { fn lookahead_charset() { let result = render(ExpressionKind::NegativeLookahead(Box::new( Expression::new_kind(ExpressionKind::Charset(vec![ - Characters::Terminal("e".to_string()), - Characters::Terminal("E".to_string()), + Expression::new_kind(ExpressionKind::Terminal("e".to_string())), + Expression::new_kind(ExpressionKind::Terminal("E".to_string())), ])), ))); assert!(result.starts_with("!"), "should start with `!`"); @@ -351,9 +337,11 @@ mod tests { #[test] fn charset_unicode_range() { - let result = render(ExpressionKind::Charset(vec![Characters::Range( - Character::Unicode(('\0', "0000".to_string())), - Character::Unicode(('\u{007F}', "007F".to_string())), + let result = render(ExpressionKind::Charset(vec![Expression::new_kind( + ExpressionKind::CharacterRange( + Character::Unicode(('\0', "0000".to_string())), + Character::Unicode(('\u{007F}', "007F".to_string())), + ), )])); assert!(result.contains("\\[")); assert!(result.contains("U+0000")); @@ -363,9 +351,8 @@ mod tests { #[test] fn charset_char_range() { - let result = render(ExpressionKind::Charset(vec![Characters::Range( - Character::Char('a'), - Character::Char('z'), + let result = render(ExpressionKind::Charset(vec![Expression::new_kind( + ExpressionKind::CharacterRange(Character::Char('a'), Character::Char('z')), )])); assert!(result.contains("\\[")); assert!(result.contains("grammar-literal")); @@ -375,9 +362,11 @@ mod tests { #[test] fn charset_mixed_range() { // [`a`-U+007A] - let result = render(ExpressionKind::Charset(vec![Characters::Range( - Character::Char('a'), - Character::Unicode(('\u{007A}', "007A".to_string())), + let result = render(ExpressionKind::Charset(vec![Expression::new_kind( + ExpressionKind::CharacterRange( + Character::Char('a'), + Character::Unicode(('\u{007A}', "007A".to_string())), + ), )])); assert!(result.contains("grammar-literal")); assert!(result.contains("U+007A")); @@ -400,8 +389,8 @@ mod tests { #[test] fn neg_expression_rendering() { let result = render(ExpressionKind::NegExpression(Box::new( - Expression::new_kind(ExpressionKind::Charset(vec![Characters::Terminal( - "a".to_string(), + Expression::new_kind(ExpressionKind::Charset(vec![Expression::new_kind( + ExpressionKind::Terminal("a".to_string()), )])), ))); assert!( diff --git a/tools/mdbook-spec/src/grammar/render_railroad.rs b/tools/mdbook-spec/src/grammar/render_railroad.rs index ad7b291e57..df4933353a 100644 --- a/tools/mdbook-spec/src/grammar/render_railroad.rs +++ b/tools/mdbook-spec/src/grammar/render_railroad.rs @@ -3,7 +3,7 @@ use super::RenderCtx; use crate::grammar::Grammar; use anyhow::bail; -use grammar::{Character, Characters, Expression, ExpressionKind, Production, RangeLimit}; +use grammar::{Character, Expression, ExpressionKind, Production, RangeLimit}; use railroad::*; use regex::Regex; use std::fmt::Write; @@ -285,8 +285,23 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option return None, ExpressionKind::Comment(_) => return None, ExpressionKind::Charset(set) => { - let ns: Vec<_> = set.iter().map(|c| render_characters(c, cx)).collect(); - Box::new(Choice::>::new(ns)) + let choices: Vec<_> = set + .iter() + .map(|e| render_expression(e, cx, stack)) + .filter_map(|n| n) + .collect(); + Box::new(Choice::>::new(choices)) + } + ExpressionKind::CharacterRange(start, end) => { + let mut s = String::new(); + let write_ch = |ch: &Character, output: &mut String| match ch { + Character::Char(ch) => output.push(*ch), + Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), + }; + write_ch(start, &mut s); + s.push('-'); + write_ch(end, &mut s); + Box::new(Terminal::new(s)) } ExpressionKind::NegExpression(e) => { let n = render_expression(e, cx, stack)?; @@ -313,24 +328,6 @@ fn render_expression(expr: &Expression, cx: &RenderCtx, stack: bool) -> Option Box { - match chars { - Characters::Named(s) => node_for_nt(cx, s), - Characters::Terminal(s) => Box::new(Terminal::new(s.clone())), - Characters::Range(a, b) => { - let mut s = String::new(); - let write_ch = |ch: &Character, output: &mut String| match ch { - Character::Char(ch) => output.push(*ch), - Character::Unicode((_, s)) => write!(output, "U+{s}").unwrap(), - }; - write_ch(a, &mut s); - s.push('-'); - write_ch(b, &mut s); - Box::new(Terminal::new(s)) - } - } -} - fn node_for_nt(cx: &RenderCtx, name: &str) -> Box { let dest = cx .rr_link_map @@ -391,7 +388,7 @@ impl Node for Except { #[cfg(test)] mod tests { use super::*; - use grammar::{Character, Characters, Expression, ExpressionKind, RangeLimit}; + use grammar::{Character, Expression, ExpressionKind, RangeLimit}; /// Render an expression to an SVG string fragment. fn render_to_svg(expr: &Expression) -> Option { @@ -508,8 +505,8 @@ mod tests { fn lookahead_charset() { let expr = Expression::new_kind(ExpressionKind::NegativeLookahead(Box::new( Expression::new_kind(ExpressionKind::Charset(vec![ - Characters::Terminal("e".to_string()), - Characters::Terminal("E".to_string()), + Expression::new_kind(ExpressionKind::Terminal("e".to_string())), + Expression::new_kind(ExpressionKind::Terminal("E".to_string())), ])), ))); let svg = render_to_svg(&expr).unwrap(); @@ -541,9 +538,11 @@ mod tests { #[test] fn charset_unicode_range() { - let expr = Expression::new_kind(ExpressionKind::Charset(vec![Characters::Range( - Character::Unicode(('\0', "0000".to_string())), - Character::Unicode(('\u{007F}', "007F".to_string())), + let expr = Expression::new_kind(ExpressionKind::Charset(vec![Expression::new_kind( + ExpressionKind::CharacterRange( + Character::Unicode(('\0', "0000".to_string())), + Character::Unicode(('\u{007F}', "007F".to_string())), + ), )])); let svg = render_to_svg(&expr).unwrap(); assert!(svg.contains("U+0000")); @@ -552,9 +551,8 @@ mod tests { #[test] fn charset_char_range() { - let expr = Expression::new_kind(ExpressionKind::Charset(vec![Characters::Range( - Character::Char('a'), - Character::Char('z'), + let expr = Expression::new_kind(ExpressionKind::Charset(vec![Expression::new_kind( + ExpressionKind::CharacterRange(Character::Char('a'), Character::Char('z')), )])); let svg = render_to_svg(&expr).unwrap(); assert!(svg.contains("a")); @@ -581,8 +579,8 @@ mod tests { #[test] fn neg_expression_rendering() { let expr = Expression::new_kind(ExpressionKind::NegExpression(Box::new( - Expression::new_kind(ExpressionKind::Charset(vec![Characters::Terminal( - "a".to_string(), + Expression::new_kind(ExpressionKind::Charset(vec![Expression::new_kind( + ExpressionKind::Terminal("a".to_string()), )])), ))); let svg = render_to_svg(&expr).unwrap();