Skip to content

Commit 8513fd6

Browse files
committed
Move grammar loading to a dedicated crate
This will allow building tools that work on the grammar.
1 parent 6dc0944 commit 8513fd6

File tree

14 files changed

+660
-635
lines changed

14 files changed

+660
-635
lines changed

.github/workflows/main.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ jobs:
9494
rustc -Vv
9595
- name: Verify tools workspace lockfile is current
9696
run: cargo update -p mdbook-spec --locked
97-
- name: Test libraries
97+
- name: Test tools
9898
run: cargo test
9999

100100
preview:

Cargo.lock

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,7 @@ license = "MIT OR Apache-2.0"
1313

1414
[workspace.dependencies]
1515
diagnostics = { path = "tools/diagnostics" }
16+
grammar = { path = "tools/grammar" }
17+
pathdiff = "0.2.3"
18+
regex = "1.12.2"
19+
walkdir = "2.5.0"

book.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,4 +100,4 @@ edition = "2024"
100100
command = "cargo run --release --manifest-path tools/mdbook-spec/Cargo.toml"
101101

102102
[build]
103-
extra-watch-dirs = ["tools/mdbook-spec/src"]
103+
extra-watch-dirs = ["tools/mdbook-spec/src", "tools/grammar/src"]

tools/diagnostics/src/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,4 +49,3 @@ macro_rules! bug {
4949
std::process::exit(1);
5050
};
5151
}
52-

tools/grammar/Cargo.toml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
[package]
2+
name = "grammar"
3+
edition.workspace = true
4+
license.workspace = true
5+
6+
[dependencies]
7+
diagnostics.workspace = true
8+
pathdiff.workspace = true
9+
regex.workspace = true
10+
walkdir.workspace = true

tools/grammar/src/lib.rs

Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
//! Support for loading the grammar.
2+
3+
use diagnostics::{Diagnostics, warn_or_err};
4+
use regex::Regex;
5+
use std::collections::{HashMap, HashSet};
6+
use std::path::{Path, PathBuf};
7+
use std::sync::LazyLock;
8+
use walkdir::WalkDir;
9+
10+
mod parser;
11+
12+
#[derive(Debug, Default)]
13+
pub struct Grammar {
14+
pub productions: HashMap<String, Production>,
15+
/// The order that the production names were discovered.
16+
pub name_order: Vec<String>,
17+
}
18+
19+
#[derive(Debug)]
20+
pub struct Production {
21+
pub name: String,
22+
/// Comments and breaks that precede the production name.
23+
pub comments: Vec<Expression>,
24+
/// Category is from the markdown lang string, and defines how it is
25+
/// grouped and organized on the summary page.
26+
pub category: String,
27+
pub expression: Expression,
28+
/// The path to the chapter where this is defined, relative to the book's
29+
/// `src` directory.
30+
pub path: PathBuf,
31+
pub is_root: bool,
32+
}
33+
34+
#[derive(Clone, Debug)]
35+
pub struct Expression {
36+
pub kind: ExpressionKind,
37+
/// Suffix is the `_foo_` part that is shown as a subscript.
38+
pub suffix: Option<String>,
39+
/// A footnote is a markdown footnote link.
40+
pub footnote: Option<String>,
41+
}
42+
43+
#[derive(Clone, Debug)]
44+
pub enum ExpressionKind {
45+
/// `( A B C )`
46+
Grouped(Box<Expression>),
47+
/// `A | B | C`
48+
Alt(Vec<Expression>),
49+
/// `A B C`
50+
Sequence(Vec<Expression>),
51+
/// `A?`
52+
Optional(Box<Expression>),
53+
/// `A*`
54+
Repeat(Box<Expression>),
55+
/// `A*?`
56+
RepeatNonGreedy(Box<Expression>),
57+
/// `A+`
58+
RepeatPlus(Box<Expression>),
59+
/// `A+?`
60+
RepeatPlusNonGreedy(Box<Expression>),
61+
/// `A{2..4}`
62+
RepeatRange(Box<Expression>, Option<u32>, Option<u32>),
63+
/// `NonTerminal`
64+
Nt(String),
65+
/// `` `string` ``
66+
Terminal(String),
67+
/// `<english description>`
68+
Prose(String),
69+
/// An LF followed by the given number of spaces.
70+
///
71+
/// Used by the renderer to help format and structure the grammar.
72+
Break(usize),
73+
/// `// Single line comment.`
74+
Comment(String),
75+
/// ``[`A`-`Z` `_` LF]``
76+
Charset(Vec<Characters>),
77+
/// ``~[` ` LF]``
78+
NegExpression(Box<Expression>),
79+
/// `U+0060`
80+
Unicode(String),
81+
}
82+
83+
/// One entry of a character set (see [`ExpressionKind::Charset`]).
#[derive(Clone, Debug)]
pub enum Characters {
    /// A character referred to by name, such as `LF`.
    Named(String),
    /// A literal written in backticks, such as `` `_` ``.
    Terminal(String),
    /// A range between two characters, such as `` `A`-`Z` ``.
    Range(char, char),
}
92+
93+
impl Grammar {
94+
fn visit_nt(&self, callback: &mut dyn FnMut(&str)) {
95+
for p in self.productions.values() {
96+
p.expression.visit_nt(callback);
97+
}
98+
}
99+
}
100+
101+
impl Expression {
102+
pub fn new_kind(kind: ExpressionKind) -> Self {
103+
Self {
104+
kind,
105+
suffix: None,
106+
footnote: None,
107+
}
108+
}
109+
110+
fn visit_nt(&self, callback: &mut dyn FnMut(&str)) {
111+
match &self.kind {
112+
ExpressionKind::Grouped(e)
113+
| ExpressionKind::Optional(e)
114+
| ExpressionKind::Repeat(e)
115+
| ExpressionKind::RepeatNonGreedy(e)
116+
| ExpressionKind::RepeatPlus(e)
117+
| ExpressionKind::RepeatPlusNonGreedy(e)
118+
| ExpressionKind::RepeatRange(e, _, _)
119+
| ExpressionKind::NegExpression(e) => {
120+
e.visit_nt(callback);
121+
}
122+
ExpressionKind::Alt(es) | ExpressionKind::Sequence(es) => {
123+
for e in es {
124+
e.visit_nt(callback);
125+
}
126+
}
127+
ExpressionKind::Nt(nt) => {
128+
callback(&nt);
129+
}
130+
ExpressionKind::Terminal(_)
131+
| ExpressionKind::Prose(_)
132+
| ExpressionKind::Break(_)
133+
| ExpressionKind::Comment(_)
134+
| ExpressionKind::Unicode(_) => {}
135+
ExpressionKind::Charset(set) => {
136+
for ch in set {
137+
match ch {
138+
Characters::Named(s) => callback(s),
139+
Characters::Terminal(_) | Characters::Range(_, _) => {}
140+
}
141+
}
142+
}
143+
}
144+
}
145+
146+
pub fn is_break(&self) -> bool {
147+
matches!(self.kind, ExpressionKind::Break(_))
148+
}
149+
}
150+
151+
/// Matches a fenced code block whose opening fence starts with
/// `` ```grammar, ``.
///
/// Capture group 1 is the rest of the opening fence line after `grammar,`
/// and capture group 2 is the text of the block up to the closing fence.
pub static GRAMMAR_RE: LazyLock<Regex> = LazyLock::new(|| {
    // `m` makes `^` match at the start of every line, and `s` lets `.` match
    // newlines so that a block may span multiple lines.
    let pattern = r"(?ms)^```grammar,([^\n]+)\n(.*?)^```";
    Regex::new(pattern).unwrap()
});
153+
154+
/// Loads the [`Grammar`] from the book.
155+
pub fn load_grammar(diag: &mut Diagnostics) -> Grammar {
156+
let mut grammar = Grammar::default();
157+
let base = Path::new(env!("CARGO_MANIFEST_DIR")).join("../../src");
158+
for entry in WalkDir::new(&base) {
159+
let entry = entry.unwrap();
160+
let path = entry.path();
161+
if path.extension().and_then(|s| s.to_str()) != Some("md") {
162+
continue;
163+
}
164+
let content = std::fs::read_to_string(path).unwrap();
165+
let relative_path = pathdiff::diff_paths(path, &base).expect("one path must be absolute");
166+
for cap in GRAMMAR_RE.captures_iter(&content) {
167+
let category = &cap[1];
168+
let input = &cap[2];
169+
if let Err(e) = parser::parse_grammar(input, &mut grammar, category, &relative_path) {
170+
warn_or_err!(diag, "failed to parse grammar in {path:?}: {e}");
171+
}
172+
}
173+
}
174+
175+
check_undefined_nt(&grammar, diag);
176+
check_unexpected_roots(&grammar, diag);
177+
grammar
178+
}
179+
180+
/// Checks for nonterminals that are used but not defined.
181+
fn check_undefined_nt(grammar: &Grammar, diag: &mut Diagnostics) {
182+
grammar.visit_nt(&mut |nt| {
183+
if !grammar.productions.contains_key(nt) {
184+
warn_or_err!(diag, "non-terminal `{nt}` is used but not defined");
185+
}
186+
});
187+
}
188+
189+
/// This checks that all the grammar roots are what we expect.
190+
///
191+
/// This is intended to help catch any unexpected misspellings, orphaned
192+
/// productions, or general mistakes.
193+
fn check_unexpected_roots(grammar: &Grammar, diag: &mut Diagnostics) {
194+
// `set` starts with every production name.
195+
let mut set: HashSet<_> = grammar.name_order.iter().map(|s| s.as_str()).collect();
196+
fn remove(set: &mut HashSet<&str>, grammar: &Grammar, prod: &Production, root_name: &str) {
197+
prod.expression.visit_nt(&mut |nt| {
198+
// Leave the root name in the set if we find it recursively.
199+
if nt == root_name {
200+
return;
201+
}
202+
if !set.remove(nt) {
203+
return;
204+
}
205+
if let Some(nt_prod) = grammar.productions.get(nt) {
206+
remove(set, grammar, nt_prod, root_name);
207+
}
208+
});
209+
}
210+
// Walk the productions starting from the root nodes, and remove every
211+
// non-terminal from `set`. What's left must be the set of roots.
212+
grammar
213+
.productions
214+
.values()
215+
.filter(|prod| prod.is_root)
216+
.for_each(|root| {
217+
remove(&mut set, grammar, root, &root.name);
218+
});
219+
let expected: HashSet<_> = grammar
220+
.productions
221+
.values()
222+
.filter_map(|p| p.is_root.then(|| p.name.as_str()))
223+
.collect();
224+
if set != expected {
225+
let new: Vec<_> = set.difference(&expected).collect();
226+
let removed: Vec<_> = expected.difference(&set).collect();
227+
if !new.is_empty() {
228+
warn_or_err!(
229+
diag,
230+
"New grammar production detected that is not used in any root-accessible\n\
231+
production. If this is expected, mark the production with\n\
232+
`@root`. If not, make sure it is spelled correctly and used in\n\
233+
another root-accessible production.\n\
234+
\n\
235+
The new names are: {new:?}\n"
236+
);
237+
} else if !removed.is_empty() {
238+
warn_or_err!(
239+
diag,
240+
"Old grammar production root seems to have been removed\n\
241+
(it is used in some other production that is root-accessible).\n\
242+
If this is expected, remove `@root` from the production.\n\
243+
\n\
244+
The removed names are: {removed:?}\n"
245+
);
246+
} else {
247+
unreachable!("unexpected");
248+
}
249+
}
250+
}

tools/mdbook-spec/Cargo.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,14 @@ default-run = "mdbook-spec"
1111
[dependencies]
1212
anyhow = "1.0.79"
1313
diagnostics.workspace = true
14+
grammar.workspace = true
1415
mdbook-markdown = "0.5.1"
1516
mdbook-preprocessor = "0.5.1"
1617
once_cell = "1.19.0"
1718
pathdiff = "0.2.1"
1819
railroad = { version = "0.3.2", default-features = false }
19-
regex = "1.9.4"
20+
regex.workspace = true
2021
semver = "1.0.21"
2122
serde_json = "1.0.113"
2223
tempfile = "3.10.1"
23-
walkdir = "2.5.0"
24+
walkdir.workspace = true

0 commit comments

Comments
 (0)