onedrop_eval/evaluator/preprocess.rs
1//! Regex-driven preprocessing pipeline.
2//!
3//! [`MilkEvaluator::preprocess_expression`] is the entry point: it strips
4//! comments, normalises case, auto-initialises undefined variables, then
5//! chains the rewriter passes in [`super::rewriters`] and
6//! [`super::gmegabuf`] to massage MD2's EEL2 dialect into something
7//! `evalexpr` will accept.
8
9use crate::context::MilkContext;
10use regex::Regex;
11use std::sync::LazyLock;
12
13use super::MilkEvaluator;
14use super::gmegabuf::rewrite_gmegabuf_writes;
15use super::is_builtin_ident;
16use super::rewriters::{
17 rewrite_amp_pipe_to_band_bor, rewrite_arity_mismatched_semis, rewrite_chain_assignments,
18 rewrite_chained_comparisons, rewrite_logical_to_bandbor, rewrite_semis_in_call_args,
19 rewrite_unary_bang_to_bnot, wrap_bare_cmp_assignment, wrap_boolean_assignment_rhs,
20 wrap_chain_args_in_parens, wrap_paren_balanced_cmp,
21};
22
/// Maximum expression length to prevent DoS attacks.
///
/// NOTE(review): nothing in this file's preprocessing code reads this
/// constant; presumably the caller checks it before preprocessing — confirm.
pub(super) const MAX_EXPRESSION_LENGTH: usize = 100_000;

/// Clamp on `loop(N, body)` iteration count. MD2 presets in the corpus
/// run up to `1024 * 1024` (gmegabuf full-clear). The clamp is
/// `1 << 21` = 2_097_152, i.e. twice that corpus maximum, so legitimate
/// presets fit with headroom while a single rogue preset can't lock the
/// frame thread for arbitrarily long.
pub(super) const MAX_LOOP_ITER: i64 = 1 << 21;
31
// Pre-compiled regex patterns for performance. Each one is built once, on
// first use, through `LazyLock`.

// Any identifier-shaped token (`[a-zA-Z_][a-zA-Z0-9_]*`) on word boundaries.
// `auto_init_vars` walks these matches to discover variable names.
static VAR_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\b").unwrap());

// `name = <integer literal>` where the literal is not followed by another
// digit or a `.` (so existing floats are left alone). Groups: 1 = the name,
// 2 = the optionally negative integer, 3 = the single character that follows
// the literal (empty at end of input), which the replacement must re-emit.
static ASSIGNMENT_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(\w+)\s*=\s*(-?\d+)([^\d\.]|$)").unwrap());

// The head of an `if(` call, with optional whitespace before the paren.
static IF_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\bif\s*\(").unwrap());

// Drop unary `+` that follows `=`, `(`, `,`, `;`, or a binary operator. MD2
// presets often write `decay = + if(…)` or `if(a; b; +.5*q4)` which evalexpr
// rejects (a binary operator with only one operand). A single pass cannot
// catch chains like `= + +.5` (two unaries in a row), so the preprocess pass
// re-applies this regex, up to three passes. Any run of spaces or tabs
// between `+` and its operand is allowed: corpus presets do
// `decay = + milkif(…)`.
static UNARY_PLUS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(^|[=,(;<>?:!*/+%\-])[ \t]*\+[ \t]*(?P<rest>[A-Za-z_.\d(])").unwrap()
});

// `.5` literal at expression start or after an operator → `0.5`. evalexpr's
// number grammar requires the leading digit; HLSL/MD2 presets accept the
// dot-prefixed form. The leading group excludes identifier characters and
// `.`, so `a.5` and `1.5` are left untouched.
static DOT_LITERAL_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(^|[^a-zA-Z0-9_.])\.(?P<d>\d)").unwrap());

// `var = (60)`. The integer-literal-in-parens form trips evalexpr's
// strict-typed `=` against the Float-init'd LHS. Promote the inner literal
// to a float so the surrounding parens are harmless. `head` and `tail`
// capture the text around the literal so the replacement can keep it as-is.
static PAREN_INT_RHS_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?P<head>=\s*\(\s*)(?P<lit>-?\d+)(?P<tail>\s*\))").unwrap());
63
64impl MilkEvaluator {
65 /// Pre-process expression to handle auto-initialization and type conversion.
66 pub(super) fn preprocess_expression(&mut self, expression: &str) -> String {
67 // Strip `// …` line comments before any other rewrite. Every other
68 // regex pass downstream assumes the input is pure expression syntax;
69 // an embedded comment slips through evalexpr and dies with "An
70 // operator expected 1 arguments, but got 0."
71 let expr_owned = strip_line_comments(expression);
72 let expr = expr_owned.trim();
73
74 // MD2 EEL is case-insensitive; presets ship `Sin`, `INT`, `If`,
75 // `Above`, etc. evalexpr only binds the lowercase form. Lowercase any
76 // identifier whose ascii-lowered spelling is a registered builtin;
77 // user variables (random case) stay untouched so the auto-init below
78 // sees them at their original spelling.
79 let expr_lower = lowercase_builtin_idents(expr);
80 let expr = expr_lower.as_str();
81
82 // Extract variable names from the expression using pre-compiled regex.
83 auto_init_vars(self.context_mut(), expr);
84
85 // Convert integer literals to floats in assignments.
86 // e.g., "zoom = 1" -> "zoom = 1.0".
87 let mut result = ASSIGNMENT_REGEX
88 .replace_all(expr, "$1 = $2.0$3")
89 .to_string();
90
91 // Same coercion for the parenthesised form `var = (60)`.
92 result = PAREN_INT_RHS_REGEX
93 .replace_all(&result, "$head$lit.0$tail")
94 .to_string();
95
96 // Replace `if(` with `milkif(` to use the registered float-accepting
97 // conditional.
98 result = IF_REGEX.replace_all(&result, "milkif(").to_string();
99
100 // `a & b` and `a | b` are MD2 numeric AND/OR (returns 1.0 if both/any
101 // non-zero, else 0.0). Walk out from the operator to find the
102 // bracketed/ident/number operand on each side. Idempotent: `&&`,
103 // `||`, `&=`, `|=` stay untouched.
104 result = rewrite_amp_pipe_to_band_bor(&result);
105
106 // `.5` → `0.5`. Loop until fixed point: `replace_all` is
107 // non-overlapping and consumes its delimiter, so chains like
108 // `+.5+.5` need two passes.
109 for _ in 0..3 {
110 let new = DOT_LITERAL_REGEX
111 .replace_all(&result, "${1}0.$d")
112 .to_string();
113 if new == result {
114 break;
115 }
116 result = new;
117 }
118
119 // Drop leading unary `+` that follows `=`, `(`, `,`, `;` or a binary
120 // op. Two passes catch adjacent unaries.
121 for _ in 0..3 {
122 let new = UNARY_PLUS_REGEX.replace_all(&result, "$1$rest").to_string();
123 if new == result {
124 break;
125 }
126 result = new;
127 }
128
129 // `var = (cmp)` → `var = milkif(cmp, 1, 0)`. evalexpr's `=` operator
130 // type-checks the RHS against the LHS's stored type; since auto-init
131 // seeds every variable as Float(0.0), assigning a Boolean produced
132 // by a comparison errors out.
133 result = wrap_boolean_assignment_rhs(&result);
134
135 // Paren-balanced bool wrap. Catches `(lev1-gmegabuf(1)>0)` and
136 // `(y<=(0.4+0.1*cos(mang)))` shapes the regex above misses because
137 // the inner parens defeat the lookahead.
138 result = wrap_paren_balanced_cmp(&result);
139
140 // Bare comparison as the RHS of a simple assignment (`q10 =
141 // rand(100) >= 30`).
142 result = wrap_bare_cmp_assignment(&result);
143
144 // `gmegabuf(<idx>) = <val>` (and the compound forms) → `gmegabuf_set`.
145 result = rewrite_gmegabuf_writes(&result);
146
147 // `a = b = expr` chain assignment → semicolon-chained simple
148 // assignments. evalexpr's `=` returns `Empty`, which would otherwise
149 // break the outer `=`.
150 result = rewrite_chain_assignments(&result);
151
152 // `;`→`,` rewriter inside function-call arg lists. `milkif(cond; then;
153 // else)` and `gmegabuf_set(idx; val)` are common in the corpus.
154 // `loop` / `exec2` / `exec3` / `while` are skipped at their own level
155 // because their interceptors rely on `;`-chain semantics inside the
156 // arg body; nested non-skipped calls inside such a parent are still
157 // processed.
158 result = rewrite_semis_in_call_args(&result);
159
160 // Arity-aware companion: for builtins whose argument count is known
161 // (`milkif`, `clamp`, `pow`, `min`/`max`, `gmegabuf_set`, …), convert
162 // top-level `;` to `,` when the mixed `,` + `;` count exactly fills
163 // the expected arity. Catches `milkif(cond, a; b)` and friends that
164 // the conservative all-or-nothing pass above leaves alone.
165 result = rewrite_arity_mismatched_semis(&result);
166
167 // Wrap `;`-chain THEN-branches in `(...)` for 3-arg `milkif` whose
168 // THEN-branch is a `;`-chain.
169 result = wrap_chain_args_in_parens(&result);
170
171 // Python-style chained comparisons (`a > b <= c <= 1`) → explicit
172 // AND-chain. evalexpr is left-associative on cmps and dies on
173 // `Boolean <= Number`; MD2's EEL2 reads the chain as pairwise AND.
174 result = rewrite_chained_comparisons(&result);
175
176 // `a && b` / `a || b` → `band(a, b)` / `bor(a, b)`. evalexpr's
177 // logical operators are strict Boolean; MD2 EEL2 collapses both
178 // operands to numeric 0/1.
179 result = rewrite_logical_to_bandbor(&result);
180
181 // Unary `!x` → `bnot(x)`. evalexpr's `!` is strict Boolean →
182 // Boolean; MD2 EEL2 reads `!x` as numeric.
183 result = rewrite_unary_bang_to_bnot(&result);
184
185 result
186 }
187}
188
189/// Walk every identifier in `expr` and auto-init unknown variables to 0.
190fn auto_init_vars(ctx: &mut MilkContext, expr: &str) {
191 for cap in VAR_REGEX.captures_iter(expr) {
192 let var_name = &cap[1];
193 if is_builtin_ident(var_name) {
194 continue;
195 }
196 if ctx.get(var_name).is_none() {
197 ctx.set(var_name, 0.0);
198 }
199 }
200}
201
202/// Strip C++-style `// …` line comments from MD2 expression source.
203///
204/// The MD2 `.milk` format isn't formally specified, but in practice
205/// authors append `// notes` to per-frame equations. Our `.milk` parser
206/// splits on `\n`/`;` and ships the trailing fragment to the evaluator,
207/// where evalexpr trips on the unbalanced `/` operator. Run before any
208/// other rewrite so the comment can't contain syntax that later passes
209/// would mistakenly transform.
210pub(super) fn strip_line_comments(s: &str) -> String {
211 let mut out = String::with_capacity(s.len());
212 let bytes = s.as_bytes();
213 let mut i = 0usize;
214 while i < bytes.len() {
215 if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'/' {
216 while i < bytes.len() && bytes[i] != b'\n' {
217 i += 1;
218 }
219 continue;
220 }
221 let mut end = i + 1;
222 while end < bytes.len() && (bytes[end] & 0xC0) == 0x80 {
223 end += 1;
224 }
225 out.push_str(&s[i..end]);
226 i = end;
227 }
228 out
229}
230
231/// Lowercase any identifier that matches a registered builtin (Sin → sin,
232/// INT → int, Above → above, etc.) while leaving user variables at their
233/// original case. MD2 EEL is case-insensitive; evalexpr is not.
234fn lowercase_builtin_idents(s: &str) -> String {
235 static IDENT_REGEX: LazyLock<Regex> =
236 LazyLock::new(|| Regex::new(r"\b([A-Za-z_][A-Za-z0-9_]*)\b").unwrap());
237 IDENT_REGEX
238 .replace_all(s, |caps: ®ex::Captures| {
239 let name = &caps[1];
240 // Fast path: already lower-case.
241 if name.bytes().all(|b| !b.is_ascii_uppercase()) {
242 return name.to_string();
243 }
244 let lower = name.to_ascii_lowercase();
245 if is_builtin_ident(&lower) {
246 lower
247 } else {
248 name.to_string()
249 }
250 })
251 .to_string()
252}