onedrop_eval/evaluator/
preprocess.rs

1//! Regex-driven preprocessing pipeline.
2//!
3//! [`MilkEvaluator::preprocess_expression`] is the entry point: it strips
4//! comments, normalises case, auto-initialises undefined variables, then
5//! chains the rewriter passes in [`super::rewriters`] and
6//! [`super::gmegabuf`] to massage MD2's EEL2 dialect into something
7//! `evalexpr` will accept.
8
9use crate::context::MilkContext;
10use regex::Regex;
11use std::sync::LazyLock;
12
13use super::MilkEvaluator;
14use super::gmegabuf::rewrite_gmegabuf_writes;
15use super::is_builtin_ident;
16use super::rewriters::{
17    rewrite_amp_pipe_to_band_bor, rewrite_arity_mismatched_semis, rewrite_chain_assignments,
18    rewrite_chained_comparisons, rewrite_logical_to_bandbor, rewrite_semis_in_call_args,
19    rewrite_unary_bang_to_bnot, wrap_bare_cmp_assignment, wrap_boolean_assignment_rhs,
20    wrap_chain_args_in_parens, wrap_paren_balanced_cmp,
21};
22
/// Maximum expression length to prevent DoS attacks.
///
/// NOTE(review): the limit is not checked anywhere in this file; presumably
/// the caller enforces it before handing text to the preprocessor — confirm
/// at the use site.
pub(super) const MAX_EXPRESSION_LENGTH: usize = 100_000;

/// Clamp on `loop(N, body)` iteration count. MD2 presets in the corpus
/// run up to `1024 * 1024` (gmegabuf full-clear). The cap is `1 << 21`
/// (2_097_152), i.e. twice that corpus maximum, which leaves headroom while
/// ensuring a single rogue preset can't lock the frame thread for
/// arbitrarily long.
pub(super) const MAX_LOOP_ITER: i64 = 1 << 21;
31
// Pre-compiled regex patterns for performance. Each pattern is compiled
// lazily on first use and then shared for the lifetime of the process.

// Any identifier-shaped token (letter or `_`, then letters/digits/`_`).
// Used by `auto_init_vars` to enumerate candidate variable names.
static VAR_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\b").unwrap());

// `name = <integer>` where the integer literal is NOT followed by another
// digit or a `.` (so `x = 1.5` and `x = 15.5` are left alone). Group 1 is
// the left-hand side, group 2 the (optionally negative) integer, group 3 the
// single character that follows it (or the empty end-of-input match), which
// the replacement has to re-emit because the match consumes it.
static ASSIGNMENT_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(\w+)\s*=\s*(-?\d+)([^\d\.]|$)").unwrap());

// `if(` / `if (` at a word boundary, so an existing `milkif(` is not matched
// again (there is no word boundary between `k` and `i`).
static IF_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\bif\s*\(").unwrap());

// Drop unary `+` that follows `=`, `(`, `,`, `;`, or a binary operator. MD2
// presets often write `decay = + if(…)` or `if(a; b; +.5*q4)` which evalexpr
// rejects as "Operator binaire avec un seul opérande" (binary operator with
// a single operand). The preprocess pass re-applies the regex (up to three
// times, stopping early once the text no longer changes) to catch chains
// like `= + +.5` (two unaries in a row). Arbitrary whitespace between `+`
// and its operand is allowed: corpus presets do `decay = + milkif(…)`.
static UNARY_PLUS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(^|[=,(;<>?:!*/+%\-])[ \t]*\+[ \t]*(?P<rest>[A-Za-z_.\d(])").unwrap()
});

// `.5` literal at expression start or after an operator → `0.5`. evalexpr's
// number grammar requires the leading digit; HLSL/MD2 presets accept the
// dot-prefixed form. Group 1 is the delimiter before the dot (anything that
// is not an identifier character, digit or `.`), `d` is the first digit
// after it.
static DOT_LITERAL_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(^|[^a-zA-Z0-9_.])\.(?P<d>\d)").unwrap());

// `var = (60)`. The integer-literal-in-parens form trips evalexpr's
// strict-typed `=` against the Float-init'd LHS. Promote the inner literal
// to a float so the surrounding parens are harmless. `head` is the `= (`
// prefix, `lit` the integer, `tail` the closing paren (with any whitespace).
static PAREN_INT_RHS_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?P<head>=\s*\(\s*)(?P<lit>-?\d+)(?P<tail>\s*\))").unwrap());
63
64impl MilkEvaluator {
65    /// Pre-process expression to handle auto-initialization and type conversion.
66    pub(super) fn preprocess_expression(&mut self, expression: &str) -> String {
67        // Strip `// …` line comments before any other rewrite. Every other
68        // regex pass downstream assumes the input is pure expression syntax;
69        // an embedded comment slips through evalexpr and dies with "An
70        // operator expected 1 arguments, but got 0."
71        let expr_owned = strip_line_comments(expression);
72        let expr = expr_owned.trim();
73
74        // MD2 EEL is case-insensitive; presets ship `Sin`, `INT`, `If`,
75        // `Above`, etc. evalexpr only binds the lowercase form. Lowercase any
76        // identifier whose ascii-lowered spelling is a registered builtin;
77        // user variables (random case) stay untouched so the auto-init below
78        // sees them at their original spelling.
79        let expr_lower = lowercase_builtin_idents(expr);
80        let expr = expr_lower.as_str();
81
82        // Extract variable names from the expression using pre-compiled regex.
83        auto_init_vars(self.context_mut(), expr);
84
85        // Convert integer literals to floats in assignments.
86        // e.g., "zoom = 1" -> "zoom = 1.0".
87        let mut result = ASSIGNMENT_REGEX
88            .replace_all(expr, "$1 = $2.0$3")
89            .to_string();
90
91        // Same coercion for the parenthesised form `var = (60)`.
92        result = PAREN_INT_RHS_REGEX
93            .replace_all(&result, "$head$lit.0$tail")
94            .to_string();
95
96        // Replace `if(` with `milkif(` to use the registered float-accepting
97        // conditional.
98        result = IF_REGEX.replace_all(&result, "milkif(").to_string();
99
100        // `a & b` and `a | b` are MD2 numeric AND/OR (returns 1.0 if both/any
101        // non-zero, else 0.0). Walk out from the operator to find the
102        // bracketed/ident/number operand on each side. Idempotent: `&&`,
103        // `||`, `&=`, `|=` stay untouched.
104        result = rewrite_amp_pipe_to_band_bor(&result);
105
106        // `.5` → `0.5`. Loop until fixed point: `replace_all` is
107        // non-overlapping and consumes its delimiter, so chains like
108        // `+.5+.5` need two passes.
109        for _ in 0..3 {
110            let new = DOT_LITERAL_REGEX
111                .replace_all(&result, "${1}0.$d")
112                .to_string();
113            if new == result {
114                break;
115            }
116            result = new;
117        }
118
119        // Drop leading unary `+` that follows `=`, `(`, `,`, `;` or a binary
120        // op. Two passes catch adjacent unaries.
121        for _ in 0..3 {
122            let new = UNARY_PLUS_REGEX.replace_all(&result, "$1$rest").to_string();
123            if new == result {
124                break;
125            }
126            result = new;
127        }
128
129        // `var = (cmp)` → `var = milkif(cmp, 1, 0)`. evalexpr's `=` operator
130        // type-checks the RHS against the LHS's stored type; since auto-init
131        // seeds every variable as Float(0.0), assigning a Boolean produced
132        // by a comparison errors out.
133        result = wrap_boolean_assignment_rhs(&result);
134
135        // Paren-balanced bool wrap. Catches `(lev1-gmegabuf(1)>0)` and
136        // `(y<=(0.4+0.1*cos(mang)))` shapes the regex above misses because
137        // the inner parens defeat the lookahead.
138        result = wrap_paren_balanced_cmp(&result);
139
140        // Bare comparison as the RHS of a simple assignment (`q10 =
141        // rand(100) >= 30`).
142        result = wrap_bare_cmp_assignment(&result);
143
144        // `gmegabuf(<idx>) = <val>` (and the compound forms) → `gmegabuf_set`.
145        result = rewrite_gmegabuf_writes(&result);
146
147        // `a = b = expr` chain assignment → semicolon-chained simple
148        // assignments. evalexpr's `=` returns `Empty`, which would otherwise
149        // break the outer `=`.
150        result = rewrite_chain_assignments(&result);
151
152        // `;`→`,` rewriter inside function-call arg lists. `milkif(cond; then;
153        // else)` and `gmegabuf_set(idx; val)` are common in the corpus.
154        // `loop` / `exec2` / `exec3` / `while` are skipped at their own level
155        // because their interceptors rely on `;`-chain semantics inside the
156        // arg body; nested non-skipped calls inside such a parent are still
157        // processed.
158        result = rewrite_semis_in_call_args(&result);
159
160        // Arity-aware companion: for builtins whose argument count is known
161        // (`milkif`, `clamp`, `pow`, `min`/`max`, `gmegabuf_set`, …), convert
162        // top-level `;` to `,` when the mixed `,` + `;` count exactly fills
163        // the expected arity. Catches `milkif(cond, a; b)` and friends that
164        // the conservative all-or-nothing pass above leaves alone.
165        result = rewrite_arity_mismatched_semis(&result);
166
167        // Wrap `;`-chain THEN-branches in `(...)` for 3-arg `milkif` whose
168        // THEN-branch is a `;`-chain.
169        result = wrap_chain_args_in_parens(&result);
170
171        // Python-style chained comparisons (`a > b <= c <= 1`) → explicit
172        // AND-chain. evalexpr is left-associative on cmps and dies on
173        // `Boolean <= Number`; MD2's EEL2 reads the chain as pairwise AND.
174        result = rewrite_chained_comparisons(&result);
175
176        // `a && b` / `a || b` → `band(a, b)` / `bor(a, b)`. evalexpr's
177        // logical operators are strict Boolean; MD2 EEL2 collapses both
178        // operands to numeric 0/1.
179        result = rewrite_logical_to_bandbor(&result);
180
181        // Unary `!x` → `bnot(x)`. evalexpr's `!` is strict Boolean →
182        // Boolean; MD2 EEL2 reads `!x` as numeric.
183        result = rewrite_unary_bang_to_bnot(&result);
184
185        result
186    }
187}
188
189/// Walk every identifier in `expr` and auto-init unknown variables to 0.
190fn auto_init_vars(ctx: &mut MilkContext, expr: &str) {
191    for cap in VAR_REGEX.captures_iter(expr) {
192        let var_name = &cap[1];
193        if is_builtin_ident(var_name) {
194            continue;
195        }
196        if ctx.get(var_name).is_none() {
197            ctx.set(var_name, 0.0);
198        }
199    }
200}
201
202/// Strip C++-style `// …` line comments from MD2 expression source.
203///
204/// The MD2 `.milk` format isn't formally specified, but in practice
205/// authors append `// notes` to per-frame equations. Our `.milk` parser
206/// splits on `\n`/`;` and ships the trailing fragment to the evaluator,
207/// where evalexpr trips on the unbalanced `/` operator. Run before any
208/// other rewrite so the comment can't contain syntax that later passes
209/// would mistakenly transform.
210pub(super) fn strip_line_comments(s: &str) -> String {
211    let mut out = String::with_capacity(s.len());
212    let bytes = s.as_bytes();
213    let mut i = 0usize;
214    while i < bytes.len() {
215        if i + 1 < bytes.len() && bytes[i] == b'/' && bytes[i + 1] == b'/' {
216            while i < bytes.len() && bytes[i] != b'\n' {
217                i += 1;
218            }
219            continue;
220        }
221        let mut end = i + 1;
222        while end < bytes.len() && (bytes[end] & 0xC0) == 0x80 {
223            end += 1;
224        }
225        out.push_str(&s[i..end]);
226        i = end;
227    }
228    out
229}
230
231/// Lowercase any identifier that matches a registered builtin (Sin → sin,
232/// INT → int, Above → above, etc.) while leaving user variables at their
233/// original case. MD2 EEL is case-insensitive; evalexpr is not.
234fn lowercase_builtin_idents(s: &str) -> String {
235    static IDENT_REGEX: LazyLock<Regex> =
236        LazyLock::new(|| Regex::new(r"\b([A-Za-z_][A-Za-z0-9_]*)\b").unwrap());
237    IDENT_REGEX
238        .replace_all(s, |caps: &regex::Captures| {
239            let name = &caps[1];
240            // Fast path: already lower-case.
241            if name.bytes().all(|b| !b.is_ascii_uppercase()) {
242                return name.to_string();
243            }
244            let lower = name.to_ascii_lowercase();
245            if is_builtin_ident(&lower) {
246                lower
247            } else {
248                name.to_string()
249            }
250        })
251        .to_string()
252}