onedrop_engine/engine/
wave_phase.rs

1//! Custom-wave (`wavecode_N`) phase of the engine update loop.
2//!
3//! Runs each enabled wave's pre-compiled `per_frame_init` (once per preset
4//! load), `per_frame`, and `per_point` blocks, then returns the resulting
5//! vertex stream + per-wave batches. Callers (engine) push the result to
6//! the right [`onedrop_renderer::RenderChain`] — primary for the current
7//! preset, secondary for the outgoing preset during a transition.
8//!
9//! ## Parallelism
10//!
11//! Each of the up-to-4 enabled wavecode blocks is independent (per-MD2,
12//! waves don't share state across blocks within one frame), so the four
13//! per-wave per_frame + 512-sample loops fan out across rayon workers.
14//! The dominant cost in 144.milk / 207.milk class presets is custom
15//! waves' per_point eval (~37 ms / frame on a 12-thread box,
16//! single-threaded). After this parallelisation: ~10-12 ms.
17
18use super::preset_slot::{CompiledWave, PresetSlot, run_block_with_logger};
19use onedrop_eval::{MilkEvaluator, WavePoint};
20use onedrop_parser::preset::WaveCode;
21use onedrop_renderer::{CustomWaveBatch, CustomWaveVertex};
22use rayon::prelude::*;
23
/// Minimum sample count at which a wave's per_point loop is fanned out
/// across rayon workers.
///
/// Below this, rayon's `map_init` overhead (per-worker `MilkEvaluator`
/// clone + work-stealing dispatch) exceeds the eval time we'd save, so the
/// loop stays sequential. The comparison is inclusive: a wave with exactly
/// this many samples is eligible for the parallel path. 64 lines up roughly
/// with the smallest "dense" waves we care about (the typical preset uses
/// 256-512).
const PARALLEL_SAMPLES_THRESHOLD: usize = 64;
30
/// Output of one wave's sample loop: its vertex slice + the batch metadata
/// describing how to draw it. Collected per wave so the parallel pass
/// doesn't need a shared mutex on the global vertex buffer.
struct WaveOutput {
    /// Vertices emitted by this wave only, in emission order. Empty when the
    /// wave is disabled or has no compiled blocks.
    vertices: Vec<CustomWaveVertex>,
    /// Draw metadata for `vertices`, or `None` when no vertices were
    /// produced. `start_vertex` is relative to this wave (0) until the
    /// caller rewrites it while stitching all waves together.
    batch: Option<CustomWaveBatch>,
}
38
39/// Run every enabled `wavecode_N` block in `slot`'s preset and return the
40/// per-frame vertex stream + batches. Returns empty `Vec`s when no preset is
41/// loaded or no waves are enabled — feed those through to the renderer's
42/// `update_custom_waves` to reset its buffers.
43pub(super) fn compute_custom_waves(
44    slot: &mut PresetSlot,
45    left: &[f32],
46    right: &[f32],
47    spectrum_bins: &[f32],
48) -> (Vec<CustomWaveVertex>, Vec<CustomWaveBatch>) {
49    let Some(preset) = slot.preset.as_ref() else {
50        return (Vec::new(), Vec::new());
51    };
52    if preset.waves.is_empty()
53        || preset.waves.iter().all(|w| !w.enabled)
54        || slot.compiled_waves.is_empty()
55    {
56        return (Vec::new(), Vec::new());
57    }
58
59    // Per-wave parallel pass. Each worker gets its own `MilkEvaluator`
60    // clone, runs the wave's init (if needed) + per_frame + the 512-sample
61    // per_point loop, and returns its vertex slice. Sample loop *inside*
62    // a wave stays sequential because `current` threads state across
63    // samples — per-MD2 idiomatic.
64    //
65    // Disjoint field-borrows on `slot` let us pass `&preset.waves`,
66    // `&slot.compiled_waves` and `&slot.waves_need_init` straight into
67    // the closure — no per-frame `Vec<WaveCode>` clone (each
68    // `WaveCode` carries `Vec<String>` equation lists, so cloning all
69    // 4 of them every frame was actively burning allocator time).
70    let wave_outputs: Vec<WaveOutput> = {
71        let waves: &[WaveCode] = &preset.waves;
72        let base_eval = slot.evaluator.clone();
73        let compiled_waves = &slot.compiled_waves;
74        let needs_init = &slot.waves_need_init;
75        waves
76            .par_iter()
77            .enumerate()
78            .map(|(i, wave)| {
79                compute_one_wave(
80                    i,
81                    wave,
82                    compiled_waves.get(i),
83                    needs_init.get(i).copied().unwrap_or(false),
84                    base_eval.clone(),
85                    left,
86                    right,
87                    spectrum_bins,
88                )
89            })
90            .collect()
91    };
92
93    // Mark all waves as initialised — the actual init ran in worker
94    // clones, but the flag lives on the main slot.
95    for flag in slot.waves_need_init.iter_mut() {
96        *flag = false;
97    }
98
99    // Stitch per-wave vertex slices into one contiguous buffer; fix up the
100    // batch offsets to point at the global vertex array.
101    let total_verts: usize = wave_outputs.iter().map(|o| o.vertices.len()).sum();
102    let mut vertices: Vec<CustomWaveVertex> = Vec::with_capacity(total_verts);
103    let mut batches: Vec<CustomWaveBatch> = Vec::with_capacity(wave_outputs.len());
104    for out in wave_outputs {
105        let offset = vertices.len() as u32;
106        let count = out.vertices.len() as u32;
107        vertices.extend(out.vertices);
108        if let Some(mut batch) = out.batch {
109            batch.start_vertex = offset;
110            batch.vertex_count = count;
111            batches.push(batch);
112        }
113    }
114
115    (vertices, batches)
116}
117
118#[allow(clippy::too_many_arguments)]
119fn compute_one_wave(
120    i: usize,
121    wave: &WaveCode,
122    cw_opt: Option<&CompiledWave>,
123    needs_init: bool,
124    mut eval: MilkEvaluator,
125    left: &[f32],
126    right: &[f32],
127    spectrum_bins: &[f32],
128) -> WaveOutput {
129    let Some(cw) = cw_opt else {
130        return WaveOutput {
131            vertices: Vec::new(),
132            batch: None,
133        };
134    };
135    if !wave.enabled {
136        return WaveOutput {
137            vertices: Vec::new(),
138            batch: None,
139        };
140    }
141
142    if needs_init {
143        run_block_with_logger(&mut eval, &cw.init, |idx, e| {
144            log::warn!("wave[{}] init equation[{}] failed: {}", i, idx, e);
145        });
146    }
147    run_block_with_logger(&mut eval, &cw.per_frame, |idx, e| {
148        log::warn!("wave[{}] per_frame equation[{}] failed: {}", i, idx, e);
149    });
150
151    let samples = wave.samples.clamp(2, 512) as usize;
152    let scaled = wave.scaling as f64;
153    let mut current = WavePoint {
154        sample: 0.0,
155        value1: 0.0,
156        value2: 0.0,
157        x: 0.5,
158        y: 0.5,
159        r: wave.r as f64,
160        g: wave.g as f64,
161        b: wave.b as f64,
162        a: (wave.a as f64).clamp(0.0, 1.0),
163    };
164
165    // Decide once whether to fan samples out across rayon workers.
166    // `analyse_per_point` confirmed the block has no read of
167    // `x`/`y`/`r`/`g`/`b`/`a` before write, no writes to qN/custom
168    // vars, and no stateful function calls — so sample S does not feed
169    // sample S+1 and we're free to reorder them. Tiny waves stay
170    // sequential because rayon's per-worker `MilkEvaluator` clone
171    // (~few µs) would cost more than the eval saves.
172    let parallel_samples = !cw.per_point.is_empty()
173        && cw.per_point_parallelism.is_safe()
174        && samples >= PARALLEL_SAMPLES_THRESHOLD;
175
176    // Pre-compute the per-sample (v1, v2) audio/FFT pair so both paths
177    // share the same input mapping. Time-domain mode pulls `value1`
178    // from the left channel and `value2` from the right; b_spectrum
179    // mode keeps the N/2 mirror trick because the upstream FFT runs
180    // on a downmix (one buffer of magnitudes) and there's no
181    // meaningful "stereo bin" pair.
182    let sample_inputs: Vec<(f64, f64, f64)> = (0..samples)
183        .map(|s| {
184            let sample_norm = s as f64 / (samples - 1) as f64;
185            let (v1, v2) = if wave.b_spectrum {
186                let n = spectrum_bins.len().max(1);
187                let idx = ((sample_norm * (n - 1) as f64) as usize).min(n - 1);
188                let mirror = (idx + n / 2) % n;
189                (spectrum_bins[idx] as f64, spectrum_bins[mirror] as f64)
190            } else if !left.is_empty() {
191                let n_l = left.len();
192                let idx_l = (s * n_l / samples).min(n_l - 1);
193                let v1 = left[idx_l] as f64;
194                // Right channel — fall back to left if the caller
195                // passed an empty/short right buffer (mono callers
196                // routed through `update`). When stereo is real,
197                // `right` is the same length as `left`.
198                let v2 = if !right.is_empty() {
199                    let n_r = right.len();
200                    let idx_r = (s * n_r / samples).min(n_r - 1);
201                    right[idx_r] as f64
202                } else {
203                    v1
204                };
205                (v1, v2)
206            } else {
207                (0.0, 0.0)
208            };
209            (sample_norm, v1 * scaled, v2 * scaled)
210        })
211        .collect();
212
213    // Three geometry modes:
214    // - dots: one screen-space quad per point (6 verts × samples)
215    // - thick lines (!dots && b_draw_thick): one quad per *segment*
216    //   between consecutive points (6 verts × (samples - 1))
217    // - thin lines (!dots && !b_draw_thick): LineStrip, one vert per
218    //   point — 1 px wide
219    let thick_lines = !wave.b_use_dots && wave.b_draw_thick;
220    let mut vertices: Vec<CustomWaveVertex> = if wave.b_use_dots {
221        Vec::with_capacity(samples * 6)
222    } else if thick_lines {
223        Vec::with_capacity(samples.saturating_sub(1) * 6)
224    } else {
225        Vec::with_capacity(samples)
226    };
227
228    // Half-thickness of the thick-line stroke, in clip space. ~0.006
229    // matches the static waveform pass's `b_wave_thick` lookup, so
230    // both flavours of thick lines look identical at the same render
231    // resolution. Dots stay at the historical 0.012 / 0.006 split.
232    const THICK_LINE_CLIP: f32 = 0.006;
233
234    // Body shared by all paths: turn a per-eval point into the vertex
235    // stream the renderer consumes. For thick lines, the segment quad
236    // needs two consecutive points; the caller threads `prev_thick`
237    // through and emits the quad once both ends exist.
238    let push_vertices = |vertices: &mut Vec<CustomWaveVertex>,
239                         prev_thick: &mut Option<WavePoint>,
240                         out: WavePoint| {
241        let pos = onedrop_renderer::custom_wave::preset_xy_to_clip(out.x, out.y);
242        let color = [
243            out.r.clamp(0.0, 1.0) as f32,
244            out.g.clamp(0.0, 1.0) as f32,
245            out.b.clamp(0.0, 1.0) as f32,
246            out.a.clamp(0.0, 1.0) as f32,
247        ];
248        if wave.b_use_dots {
249            let radius = if wave.b_draw_thick { 0.012 } else { 0.006 };
250            let quad =
251                onedrop_renderer::custom_wave::point_to_dot_quad(out.x, out.y, color, radius);
252            vertices.extend_from_slice(&quad);
253        } else if thick_lines {
254            if let Some(prev) = *prev_thick {
255                let color_prev = [
256                    prev.r.clamp(0.0, 1.0) as f32,
257                    prev.g.clamp(0.0, 1.0) as f32,
258                    prev.b.clamp(0.0, 1.0) as f32,
259                    prev.a.clamp(0.0, 1.0) as f32,
260                ];
261                let quad = onedrop_renderer::custom_wave::segment_to_thick_quad(
262                    prev.x,
263                    prev.y,
264                    color_prev,
265                    out.x,
266                    out.y,
267                    color,
268                    THICK_LINE_CLIP,
269                );
270                vertices.extend_from_slice(&quad);
271            }
272            *prev_thick = Some(out);
273        } else {
274            vertices.push(CustomWaveVertex { pos, color });
275        }
276    };
277
278    // Bytecode VM path is preferred when the per_point block lowered
279    // cleanly (no `rand` / `gmegabuf` / unsupported ops). It skips
280    // evalexpr's tree-walking dispatch and runs a flat opcode loop
281    // against array-backed hot vars. The Nodes path stays as
282    // observation-compatible fallback.
283    let per_point_bc = cw.per_point_bytecode.as_ref();
284
285    if parallel_samples {
286        // Carry-free path: every sample starts from the same `current`
287        // seed (post-per_frame, pre-loop). Splitting across rayon
288        // workers gives a ~4-8× speedup on dense waves. Each worker
289        // gets one `MilkEvaluator` clone (~slim — see the eval crate's
290        // `Clone` impl), amortised across its slice of the sample
291        // range.
292        let seed = current;
293        let per_point = &cw.per_point;
294        let base_eval = &eval;
295        let outputs: Vec<WavePoint> = sample_inputs
296            .par_iter()
297            .map_init(
298                || base_eval.clone(),
299                |worker_eval, &(sample_norm, v1s, v2s)| {
300                    let mut p = seed;
301                    p.sample = sample_norm;
302                    p.value1 = v1s;
303                    p.value2 = v2s;
304                    if let Some(bc) = per_point_bc {
305                        worker_eval.run_per_point_bc(p, bc)
306                    } else {
307                        worker_eval.eval_per_point(p, per_point).unwrap_or(p)
308                    }
309                },
310            )
311            .collect();
312        let mut prev_thick: Option<WavePoint> = None;
313        for out in outputs {
314            push_vertices(&mut vertices, &mut prev_thick, out);
315        }
316    } else {
317        // Sequential path — preserves MD2's "trail across samples"
318        // semantics for blocks that read x/y/r/g/b/a as carry.
319        let mut prev_thick: Option<WavePoint> = None;
320        for (s, &(sample_norm, v1s, v2s)) in sample_inputs.iter().enumerate() {
321            current.sample = sample_norm;
322            current.value1 = v1s;
323            current.value2 = v2s;
324            let out = if cw.per_point.is_empty() {
325                WavePoint {
326                    x: sample_norm,
327                    y: 0.5 - v1s * 0.5,
328                    ..current
329                }
330            } else if let Some(bc) = per_point_bc {
331                eval.run_per_point_bc(current, bc)
332            } else {
333                match eval.eval_per_point(current, &cw.per_point) {
334                    Ok(p) => p,
335                    Err(e) => {
336                        log::warn!("wave[{}] per_point failed at s={}: {}", i, s, e);
337                        current
338                    }
339                }
340            };
341            current = out;
342            push_vertices(&mut vertices, &mut prev_thick, out);
343        }
344    }
345
346    let count = vertices.len() as u32;
347    let batch = if count > 0 {
348        Some(CustomWaveBatch {
349            start_vertex: 0, // fixed up by caller after stitching
350            vertex_count: count,
351            // Thick lines emit triangle quads CPU-side, so dispatch
352            // them through the same TriangleList pipeline the dots
353            // path uses. The flag name kept for backward compat —
354            // see `CustomWaveBatch::dots` doc.
355            dots: wave.b_use_dots || thick_lines,
356            additive: wave.b_additive,
357        })
358    } else {
359        None
360    };
361
362    WaveOutput { vertices, batch }
363}
364
#[cfg(test)]
mod tests {
    use super::*;
    use onedrop_eval::{MilkEvaluator, PerPointParallelism, analyse_per_point};
    use onedrop_parser::preset::WaveCode;

    /// Compile a handful of per_point equations on a throw-away evaluator
    /// and return the nodes together with the analyser's parallelism
    /// verdict, so tests can drive `compute_one_wave` without a full preset.
    fn compile_per_point(eqs: &[&str]) -> (Vec<onedrop_eval::Node>, PerPointParallelism) {
        let equations: Vec<String> = eqs.iter().map(|eq| (*eq).to_owned()).collect();
        let mut compiler = MilkEvaluator::new();
        let nodes = compiler.compile_batch(&equations).unwrap();
        let verdict = analyse_per_point(&nodes);
        (nodes, verdict)
    }

    /// Wrap per_point nodes in a `CompiledWave` with empty init / per_frame
    /// blocks and no bytecode; tests override fields as needed.
    fn per_point_only(
        per_point: Vec<onedrop_eval::Node>,
        parallelism: PerPointParallelism,
    ) -> CompiledWave {
        CompiledWave {
            init: onedrop_eval::CompiledBlock::empty(),
            per_frame: onedrop_eval::CompiledBlock::empty(),
            per_point,
            per_point_parallelism: parallelism,
            per_point_bytecode: None,
        }
    }

    /// Enabled thin-line, time-domain wave with a fixed base colour and the
    /// requested sample count.
    fn make_wave(samples: i32) -> WaveCode {
        WaveCode {
            index: 0,
            enabled: true,
            samples,
            sep: 0,
            b_spectrum: false,
            b_use_dots: false,
            b_draw_thick: false,
            b_additive: false,
            scaling: 1.0,
            smoothing: 0.0,
            r: 0.25,
            g: 0.5,
            b: 0.75,
            a: 1.0,
            per_frame_equations: Vec::new(),
            per_point_equations: Vec::new(),
            per_frame_init_equations: Vec::new(),
        }
    }

    /// Run wave 0 on a fresh evaluator (no init pass) and return only its
    /// vertices. The same mono buffer is supplied for both channels.
    fn run_wave(
        wave: &WaveCode,
        cw: &CompiledWave,
        audio: &[f32],
        spectrum: &[f32],
    ) -> Vec<CustomWaveVertex> {
        let fresh = MilkEvaluator::new();
        let output = compute_one_wave(0, wave, Some(cw), false, fresh, audio, audio, spectrum);
        output.vertices
    }

    /// A block the analyser deems carry-free must yield the same vertices
    /// whether it runs through the rayon fan-out or the sequential loop.
    #[test]
    fn parallel_and_sequential_match_on_carry_free_block() {
        let (nodes, par) = compile_per_point(&[
            "x = sample",
            "y = 0.5 - value1 * 0.5",
            "r = sample",
            "g = 1 - sample",
            "b = 0.5",
            "a = 1",
        ]);
        assert_eq!(par, PerPointParallelism::Safe);

        // Sequential reference: override the verdict as if carry had been
        // detected. Parallel candidate: keep the analyser's real verdict.
        let cw_seq = per_point_only(nodes.clone(), PerPointParallelism::Sequential);
        let cw_par = per_point_only(nodes, par);

        // 256 samples clears the 64-sample threshold, so the second run
        // really exercises the parallel branch rather than silently falling
        // back to the sequential one.
        let wave = make_wave(256);
        let audio: Vec<f32> = (0..512).map(|i| (i as f32 * 0.01).sin()).collect();

        let seq = run_wave(&wave, &cw_seq, &audio, &[]);
        let par_out = run_wave(&wave, &cw_par, &audio, &[]);

        assert_eq!(seq.len(), par_out.len());
        for (lhs, rhs) in seq.iter().zip(&par_out) {
            assert_eq!(lhs.pos, rhs.pos, "vertex pos divergence");
            assert_eq!(lhs.color, rhs.color, "vertex color divergence");
        }
    }

    /// The analyser is the gatekeeper for the parallel path: a block that
    /// reads `x` before writing it must come back as `Sequential`.
    #[test]
    fn carry_block_is_marked_sequential_by_analyser() {
        let (_, par) = compile_per_point(&["x = x + 0.001", "y = sample"]);
        assert_eq!(par, PerPointParallelism::Sequential);
    }

    /// Wave-level parity check for the bytecode VM: the same per_point block
    /// evaluated via nodes and via bytecode must produce equal vertex
    /// positions and colours.
    #[test]
    fn bytecode_and_evalexpr_paths_match() {
        let (nodes, par) = compile_per_point(&[
            "x = sample",
            "y = 0.5 - value1 * 0.5",
            "r = sin(sample * 6.2831853) * 0.5 + 0.5",
            "g = 1.0 - sample",
            "b = sqrt(abs(value1))",
            "a = 1.0",
        ]);
        assert_eq!(par, PerPointParallelism::Safe);

        // Lower the nodes to bytecode against a separate evaluator's context.
        let mut bc_eval = MilkEvaluator::new();
        let bc = onedrop_eval::CompiledBytecode::try_compile(&nodes, bc_eval.context_mut())
            .expect("block lowers to bytecode");

        // Both variants are pinned to the sequential loop so that only the
        // evaluation backend differs between the two runs.
        let cw_nodes = per_point_only(nodes.clone(), PerPointParallelism::Sequential);
        let mut cw_bc = per_point_only(nodes, PerPointParallelism::Sequential);
        cw_bc.per_point_bytecode = Some(bc);

        let wave = make_wave(128);
        let audio: Vec<f32> = (0..512).map(|i| (i as f32 * 0.013).sin()).collect();
        let from_nodes = run_wave(&wave, &cw_nodes, &audio, &[]);
        let from_bc = run_wave(&wave, &cw_bc, &audio, &[]);

        assert_eq!(from_nodes.len(), from_bc.len());
        for (lhs, rhs) in from_nodes.iter().zip(&from_bc) {
            assert_eq!(lhs.pos, rhs.pos, "pos divergence");
            assert_eq!(lhs.color, rhs.color, "color divergence");
        }
    }
}