diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e6fd184..f90e7980 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,16 @@ between threads, but tapes were _already_ using an `Arc<..>` under the hood. - Changed `Tape::recycle` from returning a `Storage` to returning an `Option<Storage>`, as tapes may now be shared between threads. +- Use Rayon for 2D and 3D rasterization + - The `threads` member of `VoxelRenderConfig` and `ImageRenderConfig` is now + an `Option<ThreadPool>`, which can be `None` (use a single thread), + `Some(ThreadPool::Global)` (use the global Rayon pool), or + `Some(ThreadPool::Custom(..))` (use a user-provided pool) + - This is a step towards WebAssembly multithreading, using + `wasm-bindgen-rayon`. + - `ThreadCount` is moved to `fidget::mesh`, because that's the only place + it's now used + - The plan is to switch to Rayon for meshing as well, eventually # 0.3.3 - `Function` and evaluator types now produce multiple outputs diff --git a/Cargo.lock b/Cargo.lock index b620cbbd..4542cb51 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 [[package]] name = "ab_glyph" @@ -851,6 +851,7 @@ dependencies = [ "num-traits", "ordered-float", "rand", + "rayon", "rhai", "serde", "static_assertions", @@ -870,6 +871,7 @@ dependencies = [ "image", "log", "nalgebra", + "rayon", "workspace-hack", ] @@ -1296,7 +1298,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.4", ] [[package]] @@ -2029,9 +2031,9 @@ checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" [[package]] name = "rayon" -version = "1.8.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa7237101a77a10773db45d62004a272517633fbcc3df19d96455ede1122e051" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" dependencies = [ "either", "rayon-core", @@ -3041,7 +3043,7 @@ dependencies = [ "bytemuck", "clap", "clap_builder", - "crossbeam-utils", + "either", "getrandom", "libc", "log", diff --git a/Cargo.toml b/Cargo.toml index d3914821..611b1a3e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,6 +46,7 @@ notify = "7.0" num-traits = "0.2" ordered-float = "4" rand = "0.8.5" +rayon = "1.10" rhai = { version = "1.17", features = ["sync"] } serde = { version = "1.0", features = ["derive", "rc"] } static_assertions = "1" diff --git a/demos/cli/Cargo.toml b/demos/cli/Cargo.toml index b477bec6..1b266e47 100644 --- a/demos/cli/Cargo.toml +++ b/demos/cli/Cargo.toml @@ -11,6 +11,7 @@ env_logger.workspace = true image.workspace = true log.workspace = true nalgebra.workspace = true +rayon.workspace = true fidget.path = "../../fidget" workspace-hack = { version = "0.1", path = "../../workspace-hack" } diff --git a/demos/cli/src/main.rs b/demos/cli/src/main.rs index 08e6d7c7..3eed9fa8 100644 --- a/demos/cli/src/main.rs +++ b/demos/cli/src/main.rs @@ -73,8 +73,8 @@ struct 
ImageSettings { eval: EvalMode, /// Number of threads to use - #[clap(short, long, default_value_t = NonZeroUsize::new(8).unwrap())] - threads: NonZeroUsize, + #[clap(short, long)] + threads: Option, /// Number of times to render (for benchmarking) #[clap(short = 'N', default_value_t = 1)] @@ -119,10 +119,24 @@ fn run3d( if !isometric { *mat.matrix_mut().get_mut((3, 2)).unwrap() = 0.3; } + let pool: Option; + let threads = match settings.threads { + Some(n) if n.get() == 1 => None, + Some(n) => { + pool = Some( + rayon::ThreadPoolBuilder::new() + .num_threads(n.get()) + .build() + .unwrap(), + ); + pool.as_ref().map(fidget::render::ThreadPool::Custom) + } + None => Some(fidget::render::ThreadPool::Global), + }; let cfg = fidget::render::VoxelRenderConfig { image_size: fidget::render::VoxelSize::from(settings.size), tile_sizes: F::tile_sizes_3d(), - threads: settings.threads.into(), + threads, ..Default::default() }; let shape = shape.apply_transform(mat.into()); @@ -197,10 +211,24 @@ fn run2d( .flat_map(|i| i.into_iter()) .collect() } else { + let pool: Option; + let threads = match settings.threads { + Some(n) if n.get() == 1 => None, + Some(n) => { + pool = Some( + rayon::ThreadPoolBuilder::new() + .num_threads(n.get()) + .build() + .unwrap(), + ); + pool.as_ref().map(fidget::render::ThreadPool::Custom) + } + None => Some(fidget::render::ThreadPool::Global), + }; let cfg = fidget::render::ImageRenderConfig { image_size: fidget::render::ImageSize::from(settings.size), tile_sizes: F::tile_sizes_2d(), - threads: settings.threads.into(), + threads, ..Default::default() }; if sdf { diff --git a/fidget/Cargo.toml b/fidget/Cargo.toml index 6a32313c..baf42a24 100644 --- a/fidget/Cargo.toml +++ b/fidget/Cargo.toml @@ -18,6 +18,7 @@ nalgebra.workspace = true num-traits.workspace = true ordered-float.workspace = true rand.workspace = true +rayon.workspace = true serde.workspace = true static_assertions.workspace = true thiserror.workspace = true diff --git 
a/fidget/benches/mesh.rs b/fidget/benches/mesh.rs index 45eb63dc..500b4d69 100644 --- a/fidget/benches/mesh.rs +++ b/fidget/benches/mesh.rs @@ -1,7 +1,7 @@ use criterion::{ black_box, criterion_group, criterion_main, BenchmarkId, Criterion, }; -use fidget::render::ThreadCount; +use fidget::mesh::ThreadCount; const COLONNADE: &str = include_str!("../../models/colonnade.vm"); diff --git a/fidget/benches/render.rs b/fidget/benches/render.rs index 48f51bc5..5acd4cbd 100644 --- a/fidget/benches/render.rs +++ b/fidget/benches/render.rs @@ -1,7 +1,7 @@ use criterion::{ black_box, criterion_group, criterion_main, BenchmarkId, Criterion, }; -use fidget::render::{ImageSize, RenderHints, ThreadCount}; +use fidget::render::{ImageSize, RenderHints, ThreadPool}; const PROSPERO: &str = include_str!("../../models/prospero.vm"); @@ -53,16 +53,28 @@ pub fn prospero_thread_sweep(c: &mut Criterion) { let mut group = c.benchmark_group("speed vs threads (prospero, 2d) (1024 x 1024)"); - for threads in std::iter::once(ThreadCount::One).chain( - [1, 2, 4, 8, 16].map(|i| ThreadCount::Many(i.try_into().unwrap())), - ) { + let pools = [1, 2, 4, 8, 16].map(|i| { + rayon::ThreadPoolBuilder::new() + .num_threads(i) + .build() + .unwrap() + }); + for threads in [None, Some(ThreadPool::Global)] + .into_iter() + .chain(pools.iter().map(|p| Some(ThreadPool::Custom(p)))) + { + let name = match &threads { + None => "-".to_string(), + Some(ThreadPool::Custom(i)) => i.current_num_threads().to_string(), + Some(ThreadPool::Global) => "N".to_string(), + }; let cfg = &fidget::render::ImageRenderConfig { image_size: ImageSize::from(1024), tile_sizes: fidget::vm::VmFunction::tile_sizes_2d(), - threads, + threads: threads.clone(), ..Default::default() }; - group.bench_function(BenchmarkId::new("vm", threads), move |b| { + group.bench_function(BenchmarkId::new("vm", &name), move |b| { b.iter(|| { let tape = shape_vm.clone(); black_box(cfg.run::<_, fidget::render::BitRenderMode>(tape)) @@ -76,7 +88,7 @@ pub fn 
prospero_thread_sweep(c: &mut Criterion) { threads, ..Default::default() }; - group.bench_function(BenchmarkId::new("jit", threads), move |b| { + group.bench_function(BenchmarkId::new("jit", &name), move |b| { b.iter(|| { let tape = shape_jit.clone(); black_box(cfg.run::<_, fidget::render::BitRenderMode>(tape)) diff --git a/fidget/src/core/eval/mod.rs b/fidget/src/core/eval/mod.rs index f8d05f2a..945e3e8a 100644 --- a/fidget/src/core/eval/mod.rs +++ b/fidget/src/core/eval/mod.rs @@ -83,7 +83,7 @@ pub trait Function: Send + Sync + Clone { /// This type must implement [`Eq`] so that traces can be compared; calling /// [`Function::simplify`] with traces that compare equal should produce an /// identical result and may be cached. - type Trace: Clone + Eq + Send + Trace; + type Trace: Clone + Eq + Send + Sync + Trace; /// Associated type for storage used by the function itself type Storage: Default + Send; diff --git a/fidget/src/core/vm/mod.rs b/fidget/src/core/vm/mod.rs index dc4715ee..32d5e2d1 100644 --- a/fidget/src/core/vm/mod.rs +++ b/fidget/src/core/vm/mod.rs @@ -220,11 +220,11 @@ impl Function for GenericVmFunction { impl RenderHints for GenericVmFunction { fn tile_sizes_3d() -> TileSizes { - TileSizes::new(&[256, 128, 64, 32, 16, 8]).unwrap() + TileSizes::new(&[128, 64, 32, 16, 8]).unwrap() } fn tile_sizes_2d() -> TileSizes { - TileSizes::new(&[256, 128, 64, 32, 16, 8]).unwrap() + TileSizes::new(&[128, 32, 8]).unwrap() } } diff --git a/fidget/src/mesh/mod.rs b/fidget/src/mesh/mod.rs index 456628e8..67dc5cec 100644 --- a/fidget/src/mesh/mod.rs +++ b/fidget/src/mesh/mod.rs @@ -48,7 +48,71 @@ mod octree; mod output; mod qef; -use crate::render::{ThreadCount, View3}; +use crate::render::View3; + +/// Number of threads to use during evaluation +/// +/// In a WebAssembly build, only the [`ThreadCount::One`] variant is available. 
+#[derive(Copy, Clone, Debug)] +pub enum ThreadCount { + /// Perform all evaluation in the main thread, not spawning any workers + One, + + /// Spawn some number of worker threads for evaluation + /// + /// This can be set to `1`, in which case a single worker thread will be + /// spawned; this is different from doing work in the main thread, but not + /// particularly useful! + #[cfg(not(target_arch = "wasm32"))] + Many(std::num::NonZeroUsize), +} + +#[cfg(not(target_arch = "wasm32"))] +impl From for ThreadCount { + fn from(v: std::num::NonZeroUsize) -> Self { + match v.get() { + 0 => unreachable!(), + 1 => ThreadCount::One, + _ => ThreadCount::Many(v), + } + } +} + +/// Single-threaded mode is shown as `-`; otherwise, an integer +impl std::fmt::Display for ThreadCount { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ThreadCount::One => write!(f, "-"), + #[cfg(not(target_arch = "wasm32"))] + ThreadCount::Many(n) => write!(f, "{n}"), + } + } +} + +impl ThreadCount { + /// Gets the thread count + /// + /// Returns `None` if we are required to be single-threaded + pub fn get(&self) -> Option { + match self { + ThreadCount::One => None, + #[cfg(not(target_arch = "wasm32"))] + ThreadCount::Many(v) => Some(v.get()), + } + } +} + +impl Default for ThreadCount { + #[cfg(target_arch = "wasm32")] + fn default() -> Self { + Self::One + } + + #[cfg(not(target_arch = "wasm32"))] + fn default() -> Self { + Self::Many(std::num::NonZeroUsize::new(8).unwrap()) + } +} #[cfg(not(target_arch = "wasm32"))] mod mt; diff --git a/fidget/src/mesh/octree.rs b/fidget/src/mesh/octree.rs index d74cce3b..ce1640fc 100644 --- a/fidget/src/mesh/octree.rs +++ b/fidget/src/mesh/octree.rs @@ -8,11 +8,11 @@ use super::{ gen::CELL_TO_VERT_TO_EDGES, qef::QuadraticErrorSolver, types::{Axis, Corner, Edge}, - Mesh, Settings, + Mesh, Settings, ThreadCount, }; use crate::{ eval::{BulkEvaluator, Function, TracingEvaluator}, - render::{RenderHints, ThreadCount}, + 
render::RenderHints, shape::{Shape, ShapeBulkEval, ShapeTape, ShapeTracingEval, ShapeVars}, types::Grad, }; @@ -1218,7 +1218,7 @@ mod test { use crate::{ context::Tree, mesh::types::{Edge, X, Y, Z}, - render::{ThreadCount, View3}, + render::View3, shape::EzShape, var::Var, vm::{VmFunction, VmShape}, diff --git a/fidget/src/render/config.rs b/fidget/src/render/config.rs index 43c52e64..4e29a3bd 100644 --- a/fidget/src/render/config.rs +++ b/fidget/src/render/config.rs @@ -4,74 +4,21 @@ use crate::{ shape::{Shape, ShapeVars}, }; use nalgebra::{Const, Matrix3, Matrix4, OPoint, Point2, Vector2}; -use std::sync::atomic::{AtomicUsize, Ordering}; -/// Number of threads to use during evaluation +/// Thread pool to use for multithreaded rendering /// -/// In a WebAssembly build, only the [`ThreadCount::One`] variant is available. -#[derive(Copy, Clone, Debug)] -pub enum ThreadCount { - /// Perform all evaluation in the main thread, not spawning any workers - One, - - /// Spawn some number of worker threads for evaluation - /// - /// This can be set to `1`, in which case a single worker thread will be - /// spawned; this is different from doing work in the main thread, but not - /// particularly useful! 
- #[cfg(not(target_arch = "wasm32"))] - Many(std::num::NonZeroUsize), -} - -#[cfg(not(target_arch = "wasm32"))] -impl From for ThreadCount { - fn from(v: std::num::NonZeroUsize) -> Self { - match v.get() { - 0 => unreachable!(), - 1 => ThreadCount::One, - _ => ThreadCount::Many(v), - } - } -} - -/// Single-threaded mode is shown as `-`; otherwise, an integer -impl std::fmt::Display for ThreadCount { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - ThreadCount::One => write!(f, "-"), - #[cfg(not(target_arch = "wasm32"))] - ThreadCount::Many(n) => write!(f, "{n}"), - } - } -} - -impl ThreadCount { - /// Gets the thread count - /// - /// Returns `None` if we are required to be single-threaded - pub fn get(&self) -> Option { - match self { - ThreadCount::One => None, - #[cfg(not(target_arch = "wasm32"))] - ThreadCount::Many(v) => Some(v.get()), - } - } -} - -impl Default for ThreadCount { - #[cfg(target_arch = "wasm32")] - fn default() -> Self { - Self::One - } - - #[cfg(not(target_arch = "wasm32"))] - fn default() -> Self { - Self::Many(std::num::NonZeroUsize::new(8).unwrap()) - } +/// Most users will use the global Rayon pool, but it's possible to provide your +/// own as well. +#[derive(Clone)] +pub enum ThreadPool<'a> { + /// User-provided pool + Custom(&'a rayon::ThreadPool), + /// Global Rayon pool + Global, } /// Settings for 2D rendering -pub struct ImageRenderConfig { +pub struct ImageRenderConfig<'a> { /// Render size pub image_size: ImageSize, @@ -85,22 +32,25 @@ pub struct ImageRenderConfig { /// to select this based on evaluator type. pub tile_sizes: TileSizes, - /// Number of worker threads - pub threads: ThreadCount, + /// Thread pool to use for rendering + /// + /// If this is `None`, then rendering is done in a single thread; otherwise, + /// the provided pool is used. 
+ pub threads: Option>, } -impl Default for ImageRenderConfig { +impl Default for ImageRenderConfig<'_> { fn default() -> Self { Self { image_size: ImageSize::from(512), tile_sizes: TileSizes::new(&[128, 32, 8]).unwrap(), view: View2::default(), - threads: ThreadCount::default(), + threads: Some(ThreadPool::Global), } } } -impl ImageRenderConfig { +impl ImageRenderConfig<'_> { /// Render a shape in 2D using this configuration pub fn run( &self, @@ -125,7 +75,7 @@ impl ImageRenderConfig { } /// Settings for 3D rendering -pub struct VoxelRenderConfig { +pub struct VoxelRenderConfig<'a> { /// Render size /// /// The resulting image will have the given width and height; depth sets the @@ -143,23 +93,26 @@ pub struct VoxelRenderConfig { /// to select this based on evaluator type. pub tile_sizes: TileSizes, - /// Number of worker threads - pub threads: ThreadCount, + /// Thread pool to use for rendering + /// + /// If this is `None`, then rendering is done in a single thread; otherwise, + /// the provided pool is used. + pub threads: Option>, } -impl Default for VoxelRenderConfig { +impl Default for VoxelRenderConfig<'_> { fn default() -> Self { Self { image_size: VoxelSize::from(512), tile_sizes: TileSizes::new(&[128, 64, 32, 16, 8]).unwrap(), view: View3::default(), - threads: ThreadCount::default(), + threads: Some(ThreadPool::Global), } } } -impl VoxelRenderConfig { +impl VoxelRenderConfig<'_> { /// Render a shape in 3D using this configuration /// /// Returns a tuple of heightmap, RGB image. 
@@ -183,11 +136,6 @@ impl VoxelRenderConfig { pub fn mat(&self) -> Matrix4 { self.view.world_to_model() * self.image_size.screen_to_world() } - - /// Returns the data offset of a row within a subtile - pub(crate) fn tile_row_offset(&self, tile: Tile<3>, row: usize) -> usize { - self.tile_sizes.pixel_offset(tile.add(Vector2::new(0, row))) - } } //////////////////////////////////////////////////////////////////////////////// @@ -214,25 +162,6 @@ impl Tile { } } -/// Worker queue -pub(crate) struct Queue { - index: AtomicUsize, - tiles: Vec>, -} - -impl Queue { - pub fn new(tiles: Vec>) -> Self { - Self { - index: AtomicUsize::new(0), - tiles, - } - } - pub fn next(&self) -> Option> { - let index = self.index.fetch_add(1, Ordering::Relaxed); - self.tiles.get(index).cloned() - } -} - //////////////////////////////////////////////////////////////////////////////// #[cfg(test)] diff --git a/fidget/src/render/mod.rs b/fidget/src/render/mod.rs index 15b76efb..33eb7342 100644 --- a/fidget/src/render/mod.rs +++ b/fidget/src/render/mod.rs @@ -14,7 +14,7 @@ mod render2d; mod render3d; mod view; -pub use config::{ImageRenderConfig, ThreadCount, VoxelRenderConfig}; +pub use config::{ImageRenderConfig, ThreadPool, VoxelRenderConfig}; pub use region::{ImageSize, RegionSize, VoxelSize}; pub use view::{View2, View3}; diff --git a/fidget/src/render/render2d.rs b/fidget/src/render/render2d.rs index 04f33086..56344391 100644 --- a/fidget/src/render/render2d.rs +++ b/fidget/src/render/render2d.rs @@ -2,12 +2,15 @@ use super::RenderHandle; use crate::{ eval::Function, - render::config::{ImageRenderConfig, Queue, Tile}, - render::ThreadCount, + render::{ + config::{ImageRenderConfig, ThreadPool, Tile}, + TileSizes, + }, shape::{Shape, ShapeBulkEval, ShapeTracingEval, ShapeVars}, types::Interval, }; use nalgebra::{Point2, Vector2}; +use rayon::prelude::*; //////////////////////////////////////////////////////////////////////////////// @@ -203,7 +206,7 @@ impl Scratch { /// Per-thread 
worker struct Worker<'a, F: Function, M: RenderMode> { - config: &'a ImageRenderConfig, + tile_sizes: &'a TileSizes, scratch: Scratch, eval_float_slice: ShapeBulkEval, @@ -225,6 +228,17 @@ struct Worker<'a, F: Function, M: RenderMode> { } impl Worker<'_, F, M> { + fn render_tile( + &mut self, + shape: &mut RenderHandle, + vars: &ShapeVars, + tile: Tile<2>, + ) -> Vec { + self.image = vec![M::Output::default(); self.tile_sizes[0].pow(2)]; + self.render_tile_recurse(shape, vars, 0, tile); + std::mem::take(&mut self.image) + } + fn render_tile_recurse( &mut self, shape: &mut RenderHandle, @@ -232,7 +246,7 @@ impl Worker<'_, F, M> { depth: usize, tile: Tile<2>, ) { - let tile_size = self.config.tile_sizes[depth]; + let tile_size = self.tile_sizes[depth]; // Find the interval bounds of the region, in screen coordinates let base = Point2::from(tile.corner).cast::(); @@ -250,7 +264,6 @@ impl Worker<'_, F, M> { IntervalAction::Fill(fill) => { for y in 0..tile_size { let start = self - .config .tile_sizes .pixel_offset(tile.add(Vector2::new(0, y))); self.image[start..][..tile_size].fill(fill); @@ -274,7 +287,6 @@ impl Worker<'_, F, M> { let v1 = vs[2] * (1.0 - y_frac) + vs[3] * y_frac; let mut i = self - .config .tile_sizes .pixel_offset(tile.add(Vector2::new(0, y))); for x in 0..tile_size { @@ -303,7 +315,7 @@ impl Worker<'_, F, M> { shape }; - if let Some(next_tile_size) = self.config.tile_sizes.get(depth + 1) { + if let Some(next_tile_size) = self.tile_sizes.get(depth + 1) { let n = tile_size / next_tile_size; for j in 0..n { for i in 0..n { @@ -351,10 +363,7 @@ impl Worker<'_, F, M> { let mut index = 0; for j in 0..tile_size { - let o = self - .config - .tile_sizes - .pixel_offset(tile.add(Vector2::new(0, j))); + let o = self.tile_sizes.pixel_offset(tile.add(Vector2::new(0, j))); for i in 0..tile_size { self.image[o + i] = M::pixel(out[index]); index += 1; @@ -365,37 +374,6 @@ impl Worker<'_, F, M> { 
//////////////////////////////////////////////////////////////////////////////// -fn worker( - mut shape: RenderHandle, - vars: &ShapeVars, - queue: &Queue<2>, - config: &ImageRenderConfig, -) -> Vec<(Tile<2>, Vec)> { - let mut out = vec![]; - let scratch = Scratch::new(config.tile_sizes.last().pow(2)); - - let mut w: Worker = Worker { - scratch, - image: vec![], - config, - eval_float_slice: Default::default(), - eval_interval: Default::default(), - tape_storage: vec![], - shape_storage: vec![], - workspace: Default::default(), - }; - - while let Some(tile) = queue.next() { - w.image = vec![M::Output::default(); config.tile_sizes[0].pow(2)]; - w.render_tile_recurse(&mut shape, vars, 0, tile); - let pixels = std::mem::take(&mut w.image); - out.push((tile, pixels)) - } - out -} - -//////////////////////////////////////////////////////////////////////////////// - /// Renders the given tape into a 2D image at Z = 0 according to the provided /// configuration. /// @@ -437,30 +415,52 @@ fn render_inner( } } - let queue = Queue::new(tiles); - let mut rh = RenderHandle::new(shape); let _ = rh.i_tape(&mut vec![]); // populate i_tape before cloning + let init = || { + let scratch = Scratch::new(config.tile_sizes.last().pow(2)); + let rh = rh.clone(); + + let worker = Worker:: { + scratch, + image: vec![], + tile_sizes: &config.tile_sizes, + eval_float_slice: Default::default(), + eval_interval: Default::default(), + tape_storage: vec![], + shape_storage: vec![], + workspace: Default::default(), + }; + (worker, rh) + }; - let out: Vec<_> = match config.threads { - ThreadCount::One => worker::(rh, vars, &queue, config) - .into_iter() - .collect(), - - #[cfg(not(target_arch = "wasm32"))] - ThreadCount::Many(v) => std::thread::scope(|s| { - let mut handles = vec![]; - for _ in 0..v.get() { - let rh = rh.clone(); - handles - .push(s.spawn(|| worker::(rh, vars, &queue, config))); - } - let mut out = vec![]; - for h in handles { - out.extend(h.join().unwrap().into_iter()); + let 
out: Vec<_> = match &config.threads { + None => { + let (mut worker, mut rh) = init(); + tiles + .into_iter() + .map(|tile| { + let pixels = worker.render_tile(&mut rh, vars, tile); + (tile, pixels) + }) + .collect() + } + + Some(p) => { + let run = || { + tiles + .into_par_iter() + .map_init(init, |(w, rh), tile| { + let pixels = w.render_tile(rh, vars, tile); + (tile, pixels) + }) + .collect() + }; + match p { + ThreadPool::Custom(p) => p.install(run), + ThreadPool::Global => run(), } - out - }), + } }; let mut image = vec![M::Output::default(); width * height]; diff --git a/fidget/src/render/render3d.rs b/fidget/src/render/render3d.rs index d9029e9e..ef79c0c4 100644 --- a/fidget/src/render/render3d.rs +++ b/fidget/src/render/render3d.rs @@ -2,13 +2,16 @@ use super::RenderHandle; use crate::{ eval::Function, - render::config::{Queue, ThreadCount, Tile, VoxelRenderConfig}, + render::{ + config::{ThreadPool, Tile, VoxelRenderConfig}, + TileSizes, VoxelSize, + }, shape::{Shape, ShapeBulkEval, ShapeTracingEval, ShapeVars}, types::{Grad, Interval}, }; -use nalgebra::{Point3, Vector2, Vector3}; -use std::collections::HashMap; +use nalgebra::{Point2, Point3, Vector2, Vector3}; +use rayon::prelude::*; //////////////////////////////////////////////////////////////////////////////// @@ -47,7 +50,8 @@ impl Scratch { //////////////////////////////////////////////////////////////////////////////// struct Worker<'a, F: Function> { - config: &'a VoxelRenderConfig, + tile_sizes: &'a TileSizes, + image_size: VoxelSize, /// Reusable workspace for evaluation, to minimize allocation scratch: Scratch, @@ -66,21 +70,54 @@ struct Worker<'a, F: Function> { } impl Worker<'_, F> { + fn render_tile( + &mut self, + shape: &mut RenderHandle, + vars: &ShapeVars, + tile: Tile<2>, + ) -> (Vec, Vec<[u8; 3]>) { + // Prepare local tile data to fill out + self.depth = vec![0; self.tile_sizes[0].pow(2)]; + self.color = vec![[0u8; 3]; self.tile_sizes[0].pow(2)]; + let root_tile_size = 
self.tile_sizes[0]; + for k in (0..self.image_size[2].div_ceil(root_tile_size as u32)).rev() { + let tile = Tile::new(Point3::new( + tile.corner.x, + tile.corner.y, + k as usize * root_tile_size, + )); + if !self.render_tile_recurse(shape, vars, 0, tile) { + break; + } + } + let depth = std::mem::take(&mut self.depth); + let color = std::mem::take(&mut self.color); + (depth, color) + } + + /// Returns the data offset of a row within a subtile + pub(crate) fn tile_row_offset(&self, tile: Tile<3>, row: usize) -> usize { + self.tile_sizes.pixel_offset(tile.add(Vector2::new(0, row))) + } + + /// Render a single tile + /// + /// Returns `true` if we should keep rendering, `false` otherwise fn render_tile_recurse( &mut self, shape: &mut RenderHandle, vars: &ShapeVars, depth: usize, tile: Tile<3>, - ) { + ) -> bool { // Early exit if every single pixel is filled - let tile_size = self.config.tile_sizes[depth]; + let tile_size = self.tile_sizes[depth]; let fill_z = (tile.corner[2] + tile_size + 1).try_into().unwrap(); if (0..tile_size).all(|y| { - let i = self.config.tile_row_offset(tile, y); + let i = self.tile_row_offset(tile, y); (0..tile_size).all(|x| self.depth[i + x] >= fill_z) }) { - return; + return false; } let base = Point3::from(tile.corner).cast::(); @@ -97,14 +134,14 @@ impl Worker<'_, F> { // `data_interval` to scratch memory for reuse. if i.upper() < 0.0 { for y in 0..tile_size { - let i = self.config.tile_row_offset(tile, y); + let i = self.tile_row_offset(tile, y); for x in 0..tile_size { self.depth[i + x] = self.depth[i + x].max(fill_z); } } - return; + return false; // completely full, stop rendering } else if i.lower() > 0.0 { - return; + return true; // complete empty, keep going } // Calculate a simplified tape based on the trace @@ -120,7 +157,7 @@ impl Worker<'_, F> { }; // Recurse! 
- if let Some(next_tile_size) = self.config.tile_sizes.get(depth + 1) { + if let Some(next_tile_size) = self.tile_sizes.get(depth + 1) { let n = tile_size / next_tile_size; for j in 0..n { @@ -142,6 +179,7 @@ impl Worker<'_, F> { self.render_tile_pixels(sub_tape, vars, tile_size, tile); }; // TODO recycle something here? + true // keep going } fn render_tile_pixels( @@ -161,10 +199,7 @@ impl Worker<'_, F> { let i = xy % tile_size; let j = xy / tile_size; - let o = self - .config - .tile_sizes - .pixel_offset(tile.add(Vector2::new(i, j))); + let o = self.tile_sizes.pixel_offset(tile.add(Vector2::new(i, j))); // Skip pixels which are behind the image let zmax = (tile.corner[2] + tile_size).try_into().unwrap(); @@ -230,10 +265,7 @@ impl Worker<'_, F> { let k = tile_size - 1 - k; // Set the depth of the pixel - let o = self - .config - .tile_sizes - .pixel_offset(tile.add(Vector2::new(i, j))); + let o = self.tile_sizes.pixel_offset(tile.add(Vector2::new(i, j))); let z = (tile.corner[2] + k + 1).try_into().unwrap(); assert!(self.depth[o] < z); self.depth[o] = z; @@ -277,86 +309,6 @@ impl Worker<'_, F> { //////////////////////////////////////////////////////////////////////////////// -#[derive(Default)] -struct Image { - depth: Vec, - color: Vec<[u8; 3]>, -} - -impl Image { - fn new(size: usize) -> Self { - Self { - depth: vec![0; size.pow(2)], - color: vec![[0; 3]; size.pow(2)], - } - } -} - -//////////////////////////////////////////////////////////////////////////////// - -fn worker( - mut shape: RenderHandle, - vars: &ShapeVars, - queues: &[Queue<3>], - mut index: usize, - config: &VoxelRenderConfig, -) -> HashMap<[usize; 2], Image> { - let mut out = HashMap::new(); - - // Calculate maximum evaluation buffer size - let buf_size = config.tile_sizes.last(); - let scratch = Scratch::new(buf_size); - let mut w: Worker = Worker { - scratch, - depth: vec![], - color: vec![], - config, - - eval_float_slice: Default::default(), - eval_interval: Default::default(), - 
eval_grad_slice: Default::default(), - - tape_storage: vec![], - shape_storage: vec![], - workspace: Default::default(), - }; - - // Every thread has a set of tiles assigned to it, which are in Z-sorted - // order (to encourage culling). Once the thread finishes its tiles, it - // begins stealing from other thread queues; if every single thread queue is - // empty, then we return. - let start = index; - loop { - while let Some(tile) = queues[index].next() { - let image = out - .remove(&[tile.corner[0], tile.corner[1]]) - .unwrap_or_else(|| Image::new(config.tile_sizes[0])); - - // Prepare to render, allocating space for a tile - w.depth = image.depth; - w.color = image.color; - w.render_tile_recurse(&mut shape, vars, 0, tile); - - // Steal the tile, replacing it with an empty vec - let depth = std::mem::take(&mut w.depth); - let color = std::mem::take(&mut w.color); - out.insert( - [tile.corner[0], tile.corner[1]], - Image { depth, color }, - ); - } - // Move on to the next thread's queue - index = (index + 1) % queues.len(); - if index == start { - break; - } - } - - out -} - -//////////////////////////////////////////////////////////////////////////////// - /// Renders the given tape into a 3D image according to the provided /// configuration. 
/// @@ -376,70 +328,83 @@ pub fn render( let t = config.tile_sizes[0]; let width = config.image_size[0] as usize; let height = config.image_size[1] as usize; - let depth = config.image_size[2] as usize; for i in 0..width.div_ceil(t) { for j in 0..height.div_ceil(t) { - for k in (0..depth.div_ceil(t)).rev() { - tiles.push(Tile::new(Point3::new( - i * config.tile_sizes[0], - j * config.tile_sizes[0], - k * config.tile_sizes[0], - ))); - } + tiles.push(Tile::new(Point2::new( + i * config.tile_sizes[0], + j * config.tile_sizes[0], + ))); } } - let threads = config.threads.get().unwrap_or(1); - let tiles_per_thread = (tiles.len() / threads).max(1); - let mut tile_queues = vec![]; - for ts in tiles.chunks(tiles_per_thread) { - tile_queues.push(Queue::new(ts.to_vec())); - } - tile_queues.resize_with(threads, || Queue::new(vec![])); - let mut rh = RenderHandle::new(shape); let _ = rh.i_tape(&mut vec![]); // populate i_tape before cloning + let init = || { + let rh = rh.clone(); + let buf_size = config.tile_sizes.last(); + let scratch = Scratch::new(buf_size); + let worker: Worker = Worker { + scratch, + depth: vec![], + color: vec![], + tile_sizes: &config.tile_sizes, + image_size: config.image_size, + + eval_float_slice: Default::default(), + eval_interval: Default::default(), + eval_grad_slice: Default::default(), + + tape_storage: vec![], + shape_storage: vec![], + workspace: Default::default(), + }; + (worker, rh) + }; + // Special-case for single-threaded operation, to give simpler backtraces - let out: Vec<_> = match config.threads { - ThreadCount::One => { - worker::(rh, vars, tile_queues.as_slice(), 0, config) + let out: Vec<_> = match &config.threads { + None => { + let (mut worker, mut rh) = init(); + tiles .into_iter() + .map(|tile| { + let pixels = worker.render_tile(&mut rh, vars, tile); + (tile, pixels) + }) .collect() } - #[cfg(not(target_arch = "wasm32"))] - ThreadCount::Many(threads) => std::thread::scope(|s| { - let config = &config; - let mut handles = 
vec![]; - let queues = tile_queues.as_slice(); - for i in 0..threads.get() { - let rh = rh.clone(); - handles.push( - s.spawn(move || worker::(rh, vars, queues, i, config)), - ); - } - let mut out = vec![]; - for h in handles { - out.extend(h.join().unwrap().into_iter()); + Some(p) => { + let run = || { + tiles + .into_par_iter() + .map_init(init, |(w, rh), tile| { + let pixels = w.render_tile(rh, vars, tile); + (tile, pixels) + }) + .collect() + }; + match p { + ThreadPool::Custom(p) => p.install(run), + ThreadPool::Global => run(), } - out - }), + } }; let mut image_depth = vec![0; width * height]; let mut image_color = vec![[0; 3]; width * height]; - for (tile, patch) in out.iter() { + for (tile, (depth, color)) in out { let mut index = 0; for j in 0..config.tile_sizes[0] { - let y = j + tile[1]; + let y = j + tile.corner.y; for i in 0..config.tile_sizes[0] { - let x = i + tile[0]; + let x = i + tile.corner.x; if x < width && y < height { let o = y * width + x; - if patch.depth[index] >= image_depth[o] { - image_color[o] = patch.color[index]; - image_depth[o] = patch.depth[index]; + if depth[index] >= image_depth[o] { + image_color[o] = color[index]; + image_depth[o] = depth[index]; } } index += 1; diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 97413789..5b3628d8 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -20,7 +20,7 @@ approx = { version = "0.5" } bytemuck = { version = "1", default-features = false, features = ["derive", "extern_crate_alloc"] } clap = { version = "4", features = ["derive"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "suggestions", "usage"] } -crossbeam-utils = { version = "0.8" } +either = { version = "1", default-features = false, features = ["use_std"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } num-traits = { version = "0.2", features = ["i128"] } once_cell = { version = "1" }