diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e6fd184..f90e7980 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -41,6 +41,16 @@
   between threads, but tapes were _already_ using an `Arc<..>` under the hood.
 - Changed `Tape::recycle` from returning a `Storage` to returning an
   `Option<Storage>`, as tapes may now be shared between threads.
+- Use Rayon for 2D and 3D rasterization
+    - The `threads` member of `VoxelRenderConfig` and `ImageRenderConfig` is now
+      an `Option<ThreadPool>`, which can be `None` (use a single thread),
+      `Some(ThreadPool::Global)` (use the global Rayon pool), or
+      `Some(ThreadPool::Custom(..))` (use a user-provided pool)
+    - This is a step towards WebAssembly multithreading, using
+      `wasm-bindgen-rayon`.
+    - `ThreadCount` is moved to `fidget::mesh`, because that's the only place
+      it's now used
+        - The plan is to switch to Rayon for meshing as well, eventually
 
 # 0.3.3
 - `Function` and evaluator types now produce multiple outputs
diff --git a/Cargo.lock b/Cargo.lock
index b620cbbd..4542cb51 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1,6 +1,6 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4
 
 [[package]]
 name = "ab_glyph"
@@ -851,6 +851,7 @@ dependencies = [
  "num-traits",
  "ordered-float",
  "rand",
+ "rayon",
  "rhai",
  "serde",
  "static_assertions",
@@ -870,6 +871,7 @@ dependencies = [
  "image",
  "log",
  "nalgebra",
+ "rayon",
  "workspace-hack",
 ]
 
@@ -1296,7 +1298,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
 dependencies = [
  "cfg-if",
- "windows-targets 0.48.5",
+ "windows-targets 0.52.4",
 ]
 
 [[package]]
@@ -2029,9 +2031,9 @@ checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
 
 [[package]]
 name = "rayon"
-version = "1.8.1"
+version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa7237101a77a10773db45d62004a272517633fbcc3df19d96455ede1122e051"
+checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
 dependencies = [
  "either",
  "rayon-core",
@@ -3041,7 +3043,7 @@ dependencies = [
  "bytemuck",
  "clap",
  "clap_builder",
- "crossbeam-utils",
+ "either",
  "getrandom",
  "libc",
  "log",
diff --git a/Cargo.toml b/Cargo.toml
index d3914821..611b1a3e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -46,6 +46,7 @@ notify = "7.0"
 num-traits = "0.2"
 ordered-float = "4"
 rand = "0.8.5"
+rayon = "1.10"
 rhai = { version = "1.17", features = ["sync"] }
 serde = { version = "1.0", features = ["derive", "rc"] }
 static_assertions = "1"
diff --git a/demos/cli/Cargo.toml b/demos/cli/Cargo.toml
index b477bec6..1b266e47 100644
--- a/demos/cli/Cargo.toml
+++ b/demos/cli/Cargo.toml
@@ -11,6 +11,7 @@ env_logger.workspace = true
 image.workspace = true
 log.workspace = true
 nalgebra.workspace = true
+rayon.workspace = true
 
 fidget.path = "../../fidget"
 workspace-hack = { version = "0.1", path = "../../workspace-hack" }
diff --git a/demos/cli/src/main.rs b/demos/cli/src/main.rs
index 08e6d7c7..3eed9fa8 100644
--- a/demos/cli/src/main.rs
+++ b/demos/cli/src/main.rs
@@ -73,8 +73,8 @@ struct ImageSettings {
     eval: EvalMode,
 
     /// Number of threads to use
-    #[clap(short, long, default_value_t = NonZeroUsize::new(8).unwrap())]
-    threads: NonZeroUsize,
+    #[clap(short, long)]
+    threads: Option<NonZeroUsize>,
 
     /// Number of times to render (for benchmarking)
     #[clap(short = 'N', default_value_t = 1)]
@@ -119,10 +119,24 @@ fn run3d<F: fidget::eval::Function + fidget::render::RenderHints>(
     if !isometric {
         *mat.matrix_mut().get_mut((3, 2)).unwrap() = 0.3;
     }
+    let pool: Option<rayon::ThreadPool>;
+    let threads = match settings.threads {
+        Some(n) if n.get() == 1 => None,
+        Some(n) => {
+            pool = Some(
+                rayon::ThreadPoolBuilder::new()
+                    .num_threads(n.get())
+                    .build()
+                    .unwrap(),
+            );
+            pool.as_ref().map(fidget::render::ThreadPool::Custom)
+        }
+        None => Some(fidget::render::ThreadPool::Global),
+    };
     let cfg = fidget::render::VoxelRenderConfig {
         image_size: fidget::render::VoxelSize::from(settings.size),
         tile_sizes: F::tile_sizes_3d(),
-        threads: settings.threads.into(),
+        threads,
         ..Default::default()
     };
     let shape = shape.apply_transform(mat.into());
@@ -197,10 +211,24 @@ fn run2d<F: fidget::eval::Function + fidget::render::RenderHints>(
             .flat_map(|i| i.into_iter())
             .collect()
     } else {
+        let pool: Option<rayon::ThreadPool>;
+        let threads = match settings.threads {
+            Some(n) if n.get() == 1 => None,
+            Some(n) => {
+                pool = Some(
+                    rayon::ThreadPoolBuilder::new()
+                        .num_threads(n.get())
+                        .build()
+                        .unwrap(),
+                );
+                pool.as_ref().map(fidget::render::ThreadPool::Custom)
+            }
+            None => Some(fidget::render::ThreadPool::Global),
+        };
         let cfg = fidget::render::ImageRenderConfig {
             image_size: fidget::render::ImageSize::from(settings.size),
             tile_sizes: F::tile_sizes_2d(),
-            threads: settings.threads.into(),
+            threads,
             ..Default::default()
         };
         if sdf {
diff --git a/fidget/Cargo.toml b/fidget/Cargo.toml
index 6a32313c..baf42a24 100644
--- a/fidget/Cargo.toml
+++ b/fidget/Cargo.toml
@@ -18,6 +18,7 @@ nalgebra.workspace = true
 num-traits.workspace = true
 ordered-float.workspace = true
 rand.workspace = true
+rayon.workspace = true
 serde.workspace = true
 static_assertions.workspace = true
 thiserror.workspace = true
diff --git a/fidget/benches/mesh.rs b/fidget/benches/mesh.rs
index 45eb63dc..500b4d69 100644
--- a/fidget/benches/mesh.rs
+++ b/fidget/benches/mesh.rs
@@ -1,7 +1,7 @@
 use criterion::{
     black_box, criterion_group, criterion_main, BenchmarkId, Criterion,
 };
-use fidget::render::ThreadCount;
+use fidget::mesh::ThreadCount;
 
 const COLONNADE: &str = include_str!("../../models/colonnade.vm");
 
diff --git a/fidget/benches/render.rs b/fidget/benches/render.rs
index 48f51bc5..5acd4cbd 100644
--- a/fidget/benches/render.rs
+++ b/fidget/benches/render.rs
@@ -1,7 +1,7 @@
 use criterion::{
     black_box, criterion_group, criterion_main, BenchmarkId, Criterion,
 };
-use fidget::render::{ImageSize, RenderHints, ThreadCount};
+use fidget::render::{ImageSize, RenderHints, ThreadPool};
 
 const PROSPERO: &str = include_str!("../../models/prospero.vm");
 
@@ -53,16 +53,28 @@ pub fn prospero_thread_sweep(c: &mut Criterion) {
 
     let mut group =
         c.benchmark_group("speed vs threads (prospero, 2d) (1024 x 1024)");
-    for threads in std::iter::once(ThreadCount::One).chain(
-        [1, 2, 4, 8, 16].map(|i| ThreadCount::Many(i.try_into().unwrap())),
-    ) {
+    let pools = [1, 2, 4, 8, 16].map(|i| {
+        rayon::ThreadPoolBuilder::new()
+            .num_threads(i)
+            .build()
+            .unwrap()
+    });
+    for threads in [None, Some(ThreadPool::Global)]
+        .into_iter()
+        .chain(pools.iter().map(|p| Some(ThreadPool::Custom(p))))
+    {
+        let name = match &threads {
+            None => "-".to_string(),
+            Some(ThreadPool::Custom(i)) => i.current_num_threads().to_string(),
+            Some(ThreadPool::Global) => "N".to_string(),
+        };
         let cfg = &fidget::render::ImageRenderConfig {
             image_size: ImageSize::from(1024),
             tile_sizes: fidget::vm::VmFunction::tile_sizes_2d(),
-            threads,
+            threads: threads.clone(),
             ..Default::default()
         };
-        group.bench_function(BenchmarkId::new("vm", threads), move |b| {
+        group.bench_function(BenchmarkId::new("vm", &name), move |b| {
             b.iter(|| {
                 let tape = shape_vm.clone();
                 black_box(cfg.run::<_, fidget::render::BitRenderMode>(tape))
@@ -76,7 +88,7 @@ pub fn prospero_thread_sweep(c: &mut Criterion) {
                 threads,
                 ..Default::default()
             };
-            group.bench_function(BenchmarkId::new("jit", threads), move |b| {
+            group.bench_function(BenchmarkId::new("jit", &name), move |b| {
                 b.iter(|| {
                     let tape = shape_jit.clone();
                     black_box(cfg.run::<_, fidget::render::BitRenderMode>(tape))
diff --git a/fidget/src/core/eval/mod.rs b/fidget/src/core/eval/mod.rs
index f8d05f2a..945e3e8a 100644
--- a/fidget/src/core/eval/mod.rs
+++ b/fidget/src/core/eval/mod.rs
@@ -83,7 +83,7 @@ pub trait Function: Send + Sync + Clone {
     /// This type must implement [`Eq`] so that traces can be compared; calling
     /// [`Function::simplify`] with traces that compare equal should produce an
     /// identical result and may be cached.
-    type Trace: Clone + Eq + Send + Trace;
+    type Trace: Clone + Eq + Send + Sync + Trace;
 
     /// Associated type for storage used by the function itself
     type Storage: Default + Send;
diff --git a/fidget/src/core/vm/mod.rs b/fidget/src/core/vm/mod.rs
index dc4715ee..32d5e2d1 100644
--- a/fidget/src/core/vm/mod.rs
+++ b/fidget/src/core/vm/mod.rs
@@ -220,11 +220,11 @@ impl<const N: usize> Function for GenericVmFunction<N> {
 
 impl<const N: usize> RenderHints for GenericVmFunction<N> {
     fn tile_sizes_3d() -> TileSizes {
-        TileSizes::new(&[256, 128, 64, 32, 16, 8]).unwrap()
+        TileSizes::new(&[128, 64, 32, 16, 8]).unwrap()
     }
 
     fn tile_sizes_2d() -> TileSizes {
-        TileSizes::new(&[256, 128, 64, 32, 16, 8]).unwrap()
+        TileSizes::new(&[128, 32, 8]).unwrap()
     }
 }
 
diff --git a/fidget/src/mesh/mod.rs b/fidget/src/mesh/mod.rs
index 456628e8..67dc5cec 100644
--- a/fidget/src/mesh/mod.rs
+++ b/fidget/src/mesh/mod.rs
@@ -48,7 +48,71 @@ mod octree;
 mod output;
 mod qef;
 
-use crate::render::{ThreadCount, View3};
+use crate::render::View3;
+
+/// Number of threads to use during evaluation
+///
+/// In a WebAssembly build, only the [`ThreadCount::One`] variant is available.
+#[derive(Copy, Clone, Debug)]
+pub enum ThreadCount {
+    /// Perform all evaluation in the main thread, not spawning any workers
+    One,
+
+    /// Spawn some number of worker threads for evaluation
+    ///
+    /// This can be set to `1`, in which case a single worker thread will be
+    /// spawned; this is different from doing work in the main thread, but not
+    /// particularly useful!
+    #[cfg(not(target_arch = "wasm32"))]
+    Many(std::num::NonZeroUsize),
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+impl From<std::num::NonZeroUsize> for ThreadCount {
+    fn from(v: std::num::NonZeroUsize) -> Self {
+        match v.get() {
+            0 => unreachable!(),
+            1 => ThreadCount::One,
+            _ => ThreadCount::Many(v),
+        }
+    }
+}
+
+/// Single-threaded mode is shown as `-`; otherwise, an integer
+impl std::fmt::Display for ThreadCount {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ThreadCount::One => write!(f, "-"),
+            #[cfg(not(target_arch = "wasm32"))]
+            ThreadCount::Many(n) => write!(f, "{n}"),
+        }
+    }
+}
+
+impl ThreadCount {
+    /// Gets the thread count
+    ///
+    /// Returns `None` if we are required to be single-threaded
+    pub fn get(&self) -> Option<usize> {
+        match self {
+            ThreadCount::One => None,
+            #[cfg(not(target_arch = "wasm32"))]
+            ThreadCount::Many(v) => Some(v.get()),
+        }
+    }
+}
+
+impl Default for ThreadCount {
+    #[cfg(target_arch = "wasm32")]
+    fn default() -> Self {
+        Self::One
+    }
+
+    #[cfg(not(target_arch = "wasm32"))]
+    fn default() -> Self {
+        Self::Many(std::num::NonZeroUsize::new(8).unwrap())
+    }
+}
 
 #[cfg(not(target_arch = "wasm32"))]
 mod mt;
diff --git a/fidget/src/mesh/octree.rs b/fidget/src/mesh/octree.rs
index d74cce3b..ce1640fc 100644
--- a/fidget/src/mesh/octree.rs
+++ b/fidget/src/mesh/octree.rs
@@ -8,11 +8,11 @@ use super::{
     gen::CELL_TO_VERT_TO_EDGES,
     qef::QuadraticErrorSolver,
     types::{Axis, Corner, Edge},
-    Mesh, Settings,
+    Mesh, Settings, ThreadCount,
 };
 use crate::{
     eval::{BulkEvaluator, Function, TracingEvaluator},
-    render::{RenderHints, ThreadCount},
+    render::RenderHints,
     shape::{Shape, ShapeBulkEval, ShapeTape, ShapeTracingEval, ShapeVars},
     types::Grad,
 };
@@ -1218,7 +1218,7 @@ mod test {
     use crate::{
         context::Tree,
         mesh::types::{Edge, X, Y, Z},
-        render::{ThreadCount, View3},
+        render::View3,
         shape::EzShape,
         var::Var,
         vm::{VmFunction, VmShape},
diff --git a/fidget/src/render/config.rs b/fidget/src/render/config.rs
index 43c52e64..4e29a3bd 100644
--- a/fidget/src/render/config.rs
+++ b/fidget/src/render/config.rs
@@ -4,74 +4,21 @@ use crate::{
     shape::{Shape, ShapeVars},
 };
 use nalgebra::{Const, Matrix3, Matrix4, OPoint, Point2, Vector2};
-use std::sync::atomic::{AtomicUsize, Ordering};
 
-/// Number of threads to use during evaluation
+/// Thread pool to use for multithreaded rendering
 ///
-/// In a WebAssembly build, only the [`ThreadCount::One`] variant is available.
-#[derive(Copy, Clone, Debug)]
-pub enum ThreadCount {
-    /// Perform all evaluation in the main thread, not spawning any workers
-    One,
-
-    /// Spawn some number of worker threads for evaluation
-    ///
-    /// This can be set to `1`, in which case a single worker thread will be
-    /// spawned; this is different from doing work in the main thread, but not
-    /// particularly useful!
-    #[cfg(not(target_arch = "wasm32"))]
-    Many(std::num::NonZeroUsize),
-}
-
-#[cfg(not(target_arch = "wasm32"))]
-impl From<std::num::NonZeroUsize> for ThreadCount {
-    fn from(v: std::num::NonZeroUsize) -> Self {
-        match v.get() {
-            0 => unreachable!(),
-            1 => ThreadCount::One,
-            _ => ThreadCount::Many(v),
-        }
-    }
-}
-
-/// Single-threaded mode is shown as `-`; otherwise, an integer
-impl std::fmt::Display for ThreadCount {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            ThreadCount::One => write!(f, "-"),
-            #[cfg(not(target_arch = "wasm32"))]
-            ThreadCount::Many(n) => write!(f, "{n}"),
-        }
-    }
-}
-
-impl ThreadCount {
-    /// Gets the thread count
-    ///
-    /// Returns `None` if we are required to be single-threaded
-    pub fn get(&self) -> Option<usize> {
-        match self {
-            ThreadCount::One => None,
-            #[cfg(not(target_arch = "wasm32"))]
-            ThreadCount::Many(v) => Some(v.get()),
-        }
-    }
-}
-
-impl Default for ThreadCount {
-    #[cfg(target_arch = "wasm32")]
-    fn default() -> Self {
-        Self::One
-    }
-
-    #[cfg(not(target_arch = "wasm32"))]
-    fn default() -> Self {
-        Self::Many(std::num::NonZeroUsize::new(8).unwrap())
-    }
+/// Most users will use the global Rayon pool, but it's possible to provide your
+/// own as well.
+#[derive(Clone)]
+pub enum ThreadPool<'a> {
+    /// User-provided pool
+    Custom(&'a rayon::ThreadPool),
+    /// Global Rayon pool
+    Global,
 }
 
 /// Settings for 2D rendering
-pub struct ImageRenderConfig {
+pub struct ImageRenderConfig<'a> {
     /// Render size
     pub image_size: ImageSize,
 
@@ -85,22 +32,25 @@ pub struct ImageRenderConfig {
     /// to select this based on evaluator type.
     pub tile_sizes: TileSizes,
 
-    /// Number of worker threads
-    pub threads: ThreadCount,
+    /// Thread pool to use for rendering
+    ///
+    /// If this is `None`, then rendering is done in a single thread; otherwise,
+    /// the provided pool is used.
+    pub threads: Option<ThreadPool<'a>>,
 }
 
-impl Default for ImageRenderConfig {
+impl Default for ImageRenderConfig<'_> {
     fn default() -> Self {
         Self {
             image_size: ImageSize::from(512),
             tile_sizes: TileSizes::new(&[128, 32, 8]).unwrap(),
             view: View2::default(),
-            threads: ThreadCount::default(),
+            threads: Some(ThreadPool::Global),
         }
     }
 }
 
-impl ImageRenderConfig {
+impl ImageRenderConfig<'_> {
     /// Render a shape in 2D using this configuration
     pub fn run<F: Function, M: RenderMode + Sync>(
         &self,
@@ -125,7 +75,7 @@ impl ImageRenderConfig {
 }
 
 /// Settings for 3D rendering
-pub struct VoxelRenderConfig {
+pub struct VoxelRenderConfig<'a> {
     /// Render size
     ///
     /// The resulting image will have the given width and height; depth sets the
@@ -143,23 +93,26 @@ pub struct VoxelRenderConfig {
     /// to select this based on evaluator type.
     pub tile_sizes: TileSizes,
 
-    /// Number of worker threads
-    pub threads: ThreadCount,
+    /// Thread pool to use for rendering
+    ///
+    /// If this is `None`, then rendering is done in a single thread; otherwise,
+    /// the provided pool is used.
+    pub threads: Option<ThreadPool<'a>>,
 }
 
-impl Default for VoxelRenderConfig {
+impl Default for VoxelRenderConfig<'_> {
     fn default() -> Self {
         Self {
             image_size: VoxelSize::from(512),
             tile_sizes: TileSizes::new(&[128, 64, 32, 16, 8]).unwrap(),
             view: View3::default(),
 
-            threads: ThreadCount::default(),
+            threads: Some(ThreadPool::Global),
         }
     }
 }
 
-impl VoxelRenderConfig {
+impl VoxelRenderConfig<'_> {
     /// Render a shape in 3D using this configuration
     ///
     /// Returns a tuple of heightmap, RGB image.
@@ -183,11 +136,6 @@ impl VoxelRenderConfig {
     pub fn mat(&self) -> Matrix4<f32> {
         self.view.world_to_model() * self.image_size.screen_to_world()
     }
-
-    /// Returns the data offset of a row within a subtile
-    pub(crate) fn tile_row_offset(&self, tile: Tile<3>, row: usize) -> usize {
-        self.tile_sizes.pixel_offset(tile.add(Vector2::new(0, row)))
-    }
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -214,25 +162,6 @@ impl<const N: usize> Tile<N> {
     }
 }
 
-/// Worker queue
-pub(crate) struct Queue<const N: usize> {
-    index: AtomicUsize,
-    tiles: Vec<Tile<N>>,
-}
-
-impl<const N: usize> Queue<N> {
-    pub fn new(tiles: Vec<Tile<N>>) -> Self {
-        Self {
-            index: AtomicUsize::new(0),
-            tiles,
-        }
-    }
-    pub fn next(&self) -> Option<Tile<N>> {
-        let index = self.index.fetch_add(1, Ordering::Relaxed);
-        self.tiles.get(index).cloned()
-    }
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 #[cfg(test)]
diff --git a/fidget/src/render/mod.rs b/fidget/src/render/mod.rs
index 15b76efb..33eb7342 100644
--- a/fidget/src/render/mod.rs
+++ b/fidget/src/render/mod.rs
@@ -14,7 +14,7 @@ mod render2d;
 mod render3d;
 mod view;
 
-pub use config::{ImageRenderConfig, ThreadCount, VoxelRenderConfig};
+pub use config::{ImageRenderConfig, ThreadPool, VoxelRenderConfig};
 pub use region::{ImageSize, RegionSize, VoxelSize};
 pub use view::{View2, View3};
 
diff --git a/fidget/src/render/render2d.rs b/fidget/src/render/render2d.rs
index 04f33086..56344391 100644
--- a/fidget/src/render/render2d.rs
+++ b/fidget/src/render/render2d.rs
@@ -2,12 +2,15 @@
 use super::RenderHandle;
 use crate::{
     eval::Function,
-    render::config::{ImageRenderConfig, Queue, Tile},
-    render::ThreadCount,
+    render::{
+        config::{ImageRenderConfig, ThreadPool, Tile},
+        TileSizes,
+    },
     shape::{Shape, ShapeBulkEval, ShapeTracingEval, ShapeVars},
     types::Interval,
 };
 use nalgebra::{Point2, Vector2};
+use rayon::prelude::*;
 
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -203,7 +206,7 @@ impl Scratch {
 
 /// Per-thread worker
 struct Worker<'a, F: Function, M: RenderMode> {
-    config: &'a ImageRenderConfig,
+    tile_sizes: &'a TileSizes,
     scratch: Scratch,
 
     eval_float_slice: ShapeBulkEval<F::FloatSliceEval>,
@@ -225,6 +228,17 @@ struct Worker<'a, F: Function, M: RenderMode> {
 }
 
 impl<F: Function, M: RenderMode> Worker<'_, F, M> {
+    fn render_tile(
+        &mut self,
+        shape: &mut RenderHandle<F>,
+        vars: &ShapeVars<f32>,
+        tile: Tile<2>,
+    ) -> Vec<M::Output> {
+        self.image = vec![M::Output::default(); self.tile_sizes[0].pow(2)];
+        self.render_tile_recurse(shape, vars, 0, tile);
+        std::mem::take(&mut self.image)
+    }
+
     fn render_tile_recurse(
         &mut self,
         shape: &mut RenderHandle<F>,
@@ -232,7 +246,7 @@ impl<F: Function, M: RenderMode> Worker<'_, F, M> {
         depth: usize,
         tile: Tile<2>,
     ) {
-        let tile_size = self.config.tile_sizes[depth];
+        let tile_size = self.tile_sizes[depth];
 
         // Find the interval bounds of the region, in screen coordinates
         let base = Point2::from(tile.corner).cast::<f32>();
@@ -250,7 +264,6 @@ impl<F: Function, M: RenderMode> Worker<'_, F, M> {
             IntervalAction::Fill(fill) => {
                 for y in 0..tile_size {
                     let start = self
-                        .config
                         .tile_sizes
                         .pixel_offset(tile.add(Vector2::new(0, y)));
                     self.image[start..][..tile_size].fill(fill);
@@ -274,7 +287,6 @@ impl<F: Function, M: RenderMode> Worker<'_, F, M> {
                     let v1 = vs[2] * (1.0 - y_frac) + vs[3] * y_frac;
 
                     let mut i = self
-                        .config
                         .tile_sizes
                         .pixel_offset(tile.add(Vector2::new(0, y)));
                     for x in 0..tile_size {
@@ -303,7 +315,7 @@ impl<F: Function, M: RenderMode> Worker<'_, F, M> {
             shape
         };
 
-        if let Some(next_tile_size) = self.config.tile_sizes.get(depth + 1) {
+        if let Some(next_tile_size) = self.tile_sizes.get(depth + 1) {
             let n = tile_size / next_tile_size;
             for j in 0..n {
                 for i in 0..n {
@@ -351,10 +363,7 @@ impl<F: Function, M: RenderMode> Worker<'_, F, M> {
 
         let mut index = 0;
         for j in 0..tile_size {
-            let o = self
-                .config
-                .tile_sizes
-                .pixel_offset(tile.add(Vector2::new(0, j)));
+            let o = self.tile_sizes.pixel_offset(tile.add(Vector2::new(0, j)));
             for i in 0..tile_size {
                 self.image[o + i] = M::pixel(out[index]);
                 index += 1;
@@ -365,37 +374,6 @@ impl<F: Function, M: RenderMode> Worker<'_, F, M> {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-fn worker<F: Function, M: RenderMode>(
-    mut shape: RenderHandle<F>,
-    vars: &ShapeVars<f32>,
-    queue: &Queue<2>,
-    config: &ImageRenderConfig,
-) -> Vec<(Tile<2>, Vec<M::Output>)> {
-    let mut out = vec![];
-    let scratch = Scratch::new(config.tile_sizes.last().pow(2));
-
-    let mut w: Worker<F, M> = Worker {
-        scratch,
-        image: vec![],
-        config,
-        eval_float_slice: Default::default(),
-        eval_interval: Default::default(),
-        tape_storage: vec![],
-        shape_storage: vec![],
-        workspace: Default::default(),
-    };
-
-    while let Some(tile) = queue.next() {
-        w.image = vec![M::Output::default(); config.tile_sizes[0].pow(2)];
-        w.render_tile_recurse(&mut shape, vars, 0, tile);
-        let pixels = std::mem::take(&mut w.image);
-        out.push((tile, pixels))
-    }
-    out
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
 /// Renders the given tape into a 2D image at Z = 0 according to the provided
 /// configuration.
 ///
@@ -437,30 +415,52 @@ fn render_inner<F: Function, M: RenderMode + Sync>(
         }
     }
 
-    let queue = Queue::new(tiles);
-
     let mut rh = RenderHandle::new(shape);
     let _ = rh.i_tape(&mut vec![]); // populate i_tape before cloning
+    let init = || {
+        let scratch = Scratch::new(config.tile_sizes.last().pow(2));
+        let rh = rh.clone();
+
+        let worker = Worker::<F, M> {
+            scratch,
+            image: vec![],
+            tile_sizes: &config.tile_sizes,
+            eval_float_slice: Default::default(),
+            eval_interval: Default::default(),
+            tape_storage: vec![],
+            shape_storage: vec![],
+            workspace: Default::default(),
+        };
+        (worker, rh)
+    };
 
-    let out: Vec<_> = match config.threads {
-        ThreadCount::One => worker::<F, M>(rh, vars, &queue, config)
-            .into_iter()
-            .collect(),
-
-        #[cfg(not(target_arch = "wasm32"))]
-        ThreadCount::Many(v) => std::thread::scope(|s| {
-            let mut handles = vec![];
-            for _ in 0..v.get() {
-                let rh = rh.clone();
-                handles
-                    .push(s.spawn(|| worker::<F, M>(rh, vars, &queue, config)));
-            }
-            let mut out = vec![];
-            for h in handles {
-                out.extend(h.join().unwrap().into_iter());
+    let out: Vec<_> = match &config.threads {
+        None => {
+            let (mut worker, mut rh) = init();
+            tiles
+                .into_iter()
+                .map(|tile| {
+                    let pixels = worker.render_tile(&mut rh, vars, tile);
+                    (tile, pixels)
+                })
+                .collect()
+        }
+
+        Some(p) => {
+            let run = || {
+                tiles
+                    .into_par_iter()
+                    .map_init(init, |(w, rh), tile| {
+                        let pixels = w.render_tile(rh, vars, tile);
+                        (tile, pixels)
+                    })
+                    .collect()
+            };
+            match p {
+                ThreadPool::Custom(p) => p.install(run),
+                ThreadPool::Global => run(),
             }
-            out
-        }),
+        }
     };
 
     let mut image = vec![M::Output::default(); width * height];
diff --git a/fidget/src/render/render3d.rs b/fidget/src/render/render3d.rs
index d9029e9e..ef79c0c4 100644
--- a/fidget/src/render/render3d.rs
+++ b/fidget/src/render/render3d.rs
@@ -2,13 +2,16 @@
 use super::RenderHandle;
 use crate::{
     eval::Function,
-    render::config::{Queue, ThreadCount, Tile, VoxelRenderConfig},
+    render::{
+        config::{ThreadPool, Tile, VoxelRenderConfig},
+        TileSizes, VoxelSize,
+    },
     shape::{Shape, ShapeBulkEval, ShapeTracingEval, ShapeVars},
     types::{Grad, Interval},
 };
 
-use nalgebra::{Point3, Vector2, Vector3};
-use std::collections::HashMap;
+use nalgebra::{Point2, Point3, Vector2, Vector3};
+use rayon::prelude::*;
 
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -47,7 +50,8 @@ impl Scratch {
 ////////////////////////////////////////////////////////////////////////////////
 
 struct Worker<'a, F: Function> {
-    config: &'a VoxelRenderConfig,
+    tile_sizes: &'a TileSizes,
+    image_size: VoxelSize,
 
     /// Reusable workspace for evaluation, to minimize allocation
     scratch: Scratch,
@@ -66,21 +70,54 @@ struct Worker<'a, F: Function> {
 }
 
 impl<F: Function> Worker<'_, F> {
+    fn render_tile(
+        &mut self,
+        shape: &mut RenderHandle<F>,
+        vars: &ShapeVars<f32>,
+        tile: Tile<2>,
+    ) -> (Vec<u32>, Vec<[u8; 3]>) {
+        // Prepare local tile data to fill out
+        self.depth = vec![0; self.tile_sizes[0].pow(2)];
+        self.color = vec![[0u8; 3]; self.tile_sizes[0].pow(2)];
+        let root_tile_size = self.tile_sizes[0];
+        for k in (0..self.image_size[2].div_ceil(root_tile_size as u32)).rev() {
+            let tile = Tile::new(Point3::new(
+                tile.corner.x,
+                tile.corner.y,
+                k as usize * root_tile_size,
+            ));
+            if !self.render_tile_recurse(shape, vars, 0, tile) {
+                break;
+            }
+        }
+        let depth = std::mem::take(&mut self.depth);
+        let color = std::mem::take(&mut self.color);
+        (depth, color)
+    }
+
+    /// Returns the data offset of a row within a subtile
+    pub(crate) fn tile_row_offset(&self, tile: Tile<3>, row: usize) -> usize {
+        self.tile_sizes.pixel_offset(tile.add(Vector2::new(0, row)))
+    }
+
+    /// Render a single tile
+    ///
+    /// Returns `true` if we should keep rendering, `false` otherwise
     fn render_tile_recurse(
         &mut self,
         shape: &mut RenderHandle<F>,
         vars: &ShapeVars<f32>,
         depth: usize,
         tile: Tile<3>,
-    ) {
+    ) -> bool {
         // Early exit if every single pixel is filled
-        let tile_size = self.config.tile_sizes[depth];
+        let tile_size = self.tile_sizes[depth];
         let fill_z = (tile.corner[2] + tile_size + 1).try_into().unwrap();
         if (0..tile_size).all(|y| {
-            let i = self.config.tile_row_offset(tile, y);
+            let i = self.tile_row_offset(tile, y);
             (0..tile_size).all(|x| self.depth[i + x] >= fill_z)
         }) {
-            return;
+            return false;
         }
 
         let base = Point3::from(tile.corner).cast::<f32>();
@@ -97,14 +134,14 @@ impl<F: Function> Worker<'_, F> {
         // `data_interval` to scratch memory for reuse.
         if i.upper() < 0.0 {
             for y in 0..tile_size {
-                let i = self.config.tile_row_offset(tile, y);
+                let i = self.tile_row_offset(tile, y);
                 for x in 0..tile_size {
                     self.depth[i + x] = self.depth[i + x].max(fill_z);
                 }
             }
-            return;
+            return false; // completely full, stop rendering
         } else if i.lower() > 0.0 {
-            return;
+            return true; // completely empty, keep going
         }
 
         // Calculate a simplified tape based on the trace
@@ -120,7 +157,7 @@ impl<F: Function> Worker<'_, F> {
         };
 
         // Recurse!
-        if let Some(next_tile_size) = self.config.tile_sizes.get(depth + 1) {
+        if let Some(next_tile_size) = self.tile_sizes.get(depth + 1) {
             let n = tile_size / next_tile_size;
 
             for j in 0..n {
@@ -142,6 +179,7 @@ impl<F: Function> Worker<'_, F> {
             self.render_tile_pixels(sub_tape, vars, tile_size, tile);
         };
         // TODO recycle something here?
+        true // keep going
     }
 
     fn render_tile_pixels(
@@ -161,10 +199,7 @@ impl<F: Function> Worker<'_, F> {
             let i = xy % tile_size;
             let j = xy / tile_size;
 
-            let o = self
-                .config
-                .tile_sizes
-                .pixel_offset(tile.add(Vector2::new(i, j)));
+            let o = self.tile_sizes.pixel_offset(tile.add(Vector2::new(i, j)));
 
             // Skip pixels which are behind the image
             let zmax = (tile.corner[2] + tile_size).try_into().unwrap();
@@ -230,10 +265,7 @@ impl<F: Function> Worker<'_, F> {
             let k = tile_size - 1 - k;
 
             // Set the depth of the pixel
-            let o = self
-                .config
-                .tile_sizes
-                .pixel_offset(tile.add(Vector2::new(i, j)));
+            let o = self.tile_sizes.pixel_offset(tile.add(Vector2::new(i, j)));
             let z = (tile.corner[2] + k + 1).try_into().unwrap();
             assert!(self.depth[o] < z);
             self.depth[o] = z;
@@ -277,86 +309,6 @@ impl<F: Function> Worker<'_, F> {
 
 ////////////////////////////////////////////////////////////////////////////////
 
-#[derive(Default)]
-struct Image {
-    depth: Vec<u32>,
-    color: Vec<[u8; 3]>,
-}
-
-impl Image {
-    fn new(size: usize) -> Self {
-        Self {
-            depth: vec![0; size.pow(2)],
-            color: vec![[0; 3]; size.pow(2)],
-        }
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-fn worker<F: Function>(
-    mut shape: RenderHandle<F>,
-    vars: &ShapeVars<f32>,
-    queues: &[Queue<3>],
-    mut index: usize,
-    config: &VoxelRenderConfig,
-) -> HashMap<[usize; 2], Image> {
-    let mut out = HashMap::new();
-
-    // Calculate maximum evaluation buffer size
-    let buf_size = config.tile_sizes.last();
-    let scratch = Scratch::new(buf_size);
-    let mut w: Worker<F> = Worker {
-        scratch,
-        depth: vec![],
-        color: vec![],
-        config,
-
-        eval_float_slice: Default::default(),
-        eval_interval: Default::default(),
-        eval_grad_slice: Default::default(),
-
-        tape_storage: vec![],
-        shape_storage: vec![],
-        workspace: Default::default(),
-    };
-
-    // Every thread has a set of tiles assigned to it, which are in Z-sorted
-    // order (to encourage culling).  Once the thread finishes its tiles, it
-    // begins stealing from other thread queues; if every single thread queue is
-    // empty, then we return.
-    let start = index;
-    loop {
-        while let Some(tile) = queues[index].next() {
-            let image = out
-                .remove(&[tile.corner[0], tile.corner[1]])
-                .unwrap_or_else(|| Image::new(config.tile_sizes[0]));
-
-            // Prepare to render, allocating space for a tile
-            w.depth = image.depth;
-            w.color = image.color;
-            w.render_tile_recurse(&mut shape, vars, 0, tile);
-
-            // Steal the tile, replacing it with an empty vec
-            let depth = std::mem::take(&mut w.depth);
-            let color = std::mem::take(&mut w.color);
-            out.insert(
-                [tile.corner[0], tile.corner[1]],
-                Image { depth, color },
-            );
-        }
-        // Move on to the next thread's queue
-        index = (index + 1) % queues.len();
-        if index == start {
-            break;
-        }
-    }
-
-    out
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
 /// Renders the given tape into a 3D image according to the provided
 /// configuration.
 ///
@@ -376,70 +328,83 @@ pub fn render<F: Function>(
     let t = config.tile_sizes[0];
     let width = config.image_size[0] as usize;
     let height = config.image_size[1] as usize;
-    let depth = config.image_size[2] as usize;
     for i in 0..width.div_ceil(t) {
         for j in 0..height.div_ceil(t) {
-            for k in (0..depth.div_ceil(t)).rev() {
-                tiles.push(Tile::new(Point3::new(
-                    i * config.tile_sizes[0],
-                    j * config.tile_sizes[0],
-                    k * config.tile_sizes[0],
-                )));
-            }
+            tiles.push(Tile::new(Point2::new(
+                i * config.tile_sizes[0],
+                j * config.tile_sizes[0],
+            )));
         }
     }
 
-    let threads = config.threads.get().unwrap_or(1);
-    let tiles_per_thread = (tiles.len() / threads).max(1);
-    let mut tile_queues = vec![];
-    for ts in tiles.chunks(tiles_per_thread) {
-        tile_queues.push(Queue::new(ts.to_vec()));
-    }
-    tile_queues.resize_with(threads, || Queue::new(vec![]));
-
     let mut rh = RenderHandle::new(shape);
     let _ = rh.i_tape(&mut vec![]); // populate i_tape before cloning
 
+    let init = || {
+        let rh = rh.clone();
+        let buf_size = config.tile_sizes.last();
+        let scratch = Scratch::new(buf_size);
+        let worker: Worker<F> = Worker {
+            scratch,
+            depth: vec![],
+            color: vec![],
+            tile_sizes: &config.tile_sizes,
+            image_size: config.image_size,
+
+            eval_float_slice: Default::default(),
+            eval_interval: Default::default(),
+            eval_grad_slice: Default::default(),
+
+            tape_storage: vec![],
+            shape_storage: vec![],
+            workspace: Default::default(),
+        };
+        (worker, rh)
+    };
+
     // Special-case for single-threaded operation, to give simpler backtraces
-    let out: Vec<_> = match config.threads {
-        ThreadCount::One => {
-            worker::<F>(rh, vars, tile_queues.as_slice(), 0, config)
+    let out: Vec<_> = match &config.threads {
+        None => {
+            let (mut worker, mut rh) = init();
+            tiles
                 .into_iter()
+                .map(|tile| {
+                    let pixels = worker.render_tile(&mut rh, vars, tile);
+                    (tile, pixels)
+                })
                 .collect()
         }
 
-        #[cfg(not(target_arch = "wasm32"))]
-        ThreadCount::Many(threads) => std::thread::scope(|s| {
-            let config = &config;
-            let mut handles = vec![];
-            let queues = tile_queues.as_slice();
-            for i in 0..threads.get() {
-                let rh = rh.clone();
-                handles.push(
-                    s.spawn(move || worker::<F>(rh, vars, queues, i, config)),
-                );
-            }
-            let mut out = vec![];
-            for h in handles {
-                out.extend(h.join().unwrap().into_iter());
+        Some(p) => {
+            let run = || {
+                tiles
+                    .into_par_iter()
+                    .map_init(init, |(w, rh), tile| {
+                        let pixels = w.render_tile(rh, vars, tile);
+                        (tile, pixels)
+                    })
+                    .collect()
+            };
+            match p {
+                ThreadPool::Custom(p) => p.install(run),
+                ThreadPool::Global => run(),
             }
-            out
-        }),
+        }
     };
 
     let mut image_depth = vec![0; width * height];
     let mut image_color = vec![[0; 3]; width * height];
-    for (tile, patch) in out.iter() {
+    for (tile, (depth, color)) in out {
         let mut index = 0;
         for j in 0..config.tile_sizes[0] {
-            let y = j + tile[1];
+            let y = j + tile.corner.y;
             for i in 0..config.tile_sizes[0] {
-                let x = i + tile[0];
+                let x = i + tile.corner.x;
                 if x < width && y < height {
                     let o = y * width + x;
-                    if patch.depth[index] >= image_depth[o] {
-                        image_color[o] = patch.color[index];
-                        image_depth[o] = patch.depth[index];
+                    if depth[index] >= image_depth[o] {
+                        image_color[o] = color[index];
+                        image_depth[o] = depth[index];
                     }
                 }
                 index += 1;
diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml
index 97413789..5b3628d8 100644
--- a/workspace-hack/Cargo.toml
+++ b/workspace-hack/Cargo.toml
@@ -20,7 +20,7 @@ approx = { version = "0.5" }
 bytemuck = { version = "1", default-features = false, features = ["derive", "extern_crate_alloc"] }
 clap = { version = "4", features = ["derive"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "suggestions", "usage"] }
-crossbeam-utils = { version = "0.8" }
+either = { version = "1", default-features = false, features = ["use_std"] }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
 num-traits = { version = "0.2", features = ["i128"] }
 once_cell = { version = "1" }