diff --git a/Cargo.lock b/Cargo.lock
index 8ed75c5f..b201f3ae 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2518,7 +2518,6 @@ name = "vello"
 version = "0.4.0"
 dependencies = [
  "bytemuck",
- "futures-intrusive",
  "log",
  "peniko",
  "png",
diff --git a/examples/headless/src/main.rs b/examples/headless/src/main.rs
index 98bcdcdd..e2e59c0f 100644
--- a/examples/headless/src/main.rs
+++ b/examples/headless/src/main.rs
@@ -97,7 +97,6 @@ async fn render(mut scenes: SceneSet, index: usize, args: &Args) -> Result<()> {
     let mut renderer = vello::Renderer::new(
         device,
         RendererOptions {
-            surface_format: None,
             use_cpu: args.use_cpu,
             num_init_threads: NonZeroUsize::new(1),
             antialiasing_support: vello::AaSupport::area_only(),
diff --git a/examples/scenes/src/test_scenes.rs b/examples/scenes/src/test_scenes.rs
index fce91edd..51c0806e 100644
--- a/examples/scenes/src/test_scenes.rs
+++ b/examples/scenes/src/test_scenes.rs
@@ -76,6 +76,7 @@ export_scenes!(
     labyrinth(labyrinth),
     robust_paths(robust_paths),
     base_color_test(base_color_test: animated),
+    translucent_base(translucent_base),
     clip_test(clip_test: animated),
     longpathdash_butt(impls::longpathdash(Cap::Butt), "longpathdash (butt caps)", false),
     longpathdash_round(impls::longpathdash(Cap::Round), "longpathdash (round caps)", false),
@@ -1539,6 +1540,20 @@ mod impls {
         );
     }
 
+    /// A 50%-alpha white square drawn over a fully transparent base color.
+    pub(super) fn translucent_base(scene: &mut Scene, params: &mut SceneParams<'_>) {
+        params.base_color = Some(Color::TRANSPARENT);
+        // Blend a half-opaque white square over that transparent base.
+        let square = Rect::new(50.0, 50.0, 500.0, 500.0);
+        scene.fill(
+            Fill::NonZero,
+            Affine::IDENTITY,
+            palette::css::WHITE.with_alpha(0.5),
+            None,
+            &square,
+        );
+    }
+
     pub(super) fn clip_test(scene: &mut Scene, params: &mut SceneParams<'_>) {
         let clip = {
             const X0: f64 = 50.0;
diff --git a/examples/simple/src/main.rs b/examples/simple/src/main.rs
index 8a4a5159..3af1a71a 100644
--- a/examples/simple/src/main.rs
+++ b/examples/simple/src/main.rs
@@ -131,21 +131,15 @@ impl ApplicationHandler for SimpleVelloApp<'_> {
                 // Get a handle to the device
                 let device_handle = &self.context.devices[surface.dev_id];
 
-                // Get the surface's texture
-                let surface_texture = surface
-                    .surface
-                    .get_current_texture()
-                    .expect("failed to get surface texture");
-
-                // Render to the surface's texture
+                // Render to a texture, which we will later copy into the surface
                 self.renderers[surface.dev_id]
                     .as_mut()
                     .unwrap()
-                    .render_to_surface(
+                    .render_to_texture(
                         &device_handle.device,
                         &device_handle.queue,
                         &self.scene,
-                        &surface_texture,
+                        &surface.target_view,
                         &vello::RenderParams {
                             base_color: palette::css::BLACK, // Background color
                             width,
@@ -155,6 +149,28 @@ impl ApplicationHandler for SimpleVelloApp<'_> {
                     )
                     .expect("failed to render to surface");
 
+                // Get the surface's texture
+                let surface_texture = surface
+                    .surface
+                    .get_current_texture()
+                    .expect("failed to get surface texture");
+
+                // Perform the copy
+                let mut encoder =
+                    device_handle
+                        .device
+                        .create_command_encoder(&wgpu::CommandEncoderDescriptor {
+                            label: Some("Surface Blit"),
+                        });
+                surface.blitter.copy(
+                    &device_handle.device,
+                    &mut encoder,
+                    &surface.target_view,
+                    &surface_texture
+                        .texture
+                        .create_view(&wgpu::TextureViewDescriptor::default()),
+                );
+                device_handle.queue.submit([encoder.finish()]);
                 // Queue the texture to be presented on the surface
                 surface_texture.present();
 
@@ -196,7 +212,6 @@ fn create_vello_renderer(render_cx: &RenderContext, surface: &RenderSurface<'_>)
     Renderer::new(
         &render_cx.devices[surface.dev_id].device,
         RendererOptions {
-            surface_format: Some(surface.format),
             use_cpu: false,
             antialiasing_support: vello::AaSupport::all(),
             num_init_threads: NonZeroUsize::new(1),
diff --git a/examples/simple_sdl2/src/main.rs b/examples/simple_sdl2/src/main.rs
index 125f0a7f..13b6d198 100644
--- a/examples/simple_sdl2/src/main.rs
+++ b/examples/simple_sdl2/src/main.rs
@@ -67,19 +67,14 @@ fn main() {
 
         let device_handle = &context.devices[surface.dev_id];
 
-        let surface_texture = surface
-            .surface
-            .get_current_texture()
-            .expect("failed to get surface texture");
-
         renderers[surface.dev_id]
             .as_mut()
             .unwrap()
-            .render_to_surface(
+            .render_to_texture(
                 &device_handle.device,
                 &device_handle.queue,
                 &scene,
-                &surface_texture,
+                &surface.target_view,
                 &vello::RenderParams {
                     base_color: palette::css::BLACK, // Background color
                     width,
@@ -89,6 +84,26 @@ fn main() {
             )
             .expect("failed to render to surface");
 
+        let surface_texture = surface
+            .surface
+            .get_current_texture()
+            .expect("failed to get surface texture");
+
+        let mut encoder =
+            device_handle
+                .device
+                .create_command_encoder(&wgpu::CommandEncoderDescriptor {
+                    label: Some("Surface Blit"),
+                });
+        surface.blitter.copy(
+            &device_handle.device,
+            &mut encoder,
+            &surface.target_view,
+            &surface_texture
+                .texture
+                .create_view(&wgpu::TextureViewDescriptor::default()),
+        );
+        device_handle.queue.submit([encoder.finish()]);
         for event in event_pump.poll_iter() {
             match event {
                 Event::Quit { .. }
@@ -108,7 +123,6 @@ fn create_vello_renderer(render_cx: &RenderContext, surface: &RenderSurface<'_>)
     Renderer::new(
         &render_cx.devices[surface.dev_id].device,
         RendererOptions {
-            surface_format: Some(surface.format),
             use_cpu: false,
             antialiasing_support: vello::AaSupport::all(),
             num_init_threads: NonZeroUsize::new(1),
diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs
index 05a44351..77b73f31 100644
--- a/examples/with_winit/src/lib.rs
+++ b/examples/with_winit/src/lib.rs
@@ -45,7 +45,7 @@ use winit::dpi::LogicalSize;
 use winit::event_loop::EventLoop;
 use winit::window::{Window, WindowAttributes};
 
-use vello::wgpu;
+use vello::wgpu::{self, CommandEncoderDescriptor};
 
 #[cfg(not(any(target_arch = "wasm32", target_os = "android")))]
 mod hot_reload;
@@ -213,7 +213,6 @@ impl ApplicationHandler<UserEvent> for VelloApp<'_> {
                 let mut renderer = Renderer::new(
                     &self.context.devices[id].device,
                     RendererOptions {
-                        surface_format: Some(render_state.surface.format),
                         use_cpu: self.use_cpu,
                         antialiasing_support: AA_CONFIGS.iter().copied().collect(),
                         num_init_threads: NonZeroUsize::new(self.num_init_threads),
@@ -547,53 +546,47 @@ impl ApplicationHandler<UserEvent> for VelloApp<'_> {
                     }
                 }
                 drop(encoding_span);
-                let texture_span = tracing::trace_span!("Getting texture").entered();
-                let surface_texture = surface
-                    .surface
-                    .get_current_texture()
-                    .expect("failed to get surface texture");
-
-                drop(texture_span);
                 let render_span = tracing::trace_span!("Dispatching render").entered();
-                // Note: we don't run the async/"robust" pipeline, as
-                // it requires more async wiring for the readback. See
-                // [#gpu > async on wasm](https://xi.zulipchat.com/#narrow/stream/197075-gpu/topic/async.20on.20wasm)
-                #[allow(deprecated)]
-                // #[expect(deprecated, reason = "This deprecation is not targeted at us.")] // Our MSRV is too low to use `expect`
-                if self.async_pipeline && cfg!(not(target_arch = "wasm32")) {
-                    self.scene_complexity = vello::util::block_on_wgpu(
+                self.renderers[surface.dev_id]
+                    .as_mut()
+                    .unwrap()
+                    .render_to_texture(
                         &device_handle.device,
-                        self.renderers[surface.dev_id]
-                            .as_mut()
-                            .unwrap()
-                            .render_to_surface_async(
-                                &device_handle.device,
-                                &device_handle.queue,
-                                &self.scene,
-                                &surface_texture,
-                                &render_params,
-                                self.debug,
-                            ),
+                        &device_handle.queue,
+                        &self.scene,
+                        &surface.target_view,
+                        &render_params,
                     )
                     .expect("failed to render to surface");
-                } else {
-                    self.renderers[surface.dev_id]
-                        .as_mut()
-                        .unwrap()
-                        .render_to_surface(
-                            &device_handle.device,
-                            &device_handle.queue,
-                            &self.scene,
-                            &surface_texture,
-                            &render_params,
-                        )
-                        .expect("failed to render to surface");
-                }
-                surface_texture.present();
                 drop(render_span);
 
+                let texture_span = tracing::trace_span!("Blitting to surface").entered();
+                let surface_texture = surface
+                    .surface
+                    .get_current_texture()
+                    .expect("failed to get surface texture");
+                // Perform the copy
+                // (TODO: Does it improve throughput to acquire the surface after the previous texture render has happened?)
+                let mut encoder =
+                    device_handle
+                        .device
+                        .create_command_encoder(&CommandEncoderDescriptor {
+                            label: Some("Surface Blit"),
+                        });
+                surface.blitter.copy(
+                    &device_handle.device,
+                    &mut encoder,
+                    &surface.target_view,
+                    &surface_texture
+                        .texture
+                        .create_view(&wgpu::TextureViewDescriptor::default()),
+                );
+                device_handle.queue.submit([encoder.finish()]);
+                surface_texture.present();
+                drop(texture_span);
+
                 {
-                    let _poll_aspan = tracing::trace_span!("Polling wgpu device").entered();
+                    let _poll_span = tracing::trace_span!("Polling wgpu device").entered();
                     device_handle.device.poll(wgpu::Maintain::Poll);
                 }
                 let new_time = Instant::now();
@@ -789,6 +782,7 @@ fn window_attributes() -> WindowAttributes {
     Window::default_attributes()
         .with_inner_size(LogicalSize::new(1044, 800))
         .with_resizable(true)
+        .with_transparent(true)
         .with_title("Vello demo")
 }
 
diff --git a/vello/Cargo.toml b/vello/Cargo.toml
index ce0e238c..66a2e288 100644
--- a/vello/Cargo.toml
+++ b/vello/Cargo.toml
@@ -22,7 +22,7 @@ default = ["wgpu"]
 # bump-allocated GPU memory.
 # TODO: Turn this into a runtime option used at resolve time and remove the feature.
 bump_estimate = ["vello_encoding/bump_estimate"]
-wgpu = ["dep:wgpu", "dep:vello_shaders", "dep:futures-intrusive"]
+wgpu = ["dep:wgpu", "dep:vello_shaders"]
 
 # Development only features
 
@@ -50,7 +50,6 @@ peniko = { workspace = true }
 wgpu = { workspace = true, optional = true }
 log = { workspace = true }
 static_assertions = { workspace = true }
-futures-intrusive = { workspace = true, optional = true }
 wgpu-profiler = { workspace = true, optional = true }
 thiserror = { workspace = true }
 # TODO: Add feature for built-in bitmap emoji support?
diff --git a/vello/src/debug.rs b/vello/src/debug.rs
index 9a389a42..572b2d39 100644
--- a/vello/src/debug.rs
+++ b/vello/src/debug.rs
@@ -1,15 +1,15 @@
 // Copyright 2023 the Vello Authors
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
-#[cfg(all(feature = "debug_layers", feature = "wgpu"))]
-mod renderer;
-#[cfg(all(feature = "debug_layers", feature = "wgpu"))]
-mod validate;
+// #[cfg(all(feature = "debug_layers", feature = "wgpu"))]
+// mod renderer;
+// #[cfg(all(feature = "debug_layers", feature = "wgpu"))]
+// mod validate;
 
 use std::fmt::Debug;
 
-#[cfg(all(feature = "debug_layers", feature = "wgpu"))]
-pub(crate) use renderer::*;
+// #[cfg(all(feature = "debug_layers", feature = "wgpu"))]
+// pub(crate) use renderer::*;
 
 /// Bitflags for enabled debug operations.
 ///
diff --git a/vello/src/lib.rs b/vello/src/lib.rs
index 83ca57cf..915680f1 100644
--- a/vello/src/lib.rs
+++ b/vello/src/lib.rs
@@ -154,25 +154,20 @@ pub use wgpu;
 pub use scene::{DrawGlyphs, Scene};
 pub use vello_encoding::{Glyph, NormalizedCoord};
 
-use low_level::ShaderId;
 #[cfg(feature = "wgpu")]
-use low_level::{
-    BindType, BumpAllocators, FullShaders, ImageFormat, ImageProxy, Recording, Render,
-    ResourceProxy,
-};
+use low_level::FullShaders;
+use low_level::ShaderId;
 use thiserror::Error;
 
-#[cfg(feature = "wgpu")]
-use debug::DebugLayers;
 #[cfg(feature = "wgpu")]
 use vello_encoding::Resolver;
 #[cfg(feature = "wgpu")]
 use wgpu_engine::{ExternalResource, WgpuEngine};
 
 #[cfg(feature = "wgpu")]
-use std::{num::NonZeroUsize, sync::atomic::AtomicBool};
+use std::num::NonZeroUsize;
 #[cfg(feature = "wgpu")]
-use wgpu::{Device, Queue, SurfaceTexture, TextureFormat, TextureView};
+use wgpu::{Device, Queue, TextureView};
 #[cfg(all(feature = "wgpu", feature = "wgpu-profiler"))]
 use wgpu_profiler::{GpuProfiler, GpuProfilerSettings};
 
@@ -341,10 +336,6 @@ pub struct Renderer {
     engine: WgpuEngine,
     resolver: Resolver,
     shaders: FullShaders,
-    blit: Option<BlitPipeline>,
-    #[cfg(feature = "debug_layers")]
-    debug: Option<debug::DebugRenderer>,
-    target: Option<TargetTexture>,
     #[cfg(feature = "wgpu-profiler")]
     #[doc(hidden)] // End-users of Vello should not have `wgpu-profiler` enabled.
     /// The profiler used with events for this renderer. This is *not* treated as public API.
@@ -382,10 +373,6 @@ pub struct RenderParams {
 #[cfg(feature = "wgpu")]
 /// Options which are set at renderer creation time, used in [`Renderer::new`].
 pub struct RendererOptions {
-    /// The format of the texture used for surfaces with this renderer/device
-    /// If None, the renderer cannot be used with surfaces
-    pub surface_format: Option<TextureFormat>,
-
     /// If true, run all stages up to fine rasterization on the CPU.
     // TODO: Consider evolving this so that the CPU stages can be configured dynamically via
     // `RenderParams`.
@@ -406,12 +393,12 @@ pub struct RendererOptions {
     pub num_init_threads: Option<NonZeroUsize>,
 }
 
-#[cfg(feature = "wgpu")]
-struct RenderResult {
-    bump: Option<BumpAllocators>,
-    #[cfg(feature = "debug_layers")]
-    captured: Option<render::CapturedBuffers>,
-}
+// #[cfg(feature = "wgpu")]
+// struct RenderResult {
+//     bump: Option<BumpAllocators>,
+//     #[cfg(feature = "debug_layers")]
+//     captured: Option<render::CapturedBuffers>,
+// }
 
 #[cfg(feature = "wgpu")]
 impl Renderer {
@@ -426,24 +413,12 @@ impl Renderer {
         let shaders = shaders::full_shaders(device, &mut engine, &options)?;
         #[cfg(not(target_arch = "wasm32"))]
         engine.build_shaders_if_needed(device, options.num_init_threads);
-        let blit = options
-            .surface_format
-            .map(|surface_format| BlitPipeline::new(device, surface_format, &mut engine))
-            .transpose()?;
-        #[cfg(feature = "debug_layers")]
-        let debug = options
-            .surface_format
-            .map(|surface_format| debug::DebugRenderer::new(device, surface_format, &mut engine));
 
         Ok(Self {
             options,
             engine,
             resolver: Resolver::new(),
             shaders,
-            blit,
-            #[cfg(feature = "debug_layers")]
-            debug,
-            target: None,
             #[cfg(feature = "wgpu-profiler")]
             profiler: GpuProfiler::new(GpuProfilerSettings {
                 ..Default::default()
@@ -484,91 +459,6 @@ impl Renderer {
         Ok(())
     }
 
-    /// Renders a scene to the target surface.
-    ///
-    /// This renders to an intermediate texture and then runs a render pass to blit to the
-    /// specified surface texture.
-    ///
-    /// The surface is assumed to be of the specified dimensions and have been configured with
-    /// the same format passed in the constructing [`RendererOptions`]' `surface_format`.
-    /// Panics if `surface_format` was `None`
-    pub fn render_to_surface(
-        &mut self,
-        device: &Device,
-        queue: &Queue,
-        scene: &Scene,
-        surface: &SurfaceTexture,
-        params: &RenderParams,
-    ) -> Result<()> {
-        let width = params.width;
-        let height = params.height;
-        let mut target = self
-            .target
-            .take()
-            .unwrap_or_else(|| TargetTexture::new(device, width, height));
-        // TODO: implement clever resizing semantics here to avoid thrashing the memory allocator
-        // during resize, specifically on metal.
-        if target.width != width || target.height != height {
-            target = TargetTexture::new(device, width, height);
-        }
-        self.render_to_texture(device, queue, scene, &target.view, params)?;
-        let blit = self
-            .blit
-            .as_ref()
-            .expect("renderer should have configured surface_format to use on a surface");
-        let mut recording = Recording::default();
-        let target_proxy = ImageProxy::new(
-            width,
-            height,
-            ImageFormat::from_wgpu(target.format)
-                .expect("`TargetTexture` always has a supported texture format"),
-        );
-        let surface_proxy = ImageProxy::new(
-            width,
-            height,
-            ImageFormat::from_wgpu(surface.texture.format())
-                .ok_or(Error::UnsupportedSurfaceFormat)?,
-        );
-        recording.draw(recording::DrawParams {
-            shader_id: blit.0,
-            instance_count: 1,
-            vertex_count: 6,
-            vertex_buffer: None,
-            resources: vec![ResourceProxy::Image(target_proxy)],
-            target: surface_proxy,
-            clear_color: Some([0., 0., 0., 0.]),
-        });
-
-        let surface_view = surface
-            .texture
-            .create_view(&wgpu::TextureViewDescriptor::default());
-        let external_resources = [
-            ExternalResource::Image(target_proxy, &target.view),
-            ExternalResource::Image(surface_proxy, &surface_view),
-        ];
-        self.engine.run_recording(
-            device,
-            queue,
-            &recording,
-            &external_resources,
-            "blit (render_to_surface)",
-            #[cfg(feature = "wgpu-profiler")]
-            &mut self.profiler,
-        )?;
-        self.target = Some(target);
-        #[cfg(feature = "wgpu-profiler")]
-        {
-            self.profiler.end_frame().unwrap();
-            if let Some(result) = self
-                .profiler
-                .process_finished_frame(queue.get_timestamp_period())
-            {
-                self.profile_result = Some(result);
-            }
-        }
-        Ok(())
-    }
-
     /// Overwrite `image` with `texture`.
     ///
     /// Whenever `image` would be rendered, instead the given `Texture` will be used.
@@ -597,398 +487,64 @@ impl Renderer {
         let mut engine = WgpuEngine::new(self.options.use_cpu);
         // We choose not to initialise these shaders in parallel, to ensure the error scope works correctly
         let shaders = shaders::full_shaders(device, &mut engine, &self.options)?;
-        let blit = self
-            .options
-            .surface_format
-            .map(|surface_format| BlitPipeline::new(device, surface_format, &mut engine))
-            .transpose()?;
-        #[cfg(feature = "debug_layers")]
-        let debug = self
-            .options
-            .surface_format
-            .map(|format| debug::DebugRenderer::new(device, format, &mut engine));
+
         let error = device.pop_error_scope().await;
         if let Some(error) = error {
             return Err(error.into());
         }
         self.engine = engine;
         self.shaders = shaders;
-        self.blit = blit;
-        #[cfg(feature = "debug_layers")]
-        {
-            self.debug = debug;
-        }
         Ok(())
     }
-
-    /// Renders a scene to the target texture using an async pipeline.
-    ///
-    /// Almost all consumers should prefer [`Self::render_to_texture`].
-    ///
-    /// The texture is assumed to be of the specified dimensions and have been created with
-    /// the [`wgpu::TextureFormat::Rgba8Unorm`] format and the [`wgpu::TextureUsages::STORAGE_BINDING`]
-    /// flag set.
-    ///
-    /// The return value is the value of the `BumpAllocators` in this rendering, which is currently used
-    /// for debug output.
-    ///
-    /// This return type is not stable, and will likely be changed when a more principled way to access
-    /// relevant statistics is implemented
-    #[cfg_attr(docsrs, doc(hidden))]
-    #[deprecated(
-        note = "render_to_texture should be preferred, as the _async version has no stability guarantees"
-    )]
-    pub async fn render_to_texture_async(
-        &mut self,
-        device: &Device,
-        queue: &Queue,
-        scene: &Scene,
-        texture: &TextureView,
-        params: &RenderParams,
-    ) -> Result<Option<BumpAllocators>> {
-        let result = self
-            .render_to_texture_async_internal(device, queue, scene, texture, params)
-            .await?;
-        #[cfg(feature = "debug_layers")]
-        {
-            // TODO: it would be better to improve buffer ownership tracking so that it's not
-            // necessary to submit a whole new Recording to free the captured buffers.
-            if let Some(captured) = result.captured {
-                let mut recording = Recording::default();
-                // TODO: this sucks. better to release everything in a helper
-                self.engine.free_download(captured.lines);
-                captured.release_buffers(&mut recording);
-                self.engine.run_recording(
-                    device,
-                    queue,
-                    &recording,
-                    &[],
-                    "free memory",
-                    #[cfg(feature = "wgpu-profiler")]
-                    &mut self.profiler,
-                )?;
-            }
-        }
-        Ok(result.bump)
-    }
-
-    async fn render_to_texture_async_internal(
-        &mut self,
-        device: &Device,
-        queue: &Queue,
-        scene: &Scene,
-        texture: &TextureView,
-        params: &RenderParams,
-    ) -> Result<RenderResult> {
-        let mut render = Render::new();
-        let encoding = scene.encoding();
-        // TODO: turn this on; the download feature interacts with CPU dispatch.
-        // Currently this is always enabled when the `debug_layers` setting is enabled as the bump
-        // counts are used for debug visualiation.
-        let robust = cfg!(feature = "debug_layers");
-        let recording = render.render_encoding_coarse(
-            encoding,
-            &mut self.resolver,
-            &self.shaders,
-            params,
-            robust,
-        );
-        let target = render.out_image();
-        let bump_buf = render.bump_buf();
-        #[cfg(feature = "debug_layers")]
-        let captured = render.take_captured_buffers();
-        self.engine.run_recording(
-            device,
-            queue,
-            &recording,
-            &[],
-            "t_async_coarse",
-            #[cfg(feature = "wgpu-profiler")]
-            &mut self.profiler,
-        )?;
-
-        let mut bump: Option<BumpAllocators> = None;
-        if let Some(bump_buf) = self.engine.get_download(bump_buf) {
-            let buf_slice = bump_buf.slice(..);
-            let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel();
-            buf_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap());
-            receiver.receive().await.expect("channel was closed")?;
-            let mapped = buf_slice.get_mapped_range();
-            bump = Some(bytemuck::pod_read_unaligned(&mapped));
-        }
-        // TODO: apply logic to determine whether we need to rerun coarse, and also
-        // allocate the blend stack as needed.
-        self.engine.free_download(bump_buf);
-        // Maybe clear to reuse allocation?
-        let mut recording = Recording::default();
-        render.record_fine(&self.shaders, &mut recording);
-        let external_resources = [ExternalResource::Image(target, texture)];
-        self.engine.run_recording(
-            device,
-            queue,
-            &recording,
-            &external_resources,
-            "t_async_fine",
-            #[cfg(feature = "wgpu-profiler")]
-            &mut self.profiler,
-        )?;
-        Ok(RenderResult {
-            bump,
-            #[cfg(feature = "debug_layers")]
-            captured,
-        })
-    }
-
-    /// This is a version of [`render_to_surface`](Self::render_to_surface) which uses an async pipeline
-    /// to allow improved debugging of Vello itself.
-    /// Most users should prefer `render_to_surface`.
-    ///
-    /// See [`render_to_texture_async`](Self::render_to_texture_async) for more details.
-    #[cfg_attr(docsrs, doc(hidden))]
-    #[deprecated(
-        note = "render_to_surface should be preferred, as the _async version has no stability guarantees"
-    )]
-    pub async fn render_to_surface_async(
-        &mut self,
-        device: &Device,
-        queue: &Queue,
-        scene: &Scene,
-        surface: &SurfaceTexture,
-        params: &RenderParams,
-        debug_layers: DebugLayers,
-    ) -> Result<Option<BumpAllocators>> {
-        if cfg!(not(feature = "debug_layers")) && !debug_layers.is_empty() {
-            static HAS_WARNED: AtomicBool = AtomicBool::new(false);
-            if !HAS_WARNED.swap(true, std::sync::atomic::Ordering::Release) {
-                log::warn!(
-                    "Requested debug layers {debug:?} but `debug_layers` feature is not enabled.",
-                    debug = debug_layers
-                );
-            }
-        }
-
-        let width = params.width;
-        let height = params.height;
-        let mut target = self
-            .target
-            .take()
-            .unwrap_or_else(|| TargetTexture::new(device, width, height));
-        // TODO: implement clever resizing semantics here to avoid thrashing the memory allocator
-        // during resize, specifically on metal.
-        if target.width != width || target.height != height {
-            target = TargetTexture::new(device, width, height);
-        }
-        let result = self
-            .render_to_texture_async_internal(device, queue, scene, &target.view, params)
-            .await?;
-        let blit = self
-            .blit
-            .as_ref()
-            .expect("renderer should have configured surface_format to use on a surface");
-        let mut recording = Recording::default();
-        let target_proxy = ImageProxy::new(
-            width,
-            height,
-            ImageFormat::from_wgpu(target.format)
-                .expect("`TargetTexture` always has a supported texture format"),
-        );
-        let surface_proxy = ImageProxy::new(
-            width,
-            height,
-            ImageFormat::from_wgpu(surface.texture.format())
-                .ok_or(Error::UnsupportedSurfaceFormat)?,
-        );
-        recording.draw(recording::DrawParams {
-            shader_id: blit.0,
-            instance_count: 1,
-            vertex_count: 6,
-            vertex_buffer: None,
-            resources: vec![ResourceProxy::Image(target_proxy)],
-            target: surface_proxy,
-            clear_color: Some([0., 0., 0., 0.]),
-        });
-
-        #[cfg(feature = "debug_layers")]
-        {
-            if let Some(captured) = result.captured {
-                let debug = self
-                    .debug
-                    .as_ref()
-                    .expect("renderer should have configured surface_format to use on a surface");
-                let bump = result.bump.as_ref().unwrap();
-                // TODO: We could avoid this download if `DebugLayers::VALIDATION` is unset.
-                let downloads = DebugDownloads::map(&self.engine, &captured, bump).await?;
-                debug.render(
-                    &mut recording,
-                    surface_proxy,
-                    &captured,
-                    bump,
-                    params,
-                    &downloads,
-                    debug_layers,
-                );
-
-                // TODO: this sucks. better to release everything in a helper
-                // TODO: it would be much better to have a way to safely destroy a buffer.
-                self.engine.free_download(captured.lines);
-                captured.release_buffers(&mut recording);
-            }
-        }
-
-        let surface_view = surface
-            .texture
-            .create_view(&wgpu::TextureViewDescriptor::default());
-        let external_resources = [
-            ExternalResource::Image(target_proxy, &target.view),
-            ExternalResource::Image(surface_proxy, &surface_view),
-        ];
-        self.engine.run_recording(
-            device,
-            queue,
-            &recording,
-            &external_resources,
-            "blit (render_to_surface_async)",
-            #[cfg(feature = "wgpu-profiler")]
-            &mut self.profiler,
-        )?;
-
-        #[cfg(feature = "wgpu-profiler")]
-        {
-            self.profiler.end_frame().unwrap();
-            if let Some(result) = self
-                .profiler
-                .process_finished_frame(queue.get_timestamp_period())
-            {
-                self.profile_result = Some(result);
-            }
-        }
-
-        self.target = Some(target);
-        Ok(result.bump)
-    }
-}
-
-#[cfg(feature = "wgpu")]
-struct TargetTexture {
-    view: TextureView,
-    width: u32,
-    height: u32,
-    format: TextureFormat,
-}
-
-#[cfg(feature = "wgpu")]
-impl TargetTexture {
-    fn new(device: &Device, width: u32, height: u32) -> Self {
-        let format = TextureFormat::Rgba8Unorm;
-        let texture = device.create_texture(&wgpu::TextureDescriptor {
-            label: None,
-            size: wgpu::Extent3d {
-                width,
-                height,
-                depth_or_array_layers: 1,
-            },
-            mip_level_count: 1,
-            sample_count: 1,
-            dimension: wgpu::TextureDimension::D2,
-            usage: wgpu::TextureUsages::STORAGE_BINDING | wgpu::TextureUsages::TEXTURE_BINDING,
-            format,
-            view_formats: &[],
-        });
-        let view = texture.create_view(&wgpu::TextureViewDescriptor::default());
-        Self {
-            view,
-            width,
-            height,
-            format,
-        }
-    }
-}
-
-#[cfg(feature = "wgpu")]
-struct BlitPipeline(ShaderId);
-
-#[cfg(feature = "wgpu")]
-impl BlitPipeline {
-    fn new(device: &Device, format: TextureFormat, engine: &mut WgpuEngine) -> Result<Self> {
-        const SHADERS: &str = r#"
-            @vertex
-            fn vs_main(@builtin(vertex_index) ix: u32) -> @builtin(position) vec4<f32> {
-                // Generate a full screen quad in normalized device coordinates
-                var vertex = vec2(-1.0, 1.0);
-                switch ix {
-                    case 1u: {
-                        vertex = vec2(-1.0, -1.0);
-                    }
-                    case 2u, 4u: {
-                        vertex = vec2(1.0, -1.0);
-                    }
-                    case 5u: {
-                        vertex = vec2(1.0, 1.0);
-                    }
-                    default: {}
-                }
-                return vec4(vertex, 0.0, 1.0);
-            }
-
-            @group(0) @binding(0)
-            var fine_output: texture_2d<f32>;
-
-            @fragment
-            fn fs_main(@builtin(position) pos: vec4<f32>) -> @location(0) vec4<f32> {
-                let rgba_sep = textureLoad(fine_output, vec2<i32>(pos.xy), 0);
-                return vec4(rgba_sep.rgb * rgba_sep.a, rgba_sep.a);
-            }
-        "#;
-        let module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
-            label: Some("blit shaders"),
-            source: wgpu::ShaderSource::Wgsl(SHADERS.into()),
-        });
-        let shader_id = engine.add_render_shader(
-            device,
-            "vello.blit",
-            &module,
-            "vs_main",
-            "fs_main",
-            wgpu::PrimitiveTopology::TriangleList,
-            wgpu::ColorTargetState {
-                format,
-                blend: None,
-                write_mask: wgpu::ColorWrites::ALL,
-            },
-            None,
-            &[(
-                BindType::ImageRead(
-                    ImageFormat::from_wgpu(format).ok_or(Error::UnsupportedSurfaceFormat)?,
-                ),
-                wgpu::ShaderStages::FRAGMENT,
-            )],
-        );
-        Ok(Self(shader_id))
-    }
-}
-
-#[cfg(all(feature = "debug_layers", feature = "wgpu"))]
-pub(crate) struct DebugDownloads<'a> {
-    pub lines: wgpu::BufferSlice<'a>,
 }
 
-#[cfg(all(feature = "debug_layers", feature = "wgpu"))]
-impl<'a> DebugDownloads<'a> {
-    pub async fn map(
-        engine: &'a WgpuEngine,
-        captured: &render::CapturedBuffers,
-        bump: &BumpAllocators,
-    ) -> Result<DebugDownloads<'a>> {
-        use vello_encoding::LineSoup;
-
-        let Some(lines_buf) = engine.get_download(captured.lines) else {
-            return Err(Error::DownloadError("linesoup"));
-        };
-
-        let lines = lines_buf.slice(..bump.lines as u64 * size_of::<LineSoup>() as u64);
-        let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel();
-        lines.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap());
-        receiver.receive().await.expect("channel was closed")?;
-        Ok(Self { lines })
-    }
-}
+// fn new(device: &Device, width: u32, height: u32) -> Self {
+//     let format = TextureFormat::Rgba8Unorm;
+//     let texture = device.create_texture(&wgpu::TextureDescriptor {
+//         label: None,
+//         size: wgpu::Extent3d {
+//             width,
+//             height,
+//             depth_or_array_layers: 1,
+//         },
+//         mip_level_count: 1,
+//         sample_count: 1,
+//         dimension: wgpu::TextureDimension::D2,
+//         usage: wgpu::TextureUsages::STORAGE_BINDING | wgpu::TextureUsages::TEXTURE_BINDING,
+//         format,
+//         view_formats: &[],
+//     });
+//     let view = texture.create_view(&wgpu::TextureViewDescriptor::default());
+//     Self {
+//         view,
+//         width,
+//         height,
+//         format,
+//     }
+// }
+
+// #[cfg(all(feature = "debug_layers", feature = "wgpu"))]
+// pub(crate) struct DebugDownloads<'a> {
+//     pub lines: wgpu::BufferSlice<'a>,
+// }
+
+// #[cfg(all(feature = "debug_layers", feature = "wgpu"))]
+// impl<'a> DebugDownloads<'a> {
+//     pub async fn map(
+//         engine: &'a WgpuEngine,
+//         captured: &render::CapturedBuffers,
+//         bump: &BumpAllocators,
+//     ) -> Result<DebugDownloads<'a>> {
+//         use vello_encoding::LineSoup;
+
+//         let Some(lines_buf) = engine.get_download(captured.lines) else {
+//             return Err(Error::DownloadError("linesoup"));
+//         };
+
+//         let lines = lines_buf.slice(..bump.lines as u64 * size_of::<LineSoup>() as u64);
+//         let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel();
+//         lines.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap());
+//         receiver.receive().await.expect("channel was closed")?;
+//         Ok(Self { lines })
+//     }
+// }
diff --git a/vello/src/util.rs b/vello/src/util.rs
index e14ad376..f6fcb764 100644
--- a/vello/src/util.rs
+++ b/vello/src/util.rs
@@ -6,8 +6,9 @@
 use std::future::Future;
 
 use wgpu::{
-    Adapter, Device, Instance, Limits, Queue, Surface, SurfaceConfiguration, SurfaceTarget,
-    TextureFormat,
+    util::{TextureBlitter, TextureBlitterBuilder},
+    Adapter, BlendComponent, BlendFactor, BlendOperation, BlendState, Device, Instance, Limits,
+    Queue, Surface, SurfaceConfiguration, SurfaceTarget, Texture, TextureFormat, TextureView,
 };
 
 use crate::{Error, Result};
@@ -89,14 +90,28 @@ impl RenderContext {
             height,
             present_mode,
             desired_maximum_frame_latency: 2,
-            alpha_mode: wgpu::CompositeAlphaMode::Auto,
+            alpha_mode: wgpu::CompositeAlphaMode::PreMultiplied,
             view_formats: vec![],
         };
+        let (target_texture, target_view) = create_targets(width, height, &device_handle.device);
+        let premul_blitter = TextureBlitterBuilder::new(&device_handle.device, format)
+            .blend_state(BlendState {
+                alpha: BlendComponent::REPLACE,
+                color: BlendComponent {
+                    src_factor: BlendFactor::SrcAlpha,
+                    dst_factor: BlendFactor::Zero,
+                    operation: BlendOperation::Add,
+                },
+            })
+            .build();
         let surface = RenderSurface {
             surface,
             config,
             dev_id,
             format,
+            target_texture,
+            target_view,
+            blitter: premul_blitter,
         };
         self.configure_surface(&surface);
         Ok(surface)
@@ -104,6 +119,11 @@ impl RenderContext {
 
     /// Resizes the surface to the new dimensions.
     pub fn resize_surface(&self, surface: &mut RenderSurface<'_>, width: u32, height: u32) {
+        let (texture, view) = create_targets(width, height, &self.devices[surface.dev_id].device);
+        // TODO: Use clever resize semantics to avoid thrashing the memory allocator during a resize;
+        // this is especially important on Metal.
+        surface.target_texture = texture;
+        surface.target_view = view;
         surface.config.width = width;
         surface.config.height = height;
         self.configure_surface(surface);
@@ -173,6 +193,29 @@ impl RenderContext {
     }
 }
 
+/// Creates the intermediate `Rgba8Unorm` target texture of the given size, plus a default view of it.
+///
+/// Vello renders with a compute shader, which in most cases can't bind the surface texture directly.
+/// Because of this, we render to this "intermediate" texture, and then blit it to the surface.
+fn create_targets(width: u32, height: u32, device: &Device) -> (Texture, TextureView) {
+    let target_texture = device.create_texture(&wgpu::TextureDescriptor {
+        label: None,
+        size: wgpu::Extent3d {
+            width,
+            height,
+            depth_or_array_layers: 1,
+        },
+        mip_level_count: 1,
+        sample_count: 1,
+        dimension: wgpu::TextureDimension::D2,
+        usage: wgpu::TextureUsages::STORAGE_BINDING | wgpu::TextureUsages::TEXTURE_BINDING,
+        format: TextureFormat::Rgba8Unorm,
+        view_formats: &[],
+    });
+    let target_view = target_texture.create_view(&wgpu::TextureViewDescriptor::default());
+    (target_texture, target_view)
+}
+
 impl DeviceHandle {
     /// Returns the adapter associated with the device.
     pub fn adapter(&self) -> &Adapter {
@@ -181,12 +224,28 @@ impl DeviceHandle {
 }
 
 /// Combination of surface and its configuration.
-#[derive(Debug)]
 pub struct RenderSurface<'s> {
     pub surface: Surface<'s>,
     pub config: SurfaceConfiguration,
     pub dev_id: usize,
     pub format: TextureFormat,
+    pub target_texture: Texture,
+    pub target_view: TextureView,
+    pub blitter: TextureBlitter,
+}
+
+impl std::fmt::Debug for RenderSurface<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("RenderSurface")
+            .field("surface", &self.surface)
+            .field("config", &self.config)
+            .field("dev_id", &self.dev_id)
+            .field("format", &self.format)
+            .field("target_texture", &self.target_texture)
+            .field("target_view", &self.target_view)
+            .field("blitter", &"(Not Debug)")
+            .finish()
+    }
 }
 
 struct NullWake;
diff --git a/vello/src/wgpu_engine.rs b/vello/src/wgpu_engine.rs
index 8f5e1c69..7f2d0813 100644
--- a/vello/src/wgpu_engine.rs
+++ b/vello/src/wgpu_engine.rs
@@ -299,6 +299,10 @@ impl WgpuEngine {
         })
     }
 
+    #[expect(
+        unused,
+        reason = "Used by debug layers, which have been temporarily removed."
+    )]
     pub fn add_render_shader(
         &mut self,
         device: &Device,
@@ -738,10 +742,18 @@ impl WgpuEngine {
         Ok(())
     }
 
+    #[expect(
+        unused,
+        reason = "Used by debug layers, which have been temporarily removed."
+    )]
     pub fn get_download(&self, buf: BufferProxy) -> Option<&Buffer> {
         self.downloads.get(&buf.id)
     }
 
+    #[expect(
+        unused,
+        reason = "Used by debug layers, which have been temporarily removed."
+    )]
     pub fn free_download(&mut self, buf: BufferProxy) {
         self.downloads.remove(&buf.id);
     }
diff --git a/vello_tests/src/lib.rs b/vello_tests/src/lib.rs
index 2ca78178..d7ce8ad0 100644
--- a/vello_tests/src/lib.rs
+++ b/vello_tests/src/lib.rs
@@ -115,7 +115,6 @@ pub async fn get_scene_image(params: &TestParams, scene: &Scene) -> Result<Image
     let mut renderer = vello::Renderer::new(
         device,
         RendererOptions {
-            surface_format: None,
             use_cpu: params.use_cpu,
             num_init_threads: NonZeroUsize::new(1),
             antialiasing_support: std::iter::once(params.anti_aliasing).collect(),