Merge branch 'main' into xuanda/hlpipe-update
steven-johnson committed Nov 28, 2023
2 parents 1b7760e + 9ce5fd6 commit dde9213
Showing 27 changed files with 854 additions and 298 deletions.
26 changes: 22 additions & 4 deletions README_webgpu.md
@@ -36,7 +36,7 @@ When invoking `emcc` to link Halide-generated objects, include these flags:
`-s USE_WEBGPU=1 -s ASYNCIFY`.
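For example, a link step might look like this (the object and output file names are placeholders, not part of this change):

```shell
# Link Halide-generated objects into a WASM module with WebGPU enabled.
emcc pipeline.o main.o -o app.js -s USE_WEBGPU=1 -s ASYNCIFY
```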

Tests that use AOT compilation can be run using a native WebGPU implementation
that has Node.js bindings, such as [Dawn](dawn.googlesource.com/dawn/).
that has Node.js bindings, such as [Dawn](https://dawn.googlesource.com/dawn/).
You must set an environment variable named `HL_WEBGPU_NODE_BINDINGS` that
has an absolute path to the bindings to run these tests, e.g. `HL_WEBGPU_NODE_BINDINGS=/path/to/dawn.node`.

@@ -47,13 +47,18 @@ JIT compilation is not supported when using WebGPU with WASM.

## Running natively: `HL_TARGET=host-webgpu`

> _Tested with top-of-tree Dawn as of 2023-03-14._
> _Tested with top-of-tree Dawn as of 2023-11-27 [commit b5d38fc7dc2a20081312c95e379c4a918df8b7d4]._
For testing purposes, Halide can also target native WebGPU libraries, such as
[Dawn](dawn.googlesource.com/dawn/) or [wgpu](github.com/gfx-rs/wgpu).
[Dawn](https://dawn.googlesource.com/dawn/) or
[wgpu](https://github.com/gfx-rs/wgpu).
This is currently the only path that can run the JIT correctness tests.
See [below](#setting-up-dawn) for instructions on building Dawn.

> Note that as of 2023-11-27, wgpu is not supported due to
> [lacking `override` support for WGSL](https://github.com/gfx-rs/wgpu/issues/1762)
> which we require in order to set GPU block sizes.
When targeting WebGPU with a native target, Halide defaults to looking for a
build of Dawn (with several common names and suffixes); you can override this
by setting the `HL_WEBGPU_NATIVE_LIB` environment variable to the absolute path
@@ -71,7 +76,7 @@ will be selected based on the Halide target specified.

Building Dawn's Node.js bindings currently requires using CMake.

First, [install `depot_tools`](commondatastorage.googleapis.com/chrome-infra-docs/flat/depot_tools/docs/html/depot_tools_tutorial.html#_setting_up) and add it to the
First, [install `depot_tools`](https://commondatastorage.googleapis.com/chrome-infra-docs/flat/depot_tools/docs/html/depot_tools_tutorial.html#_setting_up) and add it to the
`PATH` environment variable.
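A minimal sketch of that step (the clone location is arbitrary):

```shell
# Fetch depot_tools and add it to PATH for the current shell session.
git clone https://chromium.googlesource.com/chromium/tools/depot_tools.git
export PATH="$PWD/depot_tools:$PATH"
```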

Next, get Dawn and its dependencies:
@@ -108,3 +113,16 @@ This will produce the following artifacts:

These paths can then be used for the `HL_WEBGPU_NODE_BINDINGS` and
`HL_WEBGPU_NATIVE_LIB` environment variables when using Halide.
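For example (the paths and file names below are illustrative; substitute the actual artifacts from your Dawn build):

```shell
export HL_WEBGPU_NODE_BINDINGS=/path/to/dawn/out/Release/dawn.node
export HL_WEBGPU_NATIVE_LIB=/path/to/dawn/out/Release/libwebgpu_dawn.so
```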

## Updating mini_webgpu.h

The recommended method for updating `mini_webgpu.h` is to copy the
`gen/include/dawn/webgpu.h` file from the Dawn build directory, then:
- Restore the `// clang-format {off,on}` lines.
- Comment out the `#include <std*>` lines.
- Remove the `void` parameter from the `WGPUProc` declaration.

This guarantees a version of the WebGPU header that is compatible with Dawn.
When the native API eventually stabilizes, it should be possible to obtain a
header from the `webgpu-native` GitHub organization that will be compatible
with Dawn, wgpu, and Emscripten.
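A rough sketch of those edits as shell commands (the destination path and the `sed` patterns are assumptions about the header's contents; verify the result by hand):

```shell
# Assumes mini_webgpu.h lives in src/runtime/; point DAWN_BUILD_DIR at your Dawn build tree.
DAWN_BUILD_DIR=/path/to/dawn/out/Release
cp "$DAWN_BUILD_DIR/gen/include/dawn/webgpu.h" src/runtime/mini_webgpu.h
# Comment out the standard-library includes.
sed -i 's|^#include <std|// #include <std|' src/runtime/mini_webgpu.h
# Drop the void parameter from the WGPUProc declaration.
sed -i 's|(\*WGPUProc)(void)|(*WGPUProc)()|' src/runtime/mini_webgpu.h
# The `// clang-format off` / `// clang-format on` lines still need to be restored by hand.
```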
1 change: 1 addition & 0 deletions apps/interpolate/Makefile
@@ -1,6 +1,7 @@
include ../support/Makefile.inc

.PHONY: build clean test
.SECONDARY:

build: $(BIN)/$(HL_TARGET)/filter

12 changes: 11 additions & 1 deletion apps/interpolate/interpolate_generator.cpp
@@ -79,6 +79,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
Var yo, yi, xo, xi, ci, xii, yii;
if (get_target().has_gpu_feature()) {
normalize
.never_partition_all()
.bound(x, 0, input.width())
.bound(y, 0, input.height())
.bound(c, 0, 3)
@@ -94,6 +95,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
for (int l = 1; l < levels; l++) {
downsampled[l]
.compute_root()
.never_partition_all()
.reorder(c, x, y)
.unroll(c)
.gpu_tile(x, y, xi, yi, 16, 16);
@@ -102,6 +104,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
for (int l = 3; l < levels; l += 2) {
interpolated[l]
.compute_root()
.never_partition_all()
.reorder(c, x, y)
.tile(x, y, xi, yi, 32, 32, TailStrategy::RoundUp)
.tile(xi, yi, xii, yii, 2, 2)
@@ -114,6 +117,7 @@ class Interpolate : public Halide::Generator<Interpolate> {

upsampledx[1]
.compute_at(normalize, x)
.never_partition_all()
.reorder(c, x, y)
.tile(x, y, xi, yi, 2, 1)
.unroll(xi)
@@ -123,6 +127,7 @@ class Interpolate : public Halide::Generator<Interpolate> {

interpolated[1]
.compute_at(normalize, x)
.never_partition_all()
.reorder(c, x, y)
.tile(x, y, xi, yi, 2, 2)
.unroll(xi)
@@ -132,6 +137,7 @@ class Interpolate : public Halide::Generator<Interpolate> {

interpolated[2]
.compute_at(normalize, x)
.never_partition_all()
.reorder(c, x, y)
.unroll(c)
.gpu_threads(x, y);
@@ -148,6 +154,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
// the local_laplacian app.
downsampled[l]
.compute_root()
.never_partition(x)
.reorder(x, c, y)
.split(y, yo, yi, 8)
.parallel(yo)
@@ -165,12 +172,14 @@ class Interpolate : public Halide::Generator<Interpolate> {
.compute_at(downsampled[1], yi)
.reorder(c, x, y)
.unroll(c)
.vectorize(x, vec);
.vectorize(x, vec)
.never_partition(y);

normalize
.bound(x, 0, input.width())
.bound(y, 0, input.height())
.bound(c, 0, 3)
.never_partition(y)
.split(x, xo, xi, vec)
.split(y, yo, yi, 32)
.reorder(xi, c, xo, yi, yo)
@@ -182,6 +191,7 @@ class Interpolate : public Halide::Generator<Interpolate> {
interpolated[l]
.store_at(normalize, yo)
.compute_at(normalize, yi)
.never_partition_all()
.vectorize(x, vec);
}

1 change: 1 addition & 0 deletions apps/local_laplacian/Makefile
@@ -1,6 +1,7 @@
include ../support/Makefile.inc

.PHONY: build clean test
.SECONDARY:

build: $(BIN)/$(HL_TARGET)/process

52 changes: 41 additions & 11 deletions apps/local_laplacian/local_laplacian_generator.cpp
@@ -81,10 +81,10 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
// Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
Func color;
float eps = 0.01f;
color(x, y, c) = outGPyramid[0](x, y) * (floating(x, y, c) + eps) / (gray(x, y) + eps);
color(x, y, c) = input(x, y, c) * (outGPyramid[0](x, y) + eps) / (gray(x, y) + eps);

// Convert back to 16-bit
output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 1.0f) * 65535.0f);
output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 65535.0f));

/* ESTIMATES */
// (This can be useful in conjunction with RunGen and benchmarks as well
@@ -102,21 +102,36 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
// Nothing.
} else if (get_target().has_gpu_feature()) {
// GPU schedule.
// 3.19ms on an RTX 2060.
// 2.9ms on an RTX 2060.

// All loop partitioning disabled, which has no effect on runtime,
// but saves 15% compile time and 45% ptx shader code size.
remap.compute_root();
Var xi, yi;
output.compute_root().gpu_tile(x, y, xi, yi, 16, 8);
output.compute_root()
.never_partition_all()
.gpu_tile(x, y, xi, yi, 16, 8);
for (int j = 0; j < J; j++) {
int blockw = 16, blockh = 8;
if (j > 3) {
blockw = 2;
blockh = 2;
}
if (j > 0) {
inGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh);
gPyramid[j].compute_root().reorder(k, x, y).gpu_tile(x, y, xi, yi, blockw, blockh);
inGPyramid[j]
.compute_root()
.never_partition_all()
.gpu_tile(x, y, xi, yi, blockw, blockh);
gPyramid[j]
.compute_root()
.reorder(k, x, y)
.never_partition_all()
.gpu_tile(x, y, xi, yi, blockw, blockh);
}
outGPyramid[j].compute_root().gpu_tile(x, y, xi, yi, blockw, blockh);
outGPyramid[j]
.compute_root()
.never_partition_all()
.gpu_tile(x, y, xi, yi, blockw, blockh);
}
} else {
// CPU schedule.
@@ -131,8 +146,16 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {

remap.compute_root();
Var yo;
output.reorder(c, x, y).split(y, yo, y, 64).parallel(yo).vectorize(x, 8);
gray.compute_root().parallel(y, 32).vectorize(x, 8);
output
.reorder(c, x, y)
.split(y, yo, y, 64)
.parallel(yo)
.vectorize(x, 8);
gray
.compute_root()
.never_partition(y)
.parallel(y, 32)
.vectorize(x, 8);
for (int j = 1; j < 5; j++) {
inGPyramid[j]
.compute_root()
@@ -148,12 +171,19 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
.store_at(output, yo)
.compute_at(output, y)
.fold_storage(y, 4)
.vectorize(x, 8);
.vectorize(x, 8, TailStrategy::RoundUp);
if (j > 1) {
// Turn off loop partitioning at higher pyramid levels. This
// shaves about 3% off code size and compile time without
// affecting performance.
inGPyramid[j].never_partition_all();
gPyramid[j].never_partition_all();
}
}
outGPyramid[0]
.compute_at(output, y)
.hoist_storage(output, yo)
.vectorize(x, 8);
.vectorize(x, 8, TailStrategy::RoundUp);
for (int j = 5; j < J; j++) {
inGPyramid[j].compute_root();
gPyramid[j].compute_root().parallel(k);
29 changes: 22 additions & 7 deletions src/BoundaryConditions.cpp
@@ -66,11 +66,11 @@ Func constant_exterior(const Func &source, const Tuple &value,
if (value.as_vector().size() > 1) {
std::vector<Expr> def;
for (size_t i = 0; i < value.as_vector().size(); i++) {
def.push_back(select(out_of_bounds, value[i], repeat_edge(source, bounds)(args)[i]));
def.push_back(select(out_of_bounds, value[i], likely(repeat_edge(source, bounds)(args)[i])));
}
bounded(args) = Tuple(def);
} else {
bounded(args) = select(out_of_bounds, value[0], repeat_edge(source, bounds)(args));
bounded(args) = select(out_of_bounds, value[0], likely(repeat_edge(source, bounds)(args)));
}

return bounded;
@@ -99,10 +99,25 @@ Func repeat_image(const Func &source,
Expr coord = arg_var - min; // Enforce zero origin.
coord = coord % extent; // Range is 0 to w-1
coord = coord + min; // Restore correct min

coord = select(arg_var < min || arg_var >= min + extent, coord,
clamp(likely(arg_var), min, min + extent - 1));

likely(clamp(likely(arg_var), min, min + extent - 1)));

// In the line above, we want loop partitioning to both cause the
// clamp to go away, and also cause the select to go away. For loop
// partitioning to make one of these constructs go away we need one
// of two things to be true:
//
// 1) One arg has a likely intrinsic buried somewhere within it, and
// the other arg doesn't.
// 2) Both args have likely intrinsics, but in one of the args it is
// not within any inner min/max/select node. This is called an
// 'uncaptured' likely.
//
// The issue with this boundary condition is that the true branch of
// the select (coord) may well have a likely within it somewhere
// introduced by a loop tail strategy, so condition 1 doesn't
// hold. To be more robust, we make condition 2 hold, by introducing
// an uncaptured likely to the false branch.
actuals.push_back(coord);
} else if (!min.defined() && !extent.defined()) {
actuals.push_back(arg_var);
@@ -143,7 +158,7 @@ Func mirror_image(const Func &source,
coord = coord + min; // Restore correct min
coord = clamp(coord, min, min + extent - 1);
coord = select(arg_var < min || arg_var >= min + extent, coord,
clamp(likely(arg_var), min, min + extent - 1));
likely(clamp(likely(arg_var), min, min + extent - 1)));
actuals.push_back(coord);
} else if (!min.defined() && !extent.defined()) {
actuals.push_back(arg_var);
@@ -188,7 +203,7 @@ Func mirror_interior(const Func &source,

// The boundary condition probably doesn't apply
coord = select(arg_var < min || arg_var >= min + extent, coord,
clamp(likely(arg_var), min, min + extent - 1));
likely(clamp(likely(arg_var), min, min + extent - 1)));

actuals.push_back(coord);
} else if (!min.defined() && !extent.defined()) {
3 changes: 1 addition & 2 deletions src/CodeGen_ARM.cpp
@@ -1354,8 +1354,7 @@ void CodeGen_ARM::visit(const Call *op) {
if (value) {
return;
}
} else if (target.os != Target::Linux) {
// Furthermore, roundevenf isn't always in the standard library on arm-32
} else {
value = codegen(lower_round_to_nearest_ties_to_even(op->args[0]));
return;
}
2 changes: 1 addition & 1 deletion src/CodeGen_LLVM.cpp
@@ -1122,7 +1122,7 @@ void CodeGen_LLVM::optimize_module() {
PipelineTuningOptions pto;
pto.LoopInterleaving = do_loop_opt;
pto.LoopVectorization = do_loop_opt;
pto.SLPVectorization = use_slp_vectorization();
pto.SLPVectorization = true;
pto.LoopUnrolling = do_loop_opt;
// Clear ScEv info for all loops. Certain Halide applications spend a very
// long time compiling in forgetLoop, and prefer to forget everything
12 changes: 0 additions & 12 deletions src/CodeGen_LLVM.h
@@ -127,13 +127,6 @@ class CodeGen_LLVM : public IRVisitor {
virtual bool use_pic() const;
// @}

/** Should SLP vectorization be turned on in LLVM? SLP vectorization has no
* analogue in the Halide scheduling model so this is decided heuristically
* depending on the target. */
virtual bool use_slp_vectorization() const {
return true;
}

/** Should indexing math be promoted to 64-bit on platforms with
* 64-bit pointers? */
virtual bool promote_indices() const {
@@ -402,11 +395,6 @@ class CodeGen_LLVM : public IRVisitor {
void visit(const Realize *) override;
// @}

/** If we have to bail out of a pipeline midway, this should
* inject the appropriate target-specific cleanup code. */
virtual void prepare_for_early_exit() {
}

/** Get the llvm type equivalent to the given halide type in the
* current context. */
virtual llvm::Type *llvm_type_of(const Type &) const;
