From 53f6869b19a52a18807d68b5cf38a313338fbcad Mon Sep 17 00:00:00 2001 From: Martijn Courteaux Date: Thu, 16 Jan 2025 01:24:46 +0100 Subject: [PATCH 1/2] Fix Shuffle-bug codegen for GPU_Codegen_C and add test. --- src/CodeGen_GPU_Dev.cpp | 53 +++++++++++++++++++++++----- test/correctness/CMakeLists.txt | 1 + test/correctness/shuffle.cpp | 62 +++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 9 deletions(-) create mode 100644 test/correctness/shuffle.cpp diff --git a/src/CodeGen_GPU_Dev.cpp b/src/CodeGen_GPU_Dev.cpp index 07148a508144..ab27e1c41c57 100644 --- a/src/CodeGen_GPU_Dev.cpp +++ b/src/CodeGen_GPU_Dev.cpp @@ -149,7 +149,12 @@ void CodeGen_GPU_C::visit(const Shuffle *op) { internal_assert(op->vectors[0].type() == op->vectors[i].type()); } internal_assert(op->type.lanes() == (int)op->indices.size()); - const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size()); + std::vector vector_first_index; + int max_index = 0; + for (const Expr &v : op->vectors) { + vector_first_index.push_back(max_index); + max_index += v.type().lanes(); + } for (int i : op->indices) { internal_assert(i >= 0 && i < max_index); } @@ -162,25 +167,55 @@ void CodeGen_GPU_C::visit(const Shuffle *op) { std::string src = vecs[0]; std::ostringstream rhs; std::string storage_name = unique_name('_'); - if (vector_declaration_style == VectorDeclarationStyle::OpenCLSyntax) { + switch (vector_declaration_style) { + case VectorDeclarationStyle::OpenCLSyntax: rhs << "(" << print_type(op->type) << ")("; - } else if (vector_declaration_style == VectorDeclarationStyle::WGSLSyntax) { + break; + case VectorDeclarationStyle::WGSLSyntax: rhs << print_type(op->type) << "("; - } else { + break; + case VectorDeclarationStyle::CLikeSyntax: rhs << "{"; + break; } + int elem_num = 0; for (int i : op->indices) { - rhs << vecs[i]; - if (i < (int)(op->indices.size() - 1)) { + size_t vector_idx; + int lane_idx = -1; + for (vector_idx = 0; vector_idx < op->vectors.size(); ++vector_idx) { + if (i >= vector_first_index[vector_idx] && i < vector_first_index[vector_idx] + op->vectors[vector_idx].type().lanes()) { + lane_idx = i - vector_first_index[vector_idx]; + break; + } + } + internal_assert(lane_idx != -1) << "Shuffle lane index not found."; + rhs << vecs[vector_idx]; + if (op->vectors[vector_idx].type().lanes() > 1) { + switch (vector_declaration_style) { + case VectorDeclarationStyle::OpenCLSyntax: + rhs << ".s" << lane_idx; + break; + case VectorDeclarationStyle::WGSLSyntax: + case VectorDeclarationStyle::CLikeSyntax: + rhs << "[" << lane_idx << "]"; + break; + } + } + if (elem_num < (int)(op->indices.size() - 1)) { rhs << ", "; } + elem_num++; } - if (vector_declaration_style == VectorDeclarationStyle::OpenCLSyntax) { + switch (vector_declaration_style) { + case VectorDeclarationStyle::OpenCLSyntax: rhs << ")"; - } else if (vector_declaration_style == VectorDeclarationStyle::WGSLSyntax) { + break; + case VectorDeclarationStyle::WGSLSyntax: rhs << ")"; - } else { + break; + case VectorDeclarationStyle::CLikeSyntax: rhs << "}"; + break; } print_assignment(op->type, rhs.str()); } diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index d3ca7ead1586..dbaae8c3a8eb 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -276,6 +276,7 @@ tests(GROUPS correctness shared_self_references.cpp shift_by_unsigned_negated.cpp shifted_image.cpp + shuffle.cpp side_effects.cpp simd_op_check_arm.cpp simd_op_check_hvx.cpp diff --git a/test/correctness/shuffle.cpp b/test/correctness/shuffle.cpp new file mode 100644 index 000000000000..6381ddacb929 --- /dev/null +++ b/test/correctness/shuffle.cpp @@ -0,0 +1,62 @@ +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + Target target = get_jit_target_from_environment(); + if (target.has_feature(Target::Feature::Vulkan)) { + std::printf("[SKIP] Vulkan seems to be not working.\n"); + return 0; + } + + Var x{"x"}, y{"y"}; + + Func f0{"f0"}, f1{"f1"}, g{"g"}; + f0(x, y) = x * (y + 1); + f1(x, y) = x * (y + 3); + Expr vec1 = Internal::Shuffle::make_concat({f0(x, 0), f0(x, 1), f0(x, 2), f0(x, 3)}); + Expr vec2 = Internal::Shuffle::make_concat({f1(x, 4), f1(x, 5), f1(x, 6), f1(x, 7)}); + std::vector indices0 = {3, 1, 6, 7, 2, 4, 0, 5}; + std::vector indices1 = {1, 0, 3, 4, 7, 0, 5, 2}; + Expr shuffle1 = Internal::Shuffle::make({vec1, vec2}, indices0); + Expr shuffle2 = Internal::Shuffle::make({vec1, vec2}, indices1); + Expr result = shuffle1 * shuffle2; + + // Manual logarithmic reduce. + Expr a_half1 = Halide::Internal::Shuffle::make_slice(result, 0, 1, 4); + Expr a_half2 = Halide::Internal::Shuffle::make_slice(result, 4, 1, 4); + Expr a_sumhalves = a_half1 + a_half2; + Expr b_half1 = Halide::Internal::Shuffle::make_slice(a_sumhalves, 0, 1, 2); + Expr b_half2 = Halide::Internal::Shuffle::make_slice(a_sumhalves, 2, 1, 2); + Expr b_sumhalves = b_half1 + b_half2; + g(x) = Internal::Shuffle::make_extract_element(b_sumhalves, 0) + + Internal::Shuffle::make_extract_element(b_sumhalves, 1); + + f0.compute_root(); + f1.compute_root(); + if (target.has_gpu_feature()) { + Var xo, xi; + g.gpu_tile(x, xo, xi, 8).never_partition_all(); + } + + Buffer im = g.realize({32}, target); + im.copy_to_host(); + for (int x = 0; x < 32; x++) { + int fv0[8], fv1[8]; + for (int i = 0; i < 8; ++i) { + fv0[i] = x * (indices0[i] + (indices0[i] >= 4 ? 3 : 1)); + fv1[i] = x * (indices1[i] + (indices1[i] >= 4 ? 3 : 1)); + } + int exp = 0; + for (int i = 0; i < 8; ++i) { + exp += fv0[i] * fv1[i]; + } + if (im(x) != exp) { + printf("im[%d] = %d (expected %d)\n", x, im(x), exp); + return 1; + } + } + printf("Success!\n"); + return 0; +} From eae9647451c8c2efc3ebf47f35e880d46b94b54f Mon Sep 17 00:00:00 2001 From: Martijn Courteaux Date: Thu, 16 Jan 2025 09:55:23 +0100 Subject: [PATCH 2/2] Improve shuffle test to support 4-wide vectors. --- test/correctness/shuffle.cpp | 38 ++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/test/correctness/shuffle.cpp b/test/correctness/shuffle.cpp index 6381ddacb929..887565093d55 100644 --- a/test/correctness/shuffle.cpp +++ b/test/correctness/shuffle.cpp @@ -17,21 +17,27 @@ int main(int argc, char **argv) { f1(x, y) = x * (y + 3); Expr vec1 = Internal::Shuffle::make_concat({f0(x, 0), f0(x, 1), f0(x, 2), f0(x, 3)}); Expr vec2 = Internal::Shuffle::make_concat({f1(x, 4), f1(x, 5), f1(x, 6), f1(x, 7)}); - std::vector indices0 = {3, 1, 6, 7, 2, 4, 0, 5}; - std::vector indices1 = {1, 0, 3, 4, 7, 0, 5, 2}; + std::vector indices0; + std::vector indices1; + if (!target.has_gpu_feature() || target.has_feature(Target::Feature::OpenCL) || target.has_feature(Target::Feature::CUDA)) { + indices0 = {3, 1, 6, 7, 2, 4, 0, 5}; + indices1 = {1, 0, 3, 4, 7, 0, 5, 2}; + } else { + indices0 = {3, 1, 6, 7}; + indices1 = {1, 0, 3, 4}; + } Expr shuffle1 = Internal::Shuffle::make({vec1, vec2}, indices0); Expr shuffle2 = Internal::Shuffle::make({vec1, vec2}, indices1); Expr result = shuffle1 * shuffle2; // Manual logarithmic reduce. - Expr a_half1 = Halide::Internal::Shuffle::make_slice(result, 0, 1, 4); - Expr a_half2 = Halide::Internal::Shuffle::make_slice(result, 4, 1, 4); - Expr a_sumhalves = a_half1 + a_half2; - Expr b_half1 = Halide::Internal::Shuffle::make_slice(a_sumhalves, 0, 1, 2); - Expr b_half2 = Halide::Internal::Shuffle::make_slice(a_sumhalves, 2, 1, 2); - Expr b_sumhalves = b_half1 + b_half2; - g(x) = Internal::Shuffle::make_extract_element(b_sumhalves, 0) + - Internal::Shuffle::make_extract_element(b_sumhalves, 1); + while (result.type().lanes() > 1) { + int half_lanes = result.type().lanes() / 2; + Expr half1 = Halide::Internal::Shuffle::make_slice(result, 0, 1, half_lanes); + Expr half2 = Halide::Internal::Shuffle::make_slice(result, half_lanes, 1, half_lanes); + result = half1 + half2; + } + g(x) = result; f0.compute_root(); f1.compute_root(); @@ -43,14 +49,12 @@ int main(int argc, char **argv) { Buffer im = g.realize({32}, target); im.copy_to_host(); for (int x = 0; x < 32; x++) { - int fv0[8], fv1[8]; - for (int i = 0; i < 8; ++i) { - fv0[i] = x * (indices0[i] + (indices0[i] >= 4 ? 3 : 1)); - fv1[i] = x * (indices1[i] + (indices1[i] >= 4 ? 3 : 1)); - } int exp = 0; - for (int i = 0; i < 8; ++i) { - exp += fv0[i] * fv1[i]; + int halfway = int(indices0.size() / 2); + for (size_t i = 0; i < indices0.size(); ++i) { + int v0 = x * (indices0[i] + (indices0[i] >= halfway ? 3 : 1)); + int v1 = x * (indices1[i] + (indices1[i] >= halfway ? 3 : 1)); + exp += v0 * v1; } if (im(x) != exp) { printf("im[%d] = %d (expected %d)\n", x, im(x), exp);