Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix horrible Shuffle bug in GPU_C_Codegen and add test. #8553

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 44 additions & 9 deletions src/CodeGen_GPU_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,12 @@ void CodeGen_GPU_C::visit(const Shuffle *op) {
internal_assert(op->vectors[0].type() == op->vectors[i].type());
}
internal_assert(op->type.lanes() == (int)op->indices.size());
const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
std::vector<int> vector_first_index;
int max_index = 0;
for (const Expr &v : op->vectors) {
vector_first_index.push_back(max_index);
max_index += v.type().lanes();
}
for (int i : op->indices) {
internal_assert(i >= 0 && i < max_index);
}
Expand All @@ -162,25 +167,55 @@ void CodeGen_GPU_C::visit(const Shuffle *op) {
std::string src = vecs[0];
std::ostringstream rhs;
std::string storage_name = unique_name('_');
if (vector_declaration_style == VectorDeclarationStyle::OpenCLSyntax) {
switch (vector_declaration_style) {
case VectorDeclarationStyle::OpenCLSyntax:
rhs << "(" << print_type(op->type) << ")(";
} else if (vector_declaration_style == VectorDeclarationStyle::WGSLSyntax) {
break;
case VectorDeclarationStyle::WGSLSyntax:
rhs << print_type(op->type) << "(";
} else {
break;
case VectorDeclarationStyle::CLikeSyntax:
rhs << "{";
break;
}
int elem_num = 0;
for (int i : op->indices) {
rhs << vecs[i];
if (i < (int)(op->indices.size() - 1)) {
size_t vector_idx;
int lane_idx = -1;
for (vector_idx = 0; vector_idx < op->vectors.size(); ++vector_idx) {
if (i >= vector_first_index[vector_idx] && i < vector_first_index[vector_idx] + op->vectors[vector_idx].type().lanes()) {
lane_idx = i - vector_first_index[vector_idx];
break;
}
}
internal_assert(lane_idx != -1) << "Shuffle lane index not found.";
rhs << vecs[vector_idx];
if (op->vectors[vector_idx].type().lanes() > 1) {
switch (vector_declaration_style) {
case VectorDeclarationStyle::OpenCLSyntax:
rhs << ".s" << lane_idx;
break;
case VectorDeclarationStyle::WGSLSyntax:
case VectorDeclarationStyle::CLikeSyntax:
rhs << "[" << lane_idx << "]";
break;
}
}
if (elem_num < (int)(op->indices.size() - 1)) {
rhs << ", ";
}
elem_num++;
}
if (vector_declaration_style == VectorDeclarationStyle::OpenCLSyntax) {
switch (vector_declaration_style) {
case VectorDeclarationStyle::OpenCLSyntax:
rhs << ")";
} else if (vector_declaration_style == VectorDeclarationStyle::WGSLSyntax) {
break;
case VectorDeclarationStyle::WGSLSyntax:
rhs << ")";
} else {
break;
case VectorDeclarationStyle::CLikeSyntax:
rhs << "}";
break;
}
print_assignment(op->type, rhs.str());
}
Expand Down
1 change: 1 addition & 0 deletions test/correctness/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ tests(GROUPS correctness
shared_self_references.cpp
shift_by_unsigned_negated.cpp
shifted_image.cpp
shuffle.cpp
side_effects.cpp
simd_op_check_arm.cpp
simd_op_check_hvx.cpp
Expand Down
66 changes: 66 additions & 0 deletions test/correctness/shuffle.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#include "Halide.h"
#include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) {
Target target = get_jit_target_from_environment();
if (target.has_feature(Target::Feature::Vulkan)) {
std::printf("[SKIP] Vulkan seems to be not working.\n");
return 0;
}

Var x{"x"}, y{"y"};

Func f0{"f0"}, f1{"f1"}, g{"g"};
f0(x, y) = x * (y + 1);
f1(x, y) = x * (y + 3);
Expr vec1 = Internal::Shuffle::make_concat({f0(x, 0), f0(x, 1), f0(x, 2), f0(x, 3)});
Expr vec2 = Internal::Shuffle::make_concat({f1(x, 4), f1(x, 5), f1(x, 6), f1(x, 7)});
std::vector<int> indices0;
std::vector<int> indices1;
if (!target.has_gpu_feature() || target.has_feature(Target::Feature::OpenCL) || target.has_feature(Target::Feature::CUDA)) {
indices0 = {3, 1, 6, 7, 2, 4, 0, 5};
indices1 = {1, 0, 3, 4, 7, 0, 5, 2};
} else {
indices0 = {3, 1, 6, 7};
indices1 = {1, 0, 3, 4};
}
Expr shuffle1 = Internal::Shuffle::make({vec1, vec2}, indices0);
Expr shuffle2 = Internal::Shuffle::make({vec1, vec2}, indices1);
Expr result = shuffle1 * shuffle2;

// Manual logarithmic reduce.
while (result.type().lanes() > 1) {
int half_lanes = result.type().lanes() / 2;
Expr half1 = Halide::Internal::Shuffle::make_slice(result, 0, 1, half_lanes);
Expr half2 = Halide::Internal::Shuffle::make_slice(result, half_lanes, 1, half_lanes);
result = half1 + half2;
}
g(x) = result;

f0.compute_root();
f1.compute_root();
if (target.has_gpu_feature()) {
Var xo, xi;
g.gpu_tile(x, xo, xi, 8).never_partition_all();
}

Buffer<int> im = g.realize({32}, target);
im.copy_to_host();
for (int x = 0; x < 32; x++) {
int exp = 0;
int halfway = int(indices0.size() / 2);
for (size_t i = 0; i < indices0.size(); ++i) {
int v0 = x * (indices0[i] + (indices0[i] >= halfway ? 3 : 1));
int v1 = x * (indices1[i] + (indices1[i] >= halfway ? 3 : 1));
exp += v0 * v1;
}
if (im(x) != exp) {
printf("im[%d] = %d (expected %d)\n", x, im(x), exp);
return 1;
}
}
printf("Success!\n");
return 0;
}