Fix hip/shared NonTensor #1724

Merged: 1 commit, Jan 13, 2025
backends/hip-shared/ceed-hip-shared-basis.c (10 changes: 6 additions & 4 deletions)
@@ -533,7 +533,7 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
CeedInt thread = CeedIntMax(Q, P);
-void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v};
+void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v};

{
CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
@@ -554,7 +554,7 @@ static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_ad
CeedInt block_size = data->block_sizes[2];

CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
-CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q));
+CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};

{
@@ -723,8 +723,10 @@ int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num
const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-nontensor.h>\n";

CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
-CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D",
-CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp));
+CeedCallBackend(ComputeBasisThreadBlockSizes(dim, num_nodes, num_qpts, num_comp, data->block_sizes));
+CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 6, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D",
+CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_INTERP_BLOCK_SIZE",
+data->block_sizes[0]));
CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
include/ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h (21 changes: 11 additions & 10 deletions)
@@ -44,7 +44,8 @@ inline __device__ void ContractTranspose1d(SharedData_Hip &data, const CeedScala
// Interpolate to quadrature points
//------------------------------------------------------------------------------
template <int NUM_COMP, int P, int Q>
-inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
+inline __device__ void InterpNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+CeedScalar *__restrict__ r_V) {
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
}
@@ -54,8 +55,8 @@ inline __device__ void Interp1d(SharedData_Hip &data, const CeedScalar *__restri
// Interpolate transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int P, int Q>
-inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
-CeedScalar *__restrict__ r_V) {
+inline __device__ void InterpTransposeNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+CeedScalar *__restrict__ r_V) {
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
r_V[comp] = 0.0;
ContractTranspose1d<NUM_COMP, P, Q>(data, &r_U[comp], c_B, &r_V[comp]);
@@ -65,8 +66,8 @@ inline __device__ void InterpTranspose1d(SharedData_Hip &data, const CeedScalar
//------------------------------------------------------------------------------
// Derivatives at quadrature points
//------------------------------------------------------------------------------
-template <int NUM_COMP, int P, int Q>
-inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int DIM, int P, int Q>
+inline __device__ void GradNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
for (CeedInt dim = 0; dim < DIM; dim++) {
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
Contract1d<NUM_COMP, P, Q>(data, &r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]);
@@ -77,9 +78,9 @@ inline __device__ void Grad1d(SharedData_Hip &data, const CeedScalar *__restrict
//------------------------------------------------------------------------------
// Derivatives transpose
//------------------------------------------------------------------------------
-template <int NUM_COMP, int P, int Q>
-inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
-CeedScalar *__restrict__ r_V) {
+template <int NUM_COMP, int DIM, int P, int Q>
+inline __device__ void GradTransposeNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
+CeedScalar *__restrict__ r_V) {
for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_V[comp] = 0.0;
for (CeedInt dim = 0; dim < DIM; dim++) {
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -92,6 +93,6 @@ inline __device__ void GradTranspose1d(SharedData_Hip &data, const CeedScalar *_
// Quadrature weights
//------------------------------------------------------------------------------
template <int Q>
-inline __device__ void Weight1d(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
-*w = (data.t_id_x < Q) ? q_weight_1d[data.t_id_x] : 0.0;
+inline __device__ void WeightNonTensor(SharedData_Hip &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) {
+*w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0;
}
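
Note: the renamed templates drop the tensor-style "1d" suffix, since for non-tensor bases P and Q are total node and quadrature-point counts, and GradNonTensor/GradTransposeNonTensor now take DIM explicitly, reading the gradient matrix as DIM contiguous P x Q blocks (&c_G[dim * P * Q]) and writing each dimension's result at r_V[comp + dim * NUM_COMP]. A standalone sketch of that index layout, with example sizes that are not taken from this PR:

#include <stdio.h>

// Standalone illustration (not libCEED code) of the layout GradNonTensor assumes above.
int main(void) {
  const int NUM_COMP = 2, DIM = 3, P = 4, Q = 6;  // example sizes only

  for (int dim = 0; dim < DIM; dim++) {
    // Each dimension's gradient block starts dim * P * Q entries into c_G.
    printf("dim %d: c_G block offset %d, writes", dim, dim * P * Q);
    // Each component's result for this dimension lands at comp + dim * NUM_COMP in r_V.
    for (int comp = 0; comp < NUM_COMP; comp++) printf(" r_V[%d]", comp + dim * NUM_COMP);
    printf("\n");
  }
  return 0;
}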
include/ceed/jit-source/hip/hip-shared-basis-nontensor.h (27 changes: 13 additions & 14 deletions)
@@ -99,7 +99,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
//------------------------------------------------------------------------------
// Grad kernels
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
extern __shared__ CeedScalar slice[];

@@ -114,8 +114,8 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];

// load grad into shared memory
-__shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
-LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
+__shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
__syncthreads();

// Apply basis element by element
@@ -126,7 +126,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
}
}

extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
extern __shared__ CeedScalar slice[];

@@ -141,8 +141,8 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
CeedScalar r_V[BASIS_NUM_COMP];

// load grad into shared memory
-__shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
-LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
+__shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
__syncthreads();

// Apply basis element by element
@@ -153,24 +153,23 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
}
}

extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
CeedScalar *__restrict__ d_V) {
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
extern __shared__ CeedScalar slice[];

SharedData_Hip data;
data.t_id_x = threadIdx.x;
data.t_id_y = threadIdx.y;
data.t_id_z = threadIdx.z;
data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
-data.slice = &slice[data.t_id_z * T_1D];
+data.slice = slice + data.t_id_z * T_1D;

CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
CeedScalar r_V[BASIS_NUM_COMP];

// load grad into shared memory
-__shared__ CeedScalar s_G[BASIS_P * BASIS_Q];
-LoadMatrix<BASIS_P, BASIS_Q>(data, c_G, s_G);
+__shared__ CeedScalar s_G[BASIS_P * BASIS_Q * BASIS_DIM];
+LoadMatrix<BASIS_P, BASIS_Q * BASIS_DIM>(data, c_G, s_G);
__syncthreads();

// Apply basis element by element
@@ -184,8 +183,8 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
//------------------------------------------------------------------------------
// Weight kernel
//------------------------------------------------------------------------------
extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) {
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) {
extern __shared__ CeedScalar slice[];

SharedData_Hip data;
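
Note: all kernels in this header now declare __launch_bounds__(BASIS_INTERP_BLOCK_SIZE), the only block-size macro the backend still defines, and the Grad kernels size their shared matrix s_G as BASIS_P * BASIS_Q * BASIS_DIM so it holds every directional block. A standalone HIP sketch of that pattern (not libCEED code; macro values are illustrative):

#include <hip/hip_runtime.h>

// Illustrative values only; the backend supplies data->block_sizes[0], num_nodes, num_qpts, and dim.
#define BASIS_INTERP_BLOCK_SIZE 256
#define BASIS_P 4
#define BASIS_Q 6
#define BASIS_DIM 3

extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
void GradSketch(const int n, const double *__restrict__ d_u, double *__restrict__ d_v) {
  // Shared gradient matrix sized for all BASIS_DIM directional blocks.
  __shared__ double s_G[BASIS_P * BASIS_Q * BASIS_DIM];
  const int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (threadIdx.x < BASIS_P * BASIS_Q * BASIS_DIM) s_G[threadIdx.x] = 1.0;  // stand-in for LoadMatrix
  __syncthreads();
  if (i < n) d_v[i] = s_G[0] * d_u[i];  // trivial use so the shared array participates
}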