Skip to content

Commit

Permalink
Merge pull request #1717 from CEED/jeremy/fix-hip-shared-atpoints
Browse files (browse the repository at this point in the history)
Fix HIP shared AtPoints
  • Loading branch information
jeremylt authored Dec 11, 2024
2 parents f4112a4 + b4280a9 commit 5a7f61c
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 10 deletions.
16 changes: 8 additions & 8 deletions backends/cuda-shared/ceed-cuda-shared-basis.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -62,8 +62,8 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v};

if (dim == 1) {
CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
1)); // avoid >512 total threads
// avoid >512 total threads
CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar);

Expand Down Expand Up @@ -113,8 +113,8 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
}
void *grad_args[] = {(void *)&num_elem, &data->c_B, &data->c_G, &d_u, &d_v};
if (dim == 1) {
CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
1)); // avoid >512 total threads
// avoid >512 total threads
CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar);

Expand Down Expand Up @@ -332,8 +332,8 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
void *interp_args[] = {(void *)&num_elem, &data->c_B, &data->d_points_per_elem, &d_x, &d_u, &d_v};

if (dim == 1) {
CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
1)); // avoid >512 total threads
// avoid >512 total threads
CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar);

Expand Down Expand Up @@ -368,8 +368,8 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};

if (dim == 1) {
CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d,
1)); // avoid >512 total threads
// avoid >512 total threads
CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar);

Expand Down
4 changes: 2 additions & 2 deletions backends/hip-shared/ceed-hip-shared-basis.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -406,7 +406,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
thread_1d, elems_per_block, shared_mem, interp_args));
} else if (dim == 3) {
const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
const CeedInt elems_per_block = 1;
CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);

Expand Down Expand Up @@ -440,7 +440,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
elems_per_block, shared_mem, grad_args));
} else if (dim == 3) {
const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
const CeedInt elems_per_block = 1;
CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);

Expand Down

0 comments on commit 5a7f61c

Please sign in to comment.