diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c
index 7023544b1b..10806a0ff0 100644
--- a/backends/magma/ceed-magma-basis.c
+++ b/backends/magma/ceed-magma-basis.c
@@ -268,7 +268,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
                                          CeedVector v) {
   Ceed                      ceed;
   Ceed_Magma               *data;
-  CeedInt                   dim, num_comp, num_nodes, num_qpts, P, Q, N;
+  CeedInt                   num_comp, q_comp, num_nodes, num_qpts, P, Q, N;
   const CeedScalar         *d_u;
   CeedScalar               *d_v;
   CeedBasisNonTensor_Magma *impl;
@@ -276,8 +276,8 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
   CeedCallBackend(CeedGetData(ceed, &data));
   CeedCallBackend(CeedBasisGetData(basis, &impl));
-  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
+  CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, e_mode, &q_comp));
   CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
   P = num_nodes;
@@ -304,7 +304,30 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed

   // Apply basis operation
   if (e_mode != CEED_EVAL_WEIGHT) {
-    if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
+    const CeedScalar *d_b = NULL;
+    switch (e_mode) {
+      case CEED_EVAL_INTERP:
+        d_b = impl->d_interp;
+        break;
+      case CEED_EVAL_GRAD:
+        d_b = impl->d_grad;
+        break;
+      case CEED_EVAL_DIV:
+        d_b = impl->d_div;
+        break;
+      case CEED_EVAL_CURL:
+        d_b = impl->d_curl;
+        break;
+      // LCOV_EXCL_START
+      case CEED_EVAL_WEIGHT:
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT does not make sense in this context");
+      case CEED_EVAL_NONE:
+        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
+      // LCOV_EXCL_STOP
+    }
+
+    // Apply basis operation
+    if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) {
       CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_KERNEL_N_VALUES};
       CeedInt iN = 0, diff = abs(n_array[iN] - N), idiff;
       CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q, K = (t_mode == CEED_TRANSPOSE) ? Q : P;
@@ -319,92 +342,85 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed

       // Compile kernels for N as needed
       if (!impl->NB_interp[iN]) {
+        CeedFESpace fe_space;
+        CeedInt     q_comp_interp, q_comp_deriv;
         Ceed        ceed_delegate;
-        char       *interp_kernel_path, *grad_kernel_path, *basis_kernel_source;
+        char       *basis_kernel_path, *basis_kernel_source;
         magma_int_t arch = magma_getdevice_arch();

         // Tuning parameters for NB
-        impl->NB_interp[iN]   = nontensor_rtc_get_nb(arch, 'n', 1, P, Q, n_array[iN]);
-        impl->NB_interp_t[iN] = nontensor_rtc_get_nb(arch, 't', 1, P, Q, n_array[iN]);
-        impl->NB_grad[iN]     = nontensor_rtc_get_nb(arch, 'n', dim, P, Q, n_array[iN]);
-        impl->NB_grad_t[iN]   = nontensor_rtc_get_nb(arch, 't', dim, P, Q, n_array[iN]);
+        CeedCallBackend(CeedBasisGetFESpace(basis, &fe_space));
+        CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
+        switch (fe_space) {
+          case CEED_FE_SPACE_H1:
+            CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_deriv));
+            break;
+          case CEED_FE_SPACE_HDIV:
+            CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_deriv));
+            break;
+          case CEED_FE_SPACE_HCURL:
+            CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_deriv));
+            break;
+        }
+        impl->NB_interp[iN]   = nontensor_rtc_get_nb(arch, 'n', q_comp_interp, P, Q, n_array[iN]);
+        impl->NB_interp_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_interp, P, Q, n_array[iN]);
+        impl->NB_deriv[iN]    = nontensor_rtc_get_nb(arch, 'n', q_comp_deriv, P, Q, n_array[iN]);
+        impl->NB_deriv_t[iN]  = nontensor_rtc_get_nb(arch, 't', q_comp_deriv, P, Q, n_array[iN]);

         // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
         CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));

         // Compile kernels
-        CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-interp-nontensor.h", &interp_kernel_path));
+        CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h", &basis_kernel_path));
         CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
-        CeedCallBackend(CeedLoadSourceToBuffer(ceed, interp_kernel_path, &basis_kernel_source));
-        CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-grad-nontensor.h", &grad_kernel_path));
-        CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_kernel_path, &basis_kernel_source));
+        CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
         CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
-        CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_interp[iN], 7, "BASIS_DIM", dim, "BASIS_P", P, "BASIS_Q",
-                                         Q, "BASIS_NB_INTERP_N", impl->NB_interp[iN], "BASIS_NB_INTERP_T", impl->NB_interp_t[iN], "BASIS_NB_GRAD_N",
-                                         impl->NB_grad[iN], "BASIS_NB_GRAD_T", impl->NB_grad_t[iN]));
+        CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_interp[iN], 8, "BASIS_Q_COMP_INTERP", q_comp_interp,
+                                         "BASIS_Q_COMP_DERIV", q_comp_deriv, "BASIS_P", P, "BASIS_Q", Q, "BASIS_NB_INTERP_N", impl->NB_interp[iN],
+                                         "BASIS_NB_INTERP_T", impl->NB_interp_t[iN], "BASIS_NB_DERIV_N", impl->NB_deriv[iN], "BASIS_NB_DERIV_T",
+                                         impl->NB_deriv_t[iN]));
         CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_interp_nontensor_n", &impl->Interp[iN]));
         CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN]));
-        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_grad_nontensor_n", &impl->Grad[iN]));
-        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_grad_nontensor_t", &impl->GradTranspose[iN]));
-        CeedCallBackend(CeedFree(&interp_kernel_path));
-        CeedCallBackend(CeedFree(&grad_kernel_path));
+        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN]));
+        CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN]));
+        CeedCallBackend(CeedFree(&basis_kernel_path));
         CeedCallBackend(CeedFree(&basis_kernel_source));
       }
-
-      // Apply basis operation
-      CeedInt num_t_col = MAGMA_BASIS_NTCOL(M, MAGMA_MAXTHREADS_1D);
+      CeedMagmaFunction Kernel;
+      CeedInt           NB;
       if (e_mode == CEED_EVAL_INTERP) {
-        CeedInt NB           = (t_mode == CEED_TRANSPOSE) ? impl->NB_interp_t[iN] : impl->NB_interp[iN];
-        CeedInt grid         = CeedDivUpInt(N, NB * num_t_col);
-        CeedInt shared_mem_A = (t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar);
-        CeedInt shared_mem_B = num_t_col * K * NB * sizeof(CeedScalar);
-        CeedInt shared_mem   = (t_mode == CEED_TRANSPOSE) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B);
-        void   *args[]       = {&N, &impl->d_interp, &P, &d_u, &K, &d_v, &M};
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->InterpTranspose[iN], grid, M, num_t_col, 1, shared_mem, args));
+          Kernel = impl->InterpTranspose[iN];
+          NB     = impl->NB_interp_t[iN];
         } else {
-          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Interp[iN], grid, M, num_t_col, 1, shared_mem, args));
+          Kernel = impl->Interp[iN];
+          NB     = impl->NB_interp[iN];
         }
-      } else if (e_mode == CEED_EVAL_GRAD) {
-        CeedInt NB         = (t_mode == CEED_TRANSPOSE) ? impl->NB_grad_t[iN] : impl->NB_grad[iN];
-        CeedInt grid       = CeedDivUpInt(N, NB * num_t_col);
-        CeedInt shared_mem = num_t_col * K * NB * sizeof(CeedScalar) + ((t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar));
-        void   *args[]     = {&N, &impl->d_grad, &P, &d_u, &K, &d_v, &M};
-
+      } else {
         if (t_mode == CEED_TRANSPOSE) {
-          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->GradTranspose[iN], grid, M, num_t_col, 1, shared_mem, args));
+          Kernel = impl->DerivTranspose[iN];
+          NB     = impl->NB_deriv_t[iN];
         } else {
-          CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Grad[iN], grid, M, num_t_col, 1, shared_mem, args));
+          Kernel = impl->Deriv[iN];
+          NB     = impl->NB_deriv[iN];
         }
-      } else {
-        // LCOV_EXCL_START
-        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV, CEED_EVAL_CURL not supported");
-        // LCOV_EXCL_STOP
       }
+      CeedInt num_t_col    = MAGMA_BASIS_NTCOL(M, MAGMA_MAXTHREADS_1D);
+      CeedInt grid         = CeedDivUpInt(N, num_t_col * NB);
+      CeedInt shared_mem_A = (t_mode == CEED_TRANSPOSE) ? 0 : P * Q * sizeof(CeedScalar);
+      CeedInt shared_mem_B = num_t_col * K * NB * sizeof(CeedScalar);
+      CeedInt shared_mem   = (t_mode == CEED_TRANSPOSE || q_comp > 1) ? (shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B);
+      void   *args[]       = {&N, &d_b, &d_u, &d_v};
+
+      CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, Kernel, grid, M, num_t_col, 1, shared_mem, args));
     } else {
-      if (e_mode == CEED_EVAL_INTERP) {
+      for (CeedInt d = 0; d < q_comp; d++) {
         if (t_mode == CEED_TRANSPOSE) {
-          magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, impl->d_interp, P, d_u, Q, 0.0, d_v, P, data->queue);
+          const CeedScalar beta = (d > 0) ? 1.0 : 0.0;
+          magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, d_b + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P, data->queue);
         } else {
-          magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, impl->d_interp, P, d_u, P, 0.0, d_v, Q, data->queue);
+          magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, d_b + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue);
         }
-      } else if (e_mode == CEED_EVAL_GRAD) {
-        if (t_mode == CEED_TRANSPOSE) {
-          for (int d = 0; d < dim; d++) {
-            const CeedScalar beta = (d > 0) ? 1.0 : 0.0;
-            magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, N, Q, 1.0, impl->d_grad + d * P * Q, P, d_u + d * N * Q, Q, beta, d_v, P,
-                                 data->queue);
-          }
-        } else {
-          for (int d = 0; d < dim; d++) {
-            magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, N, P, 1.0, impl->d_grad + d * P * Q, P, d_u, P, 0.0, d_v + d * N * Q, Q, data->queue);
-          }
-        }
-      } else {
-        // LCOV_EXCL_START
-        return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV, CEED_EVAL_CURL not supported");
-        // LCOV_EXCL_STOP
       }
     }
   } else {
@@ -412,7 +428,7 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed
     CeedInt num_t_col  = MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D);
     CeedInt grid       = CeedDivUpInt(num_elem, num_t_col);
     CeedInt shared_mem = Q * sizeof(CeedScalar) + num_t_col * Q * sizeof(CeedScalar);
-    void   *args[]     = {&num_elem, &impl->d_q_weight, &d_v, &Q};
+    void   *args[]     = {&num_elem, &impl->d_q_weight, &d_v};

     CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->Weight, grid, Q, num_t_col, 1, shared_mem, args));
   }
@@ -474,6 +490,8 @@ static int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
   }
   CeedCallBackend(magma_free(impl->d_interp));
   CeedCallBackend(magma_free(impl->d_grad));
+  CeedCallBackend(magma_free(impl->d_div));
+  CeedCallBackend(magma_free(impl->d_curl));
   CeedCallBackend(magma_free(impl->d_q_weight));
   CeedCallBackend(CeedFree(&impl));
   return CEED_ERROR_SUCCESS;
@@ -590,10 +608,126 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node
   // Copy basis data to GPU
   CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
   magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
-  CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * sizeof(interp[0])));
-  magma_setvector(num_qpts * num_nodes, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
-  CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_nodes * dim * sizeof(grad[0])));
-  magma_setvector(num_qpts * num_nodes * dim, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue);
+  if (interp) {
+    CeedInt q_comp_interp;
+
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
+    CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0])));
+    magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
+  }
+  if (grad) {
+    CeedInt q_comp_grad;
+
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
+    CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_nodes * q_comp_grad * sizeof(grad[0])));
+    magma_setvector(num_qpts * num_nodes * q_comp_grad, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue);
+  }
+
+  // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
+  CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
+
+  // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply)
+  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
+  CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
+  CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts));
+  CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight));
+  CeedCallBackend(CeedFree(&weight_kernel_path));
+  CeedCallBackend(CeedFree(&basis_kernel_source));
+
+  CeedCallBackend(CeedBasisSetData(basis, impl));
+
+  // Register backend functions
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Create non-tensor H(div)
+//------------------------------------------------------------------------------
+int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                              const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
+  Ceed                      ceed, ceed_delegate;
+  Ceed_Magma               *data;
+  char                     *weight_kernel_path, *basis_kernel_source;
+  CeedBasisNonTensor_Magma *impl;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedCalloc(1, &impl));
+
+  // Copy basis data to GPU
+  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
+  magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  if (interp) {
+    CeedInt q_comp_interp;
+
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
+    CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0])));
+    magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
+  }
+  if (div) {
+    CeedInt q_comp_div;
+
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
+    CeedCallBackend(magma_malloc((void **)&impl->d_div, num_qpts * num_nodes * q_comp_div * sizeof(div[0])));
+    magma_setvector(num_qpts * num_nodes * q_comp_div, sizeof(div[0]), div, 1, impl->d_div, 1, data->queue);
+  }
+
+  // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
+  CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
+
+  // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply)
+  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
+  CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
+  CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts));
+  CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight));
+  CeedCallBackend(CeedFree(&weight_kernel_path));
+  CeedCallBackend(CeedFree(&basis_kernel_source));
+
+  CeedCallBackend(CeedBasisSetData(basis, impl));
+
+  // Register backend functions
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma));
+  return CEED_ERROR_SUCCESS;
+}
+
+//------------------------------------------------------------------------------
+// Create non-tensor H(curl)
+//------------------------------------------------------------------------------
+int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                               const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
+  Ceed                      ceed, ceed_delegate;
+  Ceed_Magma               *data;
+  char                     *weight_kernel_path, *basis_kernel_source;
+  CeedBasisNonTensor_Magma *impl;
+
+  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
+  CeedCallBackend(CeedGetData(ceed, &data));
+  CeedCallBackend(CeedCalloc(1, &impl));
+
+  // Copy basis data to GPU
+  CeedCallBackend(magma_malloc((void **)&impl->d_q_weight, num_qpts * sizeof(q_weight[0])));
+  magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue);
+  if (interp) {
+    CeedInt q_comp_interp;
+
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
+    CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_nodes * q_comp_interp * sizeof(interp[0])));
+    magma_setvector(num_qpts * num_nodes * q_comp_interp, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue);
+  }
+  if (curl) {
+    CeedInt q_comp_curl;
+
+    CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
+    CeedCallBackend(magma_malloc((void **)&impl->d_curl, num_qpts * num_nodes * q_comp_curl * sizeof(curl[0])));
+    magma_setvector(num_qpts * num_nodes * q_comp_curl, sizeof(curl[0]), curl, 1, impl->d_curl, 1, data->queue);
+  }

   // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data
   CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate));
diff --git a/backends/magma/ceed-magma.c b/backends/magma/ceed-magma.c
index 92fc1b3d8d..8abaf83882 100644
--- a/backends/magma/ceed-magma.c
+++ b/backends/magma/ceed-magma.c
@@ -39,6 +39,8 @@ static int CeedInit_Magma(const char *resource, Ceed ceed) {
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", CeedBasisCreateHdiv_Magma));
+  CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHcurl", CeedBasisCreateHcurl_Magma));
   CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Magma));
   return CEED_ERROR_SUCCESS;
 }
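For orientation, here is a minimal host-side sketch of what now reaches this backend: an H(div) basis created through the public libCEED API, which the "BasisCreateHdiv" registration above routes to CeedBasisCreateHdiv_Magma. The resource string assumes a CUDA build of the MAGMA backend, and the quad sizes (4 nodes, 4 quadrature points) and zero-filled tables are placeholders, not meaningful basis data.

#include <ceed.h>

int main(void) {
  Ceed       ceed;
  CeedBasis  basis;
  // Placeholder tables: interp is q_comp (= dim = 2) x num_qpts x num_nodes,
  // div is num_qpts x num_nodes, q_ref is dim x num_qpts, q_weight is num_qpts.
  CeedScalar interp[2 * 4 * 4] = {0}, div[4 * 4] = {0}, q_ref[2 * 4] = {0}, q_weight[4] = {0};

  CeedInit("/gpu/cuda/magma", &ceed);
  CeedBasisCreateHdiv(ceed, CEED_TOPOLOGY_QUAD, 1, 4, 4, interp, div, q_ref, q_weight, &basis);
  // Subsequent CeedBasisApply calls with CEED_EVAL_INTERP or CEED_EVAL_DIV
  // dispatch to CeedBasisApplyNonTensor_Magma above.
  CeedBasisDestroy(&basis);
  CeedDestroy(&ceed);
  return 0;
}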
diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h
index b5495a61bb..ae5ea39093 100644
--- a/backends/magma/ceed-magma.h
+++ b/backends/magma/ceed-magma.h
@@ -60,21 +60,26 @@ typedef struct {
   CeedMagmaModule   module_weight, module_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction Interp[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction InterpTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedMagmaFunction Grad[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedMagmaFunction GradTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction Deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedMagmaFunction DerivTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedMagmaFunction Weight;
   CeedInt           NB_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
-  CeedInt           NB_grad[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_grad_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+  CeedInt           NB_deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_deriv_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
   CeedScalar       *d_interp;
   CeedScalar       *d_grad;
+  CeedScalar       *d_div;
+  CeedScalar       *d_curl;
   CeedScalar       *d_q_weight;
 } CeedBasisNonTensor_Magma;

 CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                               const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis);
-
 CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
                                         const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
+CEED_INTERN int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                                          const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
+CEED_INTERN int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+                                           const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);

 CEED_INTERN magma_int_t magma_isdevptr(const void *);
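The struct above sizes d_interp, d_grad, d_div, and d_curl by the number of quadrature components rather than by the dimension. As a loudly hedged reference, the helper below encodes my reading of the CeedBasisGetNumQuadratureComponents convention; it is illustrative only (a hypothetical name, not library code), and the backend always queries the library rather than computing this itself.

#include <ceed.h>

// Hypothetical helper: expected quadrature-component counts per FE space and
// eval mode, per my reading of the libCEED convention.
static CeedInt q_comp_guess(CeedFESpace fe_space, CeedEvalMode e_mode, CeedInt dim) {
  switch (e_mode) {
    case CEED_EVAL_INTERP: return fe_space == CEED_FE_SPACE_H1 ? 1 : dim;  // vector-valued for H(div)/H(curl)
    case CEED_EVAL_GRAD:   return dim;                                     // H1 gradient
    case CEED_EVAL_DIV:    return 1;                                      // divergence is scalar
    case CEED_EVAL_CURL:   return dim == 3 ? 3 : 1;                       // scalar curl in 2D
    default:               return 1;
  }
}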
diff --git a/include/ceed/jit-source/magma/magma-basis-grad-nontensor.h b/include/ceed/jit-source/magma/magma-basis-grad-nontensor.h
deleted file mode 100644
index 95bf548fc3..0000000000
--- a/include/ceed/jit-source/magma/magma-basis-grad-nontensor.h
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED: http://github.com/ceed
-
-/// @file
-/// Internal header for Magma non-tensor basis gradient
-#ifndef CEED_MAGMA_GRAD_NONTENSOR_H
-#define CEED_MAGMA_GRAD_NONTENSOR_H
-
-#include "magma-common-nontensor.h"
-
-////////////////////////////////////////////////////////////////////////////////
-extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
-    void magma_grad_nontensor_n(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) {
-  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
-
-  const int tx      = threadIdx.x;
-  const int ty      = threadIdx.y;
-  const int id      = blockIdx.x * blockDim.y + ty;
-  const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_GRAD_N);
-  const int myn     = min(BASIS_NB_GRAD_N, n - id * BASIS_NB_GRAD_N);
-
-  dB += id * BASIS_NB_GRAD_N * lddb;
-  dC += id * BASIS_NB_GRAD_N * lddc;
-
-  // A is BASIS_P x BASIS_Q
-  const int   slda = BASIS_P;
-  const int   sldb = BASIS_P;
-  CeedScalar *sA   = (CeedScalar *)shared_data;
-  CeedScalar *sB   = sA + BASIS_Q * BASIS_P;
-  sB += ty * sldb * BASIS_NB_GRAD_N;
-
-  // read B once for all C's
-  if (id < nblocks) {
-    read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb);
-  }
-
-  // unrolling this loop yields dramatic performance drop using hipcc, let the compiler decide (no pragma unroll)
-  for (int d = 0; d < BASIS_DIM; d++) {
-    // read A (BASIS_P x BASIS_Q) using all threads
-    CeedScalar rA[BASIS_P] = {MAGMA_D_ZERO};
-    __syncthreads();
-    read_A_trans_g2r_1D_nosync(tx, ty, dA, ldda, sA, slda, rA);
-
-    // init rC
-    CeedScalar rC[BASIS_NB_GRAD_N] = {MAGMA_D_ZERO};
-    if (id < nblocks) {
-      mul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC);
-    }
-
-    // write C
-    if (id < nblocks) {
-      write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc);
-    }
-
-    dA += BASIS_Q * BASIS_P;
-    dC += BASIS_Q * n;
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
-    void magma_grad_nontensor_t(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) {
-  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
-
-  const int tx      = threadIdx.x;
-  const int ty      = threadIdx.y;
-  const int id      = blockIdx.x * blockDim.y + ty;
-  const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_GRAD_T);
-  const int myn     = min(BASIS_NB_GRAD_T, n - id * BASIS_NB_GRAD_T);
-
-  // terminate threads with no work
-  if (id >= nblocks) return;
-
-  dB += id * BASIS_NB_GRAD_T * lddb;
-  dC += id * BASIS_NB_GRAD_T * lddc;
-
-  // A is BASIS_P x BASIS_Q
-  const int   sldb = BASIS_Q;
-  CeedScalar *sB   = (CeedScalar *)shared_data;
-  sB += ty * sldb * BASIS_NB_GRAD_T;
-
-  // init rA, rC
-  CeedScalar rA[BASIS_Q]         = {MAGMA_D_ZERO};
-  CeedScalar rC[BASIS_NB_GRAD_T] = {MAGMA_D_ZERO};
-
-  // unrolling this loop yields dramatic performance drop using hipcc, let the compiler decide (no pragma unroll)
-  for (int d = 0; d < BASIS_DIM; d++) {
-    // read A
-    read_A_notrans_g2r_1D_nosync(tx, dA, ldda, NULL, 0, rA);
-
-    // read B
-    __syncthreads();
-    read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb);
-    __syncthreads();
-
-    addmul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC);
-
-    dA += BASIS_P * BASIS_Q;
-    dB += BASIS_Q * n;
-  }
-
-  // write C
-  write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc);
-}
-
-#endif  // CEED_MAGMA_GRAD_NONTENSOR_H
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
new file mode 100644
index 0000000000..e89c6ade4f
--- /dev/null
+++ b/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
@@ -0,0 +1,182 @@
+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+
+/// @file
+/// Internal header for MAGMA non-tensor basis interpolation
+#ifndef CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
+#define CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
+
+#include "magma-common-nontensor.h"
+
+//////////////////////////////////////////////////////////////////////////////////////////
+template <int P, int Q, int NB, int Q_COMP>
+static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
+                                                                 CeedScalar *shared_data) {
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int id      = blockIdx.x * blockDim.y + ty;
+  const int nblocks = (n + NB - 1) / NB;
+  const int myn     = min(NB, n - id * NB);
+
+  dB += id * P * NB;
+  dC += id * Q * NB;
+
+  // A is P x Q
+  CeedScalar *sB = shared_data + ty * P * NB;
+  CeedScalar *sA = shared_data + blockDim.y * P * NB;
+
+  // read B once for all C's
+  if (id < nblocks) {
+    read_B_g2s_1D_nosync(tx, myn, dB, sB);
+  }
+
+  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
+  for (int d = 0; d < Q_COMP; d++) {
+    // init rA, rC
+    CeedScalar rA[P], rC[NB];
+
+    // read A using all threads
+    read_A_trans_g2r_1D_nosync(tx, ty, dA, sA, rA);
+
+    if (id < nblocks) {
+      mul_rAsBrC_1D_nosync(rA, sB, rC);
+
+      // write C
+      write_C_r2g_1D_nosync(tx, myn, rC, dC);
+    }
+
+    dA += Q * P;
+    dC += Q * n;
+
+    __syncthreads();
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+template <int P, int Q, int NB>
+static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
+                                                                  CeedScalar *shared_data) {
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int id      = blockIdx.x * blockDim.y + ty;
+  const int nblocks = (n + NB - 1) / NB;
+  const int myn     = min(NB, n - id * NB);
+
+  dB += id * P * NB;
+  dC += id * Q * NB;
+
+  // A is P x Q
+  CeedScalar *sA = shared_data;
+  CeedScalar *sB = shared_data + ty * P * NB;
+
+  // init rA, rC
+  CeedScalar rA[P], rC[NB];
+
+  // read A using all threads
+  read_A_trans_g2r_1D_nosync(tx, ty, dA, sA, rA);
+  __syncthreads();
+
+  // terminate threads with no work
+  if (id >= nblocks) return;
+
+  // read B once for all C's
+  read_B_g2s_1D_nosync(tx, myn, dB, sB);
+  __syncthreads();
+
+  mul_rAsBrC_1D_nosync(rA, sB, rC);
+
+  // write C
+  write_C_r2g_1D_nosync(tx, myn, rC, dC);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+template <int P, int Q, int NB, int Q_COMP>
+static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
+                                                                 CeedScalar *shared_data) {
+  const int tx      = threadIdx.x;
+  const int ty      = threadIdx.y;
+  const int id      = blockIdx.x * blockDim.y + ty;
+  const int nblocks = (n + NB - 1) / NB;
+  const int myn     = min(NB, n - id * NB);
+
+  // terminate threads with no work
+  if (id >= nblocks) return;
+
+  dB += id * Q * NB;
+  dC += id * P * NB;
+
+  // A is P x Q
+  CeedScalar *sB = shared_data + ty * Q * NB;
+
+  // init rC
+  CeedScalar rC[NB] = {0.0};
+
+  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
+  for (int d = 0; d < Q_COMP; d++) {
+    // init rA
+    CeedScalar rA[Q];
+
+    // read A
+    read_A_notrans_g2r_1D_nosync(tx, dA, rA);
+
+    // read B
+    read_B_g2s_1D_nosync(tx, myn, dB, sB);
+    __syncthreads();
+
+    addmul_rAsBrC_1D_nosync(rA, sB, rC);
+
+    dA += P * Q;
+    dB += Q * n;
+
+    __syncthreads();
+  }
+
+  // write C
+  write_C_r2g_1D_nosync(tx, myn, rC, dC);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
+
+  if (BASIS_Q_COMP_INTERP == 1) {
+    magma_basis_nontensor_device_n1(n, dA, dB, dC, (CeedScalar *)shared_data);
+  } else {
+    magma_basis_nontensor_device_n(n, dA, dB, dC, (CeedScalar *)shared_data);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
+
+  magma_basis_nontensor_device_t(n, dA, dB, dC, (CeedScalar *)shared_data);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
+
+  if (BASIS_Q_COMP_DERIV == 1) {
+    magma_basis_nontensor_device_n1(n, dA, dB, dC, (CeedScalar *)shared_data);
+  } else {
+    magma_basis_nontensor_device_n(n, dA, dB, dC, (CeedScalar *)shared_data);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
+    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
+  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
+
+  magma_basis_nontensor_device_t(n, dA, dB, dC, (CeedScalar *)shared_data);
+}
+
+#endif  // CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
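Since the new kernels take only (n, dA, dB, dC) and bake P, Q, and NB in at JiT time, the host must size dynamic shared memory to match the sA/sB layout above. Below is a minimal sketch mirroring the shared_mem computation in CeedBasisApplyNonTensor_Magma; the helper name is illustrative, double stands in for CeedScalar, and num_t_col corresponds to MAGMA_BASIS_NTCOL(M, MAGMA_MAXTHREADS_1D) on the host.

#include <stddef.h>

// Transpose kernels and multi-component non-transpose kernels keep sA and sB
// live at the same time; the single-component non-transpose kernel (device_n1)
// reuses one region for both, so only the larger of the two is needed.
static size_t nontensor_shared_bytes(int P, int Q, int NB, int num_t_col, int is_transpose, int q_comp) {
  size_t bytes_A = is_transpose ? 0 : (size_t)P * Q * sizeof(double);  // sA: the full P x Q matrix
  size_t K       = is_transpose ? (size_t)Q : (size_t)P;
  size_t bytes_B = (size_t)num_t_col * K * NB * sizeof(double);       // sB: one N-tile per ty slice
  if (is_transpose || q_comp > 1) return bytes_A + bytes_B;
  return bytes_A > bytes_B ? bytes_A : bytes_B;
}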
diff --git a/include/ceed/jit-source/magma/magma-basis-interp-nontensor.h b/include/ceed/jit-source/magma/magma-basis-interp-nontensor.h
deleted file mode 100644
index 956d69392a..0000000000
--- a/include/ceed/jit-source/magma/magma-basis-interp-nontensor.h
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
-// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
-//
-// SPDX-License-Identifier: BSD-2-Clause
-//
-// This file is part of CEED: http://github.com/ceed
-
-/// @file
-/// Internal header for MAGMA non-tensor basis interpolation
-#ifndef CEED_MAGMA_INTERP_NONTENSOR_H
-#define CEED_MAGMA_INTERP_NONTENSOR_H
-
-#include "magma-common-nontensor.h"
-
-////////////////////////////////////////////////////////////////////////////////
-extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
-    void magma_interp_nontensor_n(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) {
-  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
-
-  const int tx      = threadIdx.x;
-  const int ty      = threadIdx.y;
-  const int id      = blockIdx.x * blockDim.y + ty;
-  const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_INTERP_N);
-  const int myn     = min(BASIS_NB_INTERP_N, n - id * BASIS_NB_INTERP_N);
-
-  dB += id * BASIS_NB_INTERP_N * lddb;
-  dC += id * BASIS_NB_INTERP_N * lddc;
-
-  // A is BASIS_P x BASIS_Q
-  const int   slda = BASIS_P;
-  const int   sldb = BASIS_P;
-  CeedScalar *sA   = (CeedScalar *)shared_data;
-  CeedScalar *sB   = sA;
-  sB += ty * sldb * BASIS_NB_INTERP_N;
-
-  // read A using all threads
-  CeedScalar rA[BASIS_P] = {MAGMA_D_ZERO};
-  read_A_trans_g2r_1D_nosync(tx, ty, dA, ldda, sA, slda, rA);
-  __syncthreads();
-
-  // terminate threads with no work
-  if (id >= nblocks) return;
-
-  // init rC
-  CeedScalar rC[BASIS_NB_INTERP_N] = {MAGMA_D_ZERO};
-
-  // read B
-  read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb);
-  __syncthreads();
-
-  mul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC);
-
-  // write C
-  write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
-    void magma_interp_nontensor_t(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) {
-  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
-
-  const int tx      = threadIdx.x;
-  const int ty      = threadIdx.y;
-  const int id      = blockIdx.x * blockDim.y + ty;
-  const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_INTERP_T);
-  const int myn     = min(BASIS_NB_INTERP_T, n - id * BASIS_NB_INTERP_T);
-
-  // terminate threads with no work
-  if (id >= nblocks) return;
-
-  dB += id * BASIS_NB_INTERP_T * lddb;
-  dC += id * BASIS_NB_INTERP_T * lddc;
-
-  // A is BASIS_P x BASIS_Q
-  const int   sldb = BASIS_Q;
-  CeedScalar *sB   = (CeedScalar *)shared_data;
-  sB += ty * sldb * BASIS_NB_INTERP_T;
-
-  // init rC
-  CeedScalar rC[BASIS_NB_INTERP_T] = {MAGMA_D_ZERO};
-
-  // read A
-  CeedScalar rA[BASIS_Q] = {MAGMA_D_ZERO};
-  read_A_notrans_g2r_1D_nosync(tx, dA, ldda, NULL, 0, rA);
-
-  // read B
-  read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb);
-  __syncthreads();
-
-  mul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC);
-
-  // write C
-  write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc);
-}
-
-#endif  // CEED_MAGMA_INTERP_NONTENSOR_H
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-1d.h b/include/ceed/jit-source/magma/magma-basis-weight-1d.h
index 9f77dd4aca..38a058ea67 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-1d.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-1d.h
@@ -18,7 +18,7 @@ template <typename T, int Q>
 static __device__ __inline__ void magma_weight_1d_device(const T *sTweight, T *sV, const int tx) {
   // Assumptions
   // 1. 1D thread configuration of size Q
-  // 2. The output sV is in shared memory -- size 1xQ
+  // 2. The output sV is in shared memory -- size Q
   if (tx < Q) {
     sV[tx] = sTweight[tx];
   }
diff --git a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
index f6b51380cf..b1f2b2881b 100644
--- a/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-basis-weight-nontensor.h
@@ -14,7 +14,7 @@

 ////////////////////////////////////////////////////////////////////////////////
 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
-    void magma_weight_nontensor(int n, const CeedScalar *dqweight, CeedScalar *dV, int lddv) {
+    void magma_weight_nontensor(int n, const CeedScalar *__restrict__ dqweight, CeedScalar *__restrict__ dV) {
   MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

   const int tx = threadIdx.x;
@@ -24,7 +24,7 @@ extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) _
   // terminate threads with no work
   if (id >= n) return;

-  dV += id * lddv;
+  dV += id * BASIS_Q;

   // shared memory pointers
   CeedScalar *sqweight = (CeedScalar *)shared_data;
diff --git a/include/ceed/jit-source/magma/magma-common-defs.h b/include/ceed/jit-source/magma/magma-common-defs.h
index 24684be85e..30c68dfb1e 100644
--- a/include/ceed/jit-source/magma/magma-common-defs.h
+++ b/include/ceed/jit-source/magma/magma-common-defs.h
@@ -10,20 +10,7 @@
 #ifndef CEED_MAGMA_COMMON_DEFS_H
 #define CEED_MAGMA_COMMON_DEFS_H

-#ifdef CEED_MAGMA_USE_HIP
-#define MAGMA_DEVICE_SHARED(type, name) HIP_DYNAMIC_SHARED(type, name)
-#else
 #define MAGMA_DEVICE_SHARED(type, name) extern __shared__ type name[];
-#endif
-
-typedef enum { MagmaNoTrans = 111, MagmaTrans = 112, MagmaConjTrans = 113, Magma_ConjTrans = MagmaConjTrans } magma_trans_t;
-
-#define MAGMA_D_ZERO 0.0
-#define MAGMA_D_ONE 1.0
-
-#define MAGMA_CEILDIV(A, B) (((A) + (B)-1) / (B))
-#define MAGMA_ROUNDUP(A, B) MAGMA_CEILDIV((A), (B)) * (B)
-#define MAGMA_MAX(A, B) ((A) > (B) ? (A) : (B))

 #define MAGMA_MAXTHREADS_1D 128
 #define MAGMA_MAXTHREADS_2D 128
@@ -35,4 +22,6 @@ typedef enum { MagmaNoTrans = 111, MagmaTrans = 112, MagmaConjTrans = 113, Magma
 // Define macro for computing the total threads in a block for use with __launch_bounds__()
 #define MAGMA_BASIS_BOUNDS(x, maxt) (x * MAGMA_BASIS_NTCOL(x, maxt))

+typedef enum { MagmaNoTrans = 111, MagmaTrans = 112, MagmaConjTrans = 113, Magma_ConjTrans = MagmaConjTrans } magma_trans_t;
+
 #endif  // CEED_MAGMA_COMMON_DEFS_H
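With lddv gone, the weight kernel's output stride is the compile-time BASIS_Q. A CPU reference for what magma_weight_nontensor computes, with double standing in for CeedScalar:

// Reference semantics of magma_weight_nontensor: v is num_elem x Q with
// stride Q, and every element receives a copy of the quadrature weights.
static void weight_nontensor_ref(int num_elem, int Q, const double *q_weight, double *v) {
  for (int e = 0; e < num_elem; e++) {
    for (int i = 0; i < Q; i++) v[e * Q + i] = q_weight[i];
  }
}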
diff --git a/include/ceed/jit-source/magma/magma-common-nontensor.h b/include/ceed/jit-source/magma/magma-common-nontensor.h
index 0e1bbb007b..38ca314c04 100644
--- a/include/ceed/jit-source/magma/magma-common-nontensor.h
+++ b/include/ceed/jit-source/magma/magma-common-nontensor.h
@@ -18,36 +18,35 @@
 // 1D thread config. with (P x 1) threads
 // no sync at the end of the function
 template <typename T, int P, int Q>
-static __device__ __inline__ void read_A_notrans_g2r_1D_nosync(const int tx, const T *dA, int ldda, T *sA, int slda, T rA[Q]) {
+static __device__ __inline__ void read_A_notrans_g2r_1D_nosync(const int tx, const T *dA, T rA[Q]) {
 #pragma unroll
-  for (int j = 0; j < Q; j++) {
-    rA[j] = dA[j * ldda + tx];
+  for (int i = 0; i < Q; i++) {
+    rA[i] = dA[i * P + tx];
   }
 }

 ////////////////////////////////////////////////////////////////////////////////
 // read A (trans) from global to reg.
 // A is (P x Q)
-// 1D thread config. with (P x 1) threads
+// 2D thread config. with (P x BY) threads
 // no sync at the end of the function
-template <typename T, int P, int Q>
-static __device__ __inline__ void read_A_trans_g2r_1D_nosync(const int tx, const int ty, const T *dA, int ldda, T *sA, int slda, T rA[Q]) {
-  const int nTH = MAGMA_BASIS_BOUNDS(P, MAGMA_MAXTHREADS_1D);
+template <typename T, int P, int Q, int BY>
+static __device__ __inline__ void read_A_trans_g2r_1D_nosync(const int tx, const int ty, const T *dA, T *sA, T rA[Q]) {
   const int tid = ty * blockDim.x + tx;
   int       i;

 #pragma unroll
-  for (i = 0; i < (Q * P) - nTH; i += nTH) {
+  for (i = 0; i < P * Q - P * BY; i += P * BY) {
     sA[i + tid] = dA[i + tid];
   }
-  if (tid < ((Q * P) - i)) {
+  if (i + tid < P * Q) {
     sA[i + tid] = dA[i + tid];
   }
   __syncthreads();

 #pragma unroll
   for (int j = 0; j < Q; j++) {
-    rA[j] = sA[tx * slda + j];
+    rA[j] = sA[tx * Q + j];
   }
 }

@@ -57,22 +56,21 @@ static __device__ __inline__ void read_A_trans_g2r_1D_nosync(const int tx, const
 // read B from global to shared.
 // B is (Q x NB)
 // 1D thread config. with (P x 1) threads
 // no sync at the end of the function
 template <typename T, int P, int Q, int NB>
-static __device__ __inline__ void read_B_g2s_1D_nosync(const int tx, const int n, const T *dB, int lddb, T *sB, int sldb) {
+static __device__ __inline__ void read_B_g2s_1D_nosync(const int tx, const int n, const T *dB, T *sB) {
+  int i;
+
   if (n != NB) {
-    for (int i = 0; i < (Q * n) - P; i += P) {
+    for (i = 0; i < Q * n - P; i += P) {
       sB[i + tx] = dB[i + tx];
     }
   } else {
 #pragma unroll
-    for (int i = 0; i < (Q * NB) - P; i += P) {
+    for (i = 0; i < Q * NB - P; i += P) {
       sB[i + tx] = dB[i + tx];
     }
   }
-
-  // cleanup for B
-  const int stride = MAGMA_ROUNDUP(Q * n - P, P);
-  if (tx < (Q * n) - stride) {
-    sB[stride + tx] = dB[stride + tx];
+  if (i + tx < Q * n) {
+    sB[i + tx] = dB[i + tx];
   }
 }
@@ -82,18 +80,15 @@ static __device__ __inline__ void read_B_g2s_1D_nosync(const int tx, const int n
 // write C from reg. to global.
 // C is (P x NB)
 // 1D thread config. with (P x 1) threads
 // no sync at the end of the function
 template <typename T, int P, int NB>
-static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int n, T rC[NB], T *dC, int lddc) {
+static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int n, T rC[NB], T *dC) {
   if (n != NB) {
-#pragma unroll
-    for (int j = 0; j < NB; j++) {
-      if (j < n) {
-        dC[j * lddc + tx] = rC[j];
-      }
+    for (int i = 0; i < n; i++) {
+      dC[i * P + tx] = rC[i];
     }
   } else {
 #pragma unroll
-    for (int j = 0; j < NB; j++) {
-      dC[j * lddc + tx] = rC[j];
+    for (int i = 0; i < NB; i++) {
+      dC[i * P + tx] = rC[i];
     }
   }
 }
@@ -105,18 +100,19 @@ static __device__ __inline__ void write_C_r2g_1D_nosync(const int tx, const int
-static __device__ __inline__ void mul_rAsBrC_1D_nosync(const int tx, T rA[Q], T *sB, int sldb, T rC[NB]) {
+static __device__ __inline__ void mul_rAsBrC_1D_nosync(T rA[Q], T *sB, T rC[NB]) {
   T rB[Q];
+
 #pragma unroll
   for (int i = 0; i < NB; i++) {
 #pragma unroll
-    for (int k = 0; k < Q; k++) {
-      rB[k] = sB[i * sldb + k];
+    for (int j = 0; j < Q; j++) {
+      rB[j] = sB[i * Q + j];
     }
     rC[i] = 0.0;
 #pragma unroll
-    for (int k = 0; k < Q; k++) {
-      rC[i] += rA[k] * rB[k];
+    for (int j = 0; j < Q; j++) {
+      rC[i] += rA[j] * rB[j];
     }
   }
 }
@@ -128,17 +124,18 @@ static __device__ __inline__ void mul_rAsBrC_1D_nosync(const int tx, T rA[Q], T
-static __device__ __inline__ void addmul_rAsBrC_1D_nosync(const int tx, T rA[Q], T *sB, int sldb, T rC[NB]) {
+static __device__ __inline__ void addmul_rAsBrC_1D_nosync(T rA[Q], T *sB, T rC[NB]) {
   T rB[Q];
+
 #pragma unroll
   for (int i = 0; i < NB; i++) {
 #pragma unroll
-    for (int k = 0; k < Q; k++) {
-      rB[k] = sB[i * sldb + k];
+    for (int j = 0; j < Q; j++) {
+      rB[j] = sB[i * Q + j];
     }
 #pragma unroll
-    for (int k = 0; k < Q; k++) {
-      rC[i] += rA[k] * rB[k];
+    for (int j = 0; j < Q; j++) {
+      rC[i] += rA[j] * rB[j];
     }
   }
 }
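Taken together, the rewritten helpers implement the same per-component products as the magma_gemm_nontensor fallback in ceed-magma-basis.c. A scalar CPU reference follows, column-major with each component's P x Q matrix stored contiguously; double stands in for CeedScalar and the function name is illustrative.

// Non-transpose: V_d = B_d^T U for d = 0..q_comp-1 (each output component is Q x N).
// Transpose:     V = sum over d of B_d U_d (accumulated into a single P x N output).
static void basis_nontensor_ref(int transpose, int q_comp, int P, int Q, int N, const double *B, const double *U, double *V) {
  if (!transpose) {
    for (int d = 0; d < q_comp; d++)
      for (int j = 0; j < N; j++)
        for (int i = 0; i < Q; i++) {
          double sum = 0.0;
          for (int k = 0; k < P; k++) sum += B[d * P * Q + i * P + k] * U[j * P + k];
          V[d * N * Q + j * Q + i] = sum;
        }
  } else {
    for (int j = 0; j < N; j++)
      for (int i = 0; i < P; i++) V[j * P + i] = 0.0;
    for (int d = 0; d < q_comp; d++)
      for (int j = 0; j < N; j++)
        for (int i = 0; i < P; i++) {
          double sum = 0.0;
          for (int k = 0; k < Q; k++) sum += B[d * P * Q + k * P + i] * U[d * N * Q + j * Q + k];
          V[j * P + i] += sum;
        }
  }
}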