H(div) and H(curl) basis support for magma backend
1 parent 1d90b3a · commit ef845ad
Showing 10 changed files with 432 additions and 329 deletions.
include/ceed/jit-source/magma/magma-basis-grad-nontensor.h
110 changes: 0 additions & 110 deletions. This file was deleted.
include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
182 changes: 182 additions & 0 deletions
@@ -0,0 +1,182 @@
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for MAGMA non-tensor basis interpolation and derivatives
#ifndef CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
#define CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H

#include "magma-common-nontensor.h"
//////////////////////////////////////////////////////////////////////////////////////////
// No-transpose apply: for each component d, C_d (a Q x NB chunk) = A_d^T * B,
// with B staged once in shared memory and reused across all components
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;
  const int myn     = min(NB, n - id * NB);

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * P * NB;
  CeedScalar *sA = shared_data + blockDim.y * P * NB;

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // unrolling this loop yields a dramatic performance drop with hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // init rA, rC
    CeedScalar rA[P], rC[NB];

    // read A using all threads
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    if (id < nblocks) {
      mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

      // write C
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    dA += Q * P;
    dC += Q * n;

    __syncthreads();
  }
}

//////////////////////////////////////////////////////////////////////////////////////////
// No-transpose apply specialized for a single component (Q_COMP == 1)
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;
  const int myn     = min(NB, n - id * NB);

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;

  // init rA, rC
  CeedScalar rA[P], rC[NB];

  // read A using all threads
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B once for all C's
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  __syncthreads();

  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}

//////////////////////////////////////////////////////////////////////////////////////////
// Transpose apply: C (a P x NB chunk) = sum over components d of A_d * B_d,
// accumulated in registers across the component loop
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;
  const int myn     = min(NB, n - id * NB);

  // terminate threads with no work
  if (id >= nblocks) return;

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * Q * NB;

  // init rC
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields a dramatic performance drop with hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // init rA
    CeedScalar rA[Q];

    // read A
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, NB>(tx, dA, rA);

    // read B
    read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    __syncthreads();

    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    dA += P * Q;
    dB += Q * n;

    __syncthreads();
  }

  // write C
  write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}
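
//////////////////////////////////////////////////////////////////////////////////////////
// Reference semantics sketch (illustration only, not part of the original commit;
// assumes MAGMA's column-major layout for dA, dB, and dC):
//
//   no-transpose ("_n"), per component d:  C_d = A_d^T * B, i.e.
//     dC[d*Q*n + i + j*Q] = sum_p dA[d*P*Q + p + i*P] * dB[p + j*P]
//   transpose ("_t"), accumulated over d:  C = sum_d A_d * B_d, i.e.
//     dC[p + j*P] = sum_d sum_i dA[d*P*Q + p + i*P] * dB[d*Q*n + i + j*Q]
//
// Q_COMP > 1 arises for the H(div) and H(curl) bases this commit adds, where
// interpolation produces multiple components per quadrature point.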

////////////////////////////////////////////////////////////////////////////////
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  if (BASIS_Q_COMP_INTERP == 1) {
    magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
  } else {
    magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
  }
}

////////////////////////////////////////////////////////////////////////////////
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
}

////////////////////////////////////////////////////////////////////////////////
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  if (BASIS_Q_COMP_DERIV == 1) {
    magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
  } else {
    magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
  }
}

////////////////////////////////////////////////////////////////////////////////
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
}

#endif  // CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
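
For context, a host-side launch of magma_interp_nontensor_n could look roughly like the sketch below. This is an illustration only, not code from the commit: the BASIS_* macros are JIT-time defines supplied by libCEED's MAGMA backend, and the names stream, d_interp, d_u, and d_v, as well as the shared-memory sizing, are assumptions based on the layout used in magma_basis_nontensor_device_n.

// Hypothetical launch sketch; BASIS_* are JIT-time defines, other names are assumed.
const int ntcol   = MAGMA_BASIS_NTCOL(BASIS_Q, MAGMA_MAXTHREADS_1D);   // NB-column chunks per thread block
const int nblocks = (n + BASIS_NB_INTERP_N - 1) / BASIS_NB_INTERP_N;   // NB-column chunks of B
dim3   block(BASIS_Q, ntcol, 1);
dim3   grid((nblocks + ntcol - 1) / ntcol, 1, 1);
// shared memory: ntcol staged B tiles (P x NB) plus one A tile (P x Q), matching the device code
size_t shmem = sizeof(CeedScalar) * (ntcol * BASIS_P * BASIS_NB_INTERP_N + BASIS_P * BASIS_Q);
magma_interp_nontensor_n<<<grid, block, shmem, stream>>>(n, d_interp, d_u, d_v);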