H(div) and H(curl) basis support for magma backend
sebastiangrimberg committed Oct 18, 2023
1 parent 1d90b3a commit ef845ad
Showing 10 changed files with 432 additions and 329 deletions.
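For orientation, a minimal sketch (not part of this commit) of how the new support is reached: the backend now registers "BasisCreateHdiv" and "BasisCreateHcurl" (see backends/magma/ceed-magma.c below), so bases created through libCEED's public CeedBasisCreateHdiv/CeedBasisCreateHcurl API can run on the magma backend. The resource string, sizes, and zero-filled arrays here are illustrative placeholders, not real basis data:

#include <ceed.h>

int main(void) {
  Ceed      ceed;
  CeedBasis basis;
  // Placeholder sizes for a single-component H(div) basis on a quad:
  // interp is (dim * num_qpts) x num_nodes, div is num_qpts x num_nodes
  const CeedInt    num_nodes = 4, num_qpts = 4;  // dim = 2
  const CeedScalar interp[2 * 4 * 4] = {0}, div[4 * 4] = {0};
  const CeedScalar q_ref[2 * 4] = {0}, q_weight[4] = {0};

  CeedInit("/gpu/cuda/magma", &ceed);  // "/gpu/hip/magma" for ROCm builds
  // Dispatches to CeedBasisCreateHdiv_Magma via the registration added below
  CeedBasisCreateHdiv(ceed, CEED_TOPOLOGY_QUAD, 1, num_nodes, num_qpts, interp, div, q_ref, q_weight, &basis);
  CeedBasisDestroy(&basis);
  CeedDestroy(&ceed);
  return 0;
}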
268 changes: 201 additions & 67 deletions backends/magma/ceed-magma-basis.c

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions backends/magma/ceed-magma.c
@@ -39,6 +39,8 @@ static int CeedInit_Magma(const char *resource, Ceed ceed) {

CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Magma));
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Magma));
+ CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", CeedBasisCreateHdiv_Magma));
+ CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHcurl", CeedBasisCreateHcurl_Magma));
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Magma));
return CEED_ERROR_SUCCESS;
}
13 changes: 9 additions & 4 deletions backends/magma/ceed-magma.h
@@ -60,21 +60,26 @@ typedef struct {
CeedMagmaModule module_weight, module_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES];
CeedMagmaFunction Interp[MAGMA_NONTENSOR_KERNEL_INSTANCES];
CeedMagmaFunction InterpTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
- CeedMagmaFunction Grad[MAGMA_NONTENSOR_KERNEL_INSTANCES];
- CeedMagmaFunction GradTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+ CeedMagmaFunction Deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+ CeedMagmaFunction DerivTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES];
CeedMagmaFunction Weight;
CeedInt NB_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
- CeedInt NB_grad[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_grad_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
+ CeedInt NB_deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES], NB_deriv_t[MAGMA_NONTENSOR_KERNEL_INSTANCES];
CeedScalar *d_interp;
CeedScalar *d_grad;
+ CeedScalar *d_div;
+ CeedScalar *d_curl;
CeedScalar *d_q_weight;
} CeedBasisNonTensor_Magma;

CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis);

CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
+ CEED_INTERN int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+ const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
+ CEED_INTERN int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+ const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);

CEED_INTERN magma_int_t magma_isdevptr(const void *);

110 changes: 0 additions & 110 deletions include/ceed/jit-source/magma/magma-basis-grad-nontensor.h

This file was deleted.

182 changes: 182 additions & 0 deletions include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h
@@ -0,0 +1,182 @@
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for MAGMA non-tensor basis interpolation and derivative kernels
#ifndef CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
#define CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H

#include "magma-common-nontensor.h"

//////////////////////////////////////////////////////////////////////////////////////////
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
CeedScalar *shared_data) {
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int id = blockIdx.x * blockDim.y + ty;
const int nblocks = (n + NB - 1) / NB;
const int myn = min(NB, n - id * NB);

dB += id * P * NB;
dC += id * Q * NB;

// A is P x Q
CeedScalar *sB = shared_data + ty * P * NB;
CeedScalar *sA = shared_data + blockDim.y * P * NB;

// read B once for all C's
if (id < nblocks) {
read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
}

// unrolling this loop yields a dramatic performance drop with hipcc, so let the compiler decide (no pragma unroll)
for (int d = 0; d < Q_COMP; d++) {
// init rA, rC
CeedScalar rA[P], rC[NB];

// read A using all threads
read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

if (id < nblocks) {
mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

// write C
write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}

dA += Q * P;
dC += Q * n;

__syncthreads();
}
}

//////////////////////////////////////////////////////////////////////////////////////////
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
CeedScalar *shared_data) {
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int id = blockIdx.x * blockDim.y + ty;
const int nblocks = (n + NB - 1) / NB;
const int myn = min(NB, n - id * NB);

dB += id * P * NB;
dC += id * Q * NB;

// A is P x Q
CeedScalar *sA = shared_data;
CeedScalar *sB = shared_data + ty * P * NB;

// init rA, rC
CeedScalar rA[P], rC[NB];

// read A using all threads
read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
__syncthreads();

// terminate threads with no work
if (id >= nblocks) return;

// read B once for all C's
read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
__syncthreads();

mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

// write C
write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}

//////////////////////////////////////////////////////////////////////////////////////////
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
CeedScalar *shared_data) {
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int id = blockIdx.x * blockDim.y + ty;
const int nblocks = (n + NB - 1) / NB;
const int myn = min(NB, n - id * NB);

// terminate threads with no work
if (id >= nblocks) return;

dB += id * Q * NB;
dC += id * P * NB;

// A is P x Q
CeedScalar *sB = shared_data + ty * Q * NB;

// init rC
CeedScalar rC[NB] = {0.0};

// unrolling this loop yields a dramatic performance drop with hipcc, so let the compiler decide (no pragma unroll)
for (int d = 0; d < Q_COMP; d++) {
// init rA
CeedScalar rA[Q];

// read A
read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, NB>(tx, dA, rA);

// read B
read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
__syncthreads();

addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

dA += P * Q;
dB += Q * n;

__syncthreads();
}

// write C
write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

if (BASIS_Q_COMP_INTERP == 1) {
magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
} else {
magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
}
}

////////////////////////////////////////////////////////////////////////////////
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
}

////////////////////////////////////////////////////////////////////////////////
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

if (BASIS_Q_COMP_DERIV == 1) {
magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
} else {
magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
}
}

////////////////////////////////////////////////////////////////////////////////
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
}

#endif // CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
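As read from the device routines above (a summary, not documentation from the commit), one set of kernels covers any number of quadrature components: with A_d the P x Q basis matrix for component d and B the blocked element data,

C_d = A_d^T B,  d = 0, ..., Q_COMP - 1   (notranspose path)
C = sum_{d=0}^{Q_COMP - 1} A_d B_d       (transpose path)

which is what lets the same source serve interpolation as well as gradient, divergence, and curl evaluation. The BASIS_* symbols are deliberately left undefined here; like the other headers under include/ceed/jit-source/magma, this file is JIT-compiled by the backend with basis-specific values supplied as compile-time definitions. A hypothetical instance follows; the values are illustrative only, with the actual choices made by the backend (cf. the NB_* fields in CeedBasisNonTensor_Magma above):

// Illustrative JIT-time definitions for one non-tensor basis instance
#define BASIS_P             12  // nodes per element
#define BASIS_Q             16  // quadrature points per element
#define BASIS_Q_COMP_INTERP 1   // quadrature components of interpolation
#define BASIS_Q_COMP_DERIV  2   // quadrature components of the derivative
#define BASIS_NB_INTERP_N   4   // column blocking factors (NB in the kernels above)
#define BASIS_NB_INTERP_T   4
#define BASIS_NB_DERIV_N    4
#define BASIS_NB_DERIV_T    4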