Skip to content

Commit

Permalink
Merge remote-tracking branch 'argonne-lcf-libceed/occa-backend-update…
Browse files Browse the repository at this point in the history
…' into occa-backend-update
  • Loading branch information
jeremylt committed Oct 11, 2022
2 parents 9e201c8 + 8801fe3 commit d8f39b6
Show file tree
Hide file tree
Showing 25 changed files with 1,237 additions and 333 deletions.
11 changes: 9 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -92,27 +92,33 @@ AFLAGS = -fsanitize=address #-fsanitize=undefined -fno-omit-frame-pointer

# Note: Intel oneAPI C/C++ compiler is now icx/icpx
CC_VENDOR := $(subst icc_orig,icc,$(subst oneAPI,icc,$(firstword $(filter gcc clang icc icc_orig oneAPI XL,$(subst -, ,$(shell $(CC) --version))))))
FC_VENDOR := $(if $(FC),$(firstword $(filter GNU ifort XL,$(shell $(FC) --version 2>&1 || $(FC) -qversion))))
FC_VENDOR := $(if $(FC),$(firstword $(filter GNU ifort ifx XL,$(shell $(FC) --version 2>&1 || $(FC) -qversion))))

# Default extra flags by vendor
MARCHFLAG.gcc := -march=native
MARCHFLAG.clang := $(MARCHFLAG.gcc)
MARCHFLAG.icc :=
MARCHFLAG.oneAPI := $(MARCHFLAG.clang)
OMP_SIMD_FLAG.gcc := -fopenmp-simd
OMP_SIMD_FLAG.clang := $(OMP_SIMD_FLAG.gcc)
OMP_SIMD_FLAG.icc := -qopenmp-simd
OMP_SIMD_FLAG.oneAPI := $(OMP_SIMD_FLAG.clang)
OPT.gcc := -ffp-contract=fast
OPT.clang := $(OPT.gcc)
OPT.oneAPI := $(OPT.clang)
CFLAGS.gcc := -fPIC -std=c99 -Wall -Wextra -Wno-unused-parameter -MMD -MP
CFLAGS.clang := $(CFLAGS.gcc)
CFLAGS.icc := $(CFLAGS.gcc)
CFLAGS.oneAPI := $(CFLAGS.clang)
CFLAGS.XL := -qpic -MMD
CXXFLAGS.gcc := -fPIC -std=c++11 -Wall -Wextra -Wno-unused-parameter -MMD -MP
CXXFLAGS.clang := $(CXXFLAGS.gcc)
CXXFLAGS.icc := $(CXXFLAGS.gcc)
CXXFLAGS.oneAPI := $(CXXFLAGS.clang)
CXXFLAGS.XL := -qpic -std=c++11 -MMD
FFLAGS.GNU := -fPIC -cpp -Wall -Wextra -Wno-unused-parameter -Wno-unused-dummy-argument -MMD -MP
FFLAGS.ifort := -fPIC -cpp
FFLAGS.ifx := $(FFLAGS.ifort)
FFLAGS.XL := -qpic -ffree-form -qpreprocess -qextname -MMD

# This check works with compilers that use gcc and clang. It fails with some
Expand Down Expand Up @@ -371,7 +377,8 @@ OCCA_BACKENDS = /cpu/self/occa
ifneq ($(wildcard $(OCCA_DIR)/lib/libocca.*),)
OCCA_MODES := $(shell $(OCCA_DIR)/bin/occa modes)
OCCA_BACKENDS += $(if $(filter OpenMP,$(OCCA_MODES)),/cpu/openmp/occa)
# OCCA_BACKENDS += $(if $(filter OpenCL,$(OCCA_MODES)),/gpu/opencl/occa)
OCCA_BACKENDS += $(if $(filter dpcpp,$(OCCA_MODES)),/gpu/dpcpp/occa)
OCCA_BACKENDS += $(if $(filter OpenCL,$(OCCA_MODES)),/gpu/opencl/occa)
OCCA_BACKENDS += $(if $(filter HIP,$(OCCA_MODES)),/gpu/hip/occa)
OCCA_BACKENDS += $(if $(filter CUDA,$(OCCA_MODES)),/gpu/cuda/occa)

Expand Down
2 changes: 1 addition & 1 deletion backends/ceed-backend-list.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ MACRO(CeedRegister_Magma, 2, "/gpu/cuda/magma", "/gpu/hip/magma")
MACRO(CeedRegister_Magma_Det, 2, "/gpu/cuda/magma/det", "/gpu/hip/magma/det")
MACRO(CeedRegister_Memcheck_Blocked, 1, "/cpu/self/memcheck/blocked")
MACRO(CeedRegister_Memcheck_Serial, 1, "/cpu/self/memcheck/serial")
MACRO(CeedRegister_Occa, 4, "/cpu/self/occa", "/cpu/openmp/occa", "/gpu/hip/occa", "/gpu/cuda/occa")
MACRO(CeedRegister_Occa, 6, "/cpu/self/occa", "/cpu/openmp/occa", "/gpu/dpcpp/occa", "/gpu/opencl/occa", "/gpu/hip/occa", "/gpu/cuda/occa")
MACRO(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
MACRO(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
MACRO(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
Expand Down
94 changes: 42 additions & 52 deletions backends/occa/ceed-occa-elem-restriction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// testbed platforms, in support of the nation's exascale computing imperative.

#include <map>
#include <cstring>

#include "./ceed-occa-elem-restriction.hpp"
#include "./ceed-occa-kernels.hpp"
Expand Down Expand Up @@ -54,8 +55,6 @@ namespace ceed {
}

setupTransposeIndices();

setupKernelBuilders();
}

void ElemRestriction::setupFromHostMemory(CeedCopyMode copyMode,
Expand All @@ -69,7 +68,7 @@ namespace ceed {
} else {
const size_t bytes = entries * sizeof(CeedInt);
hostIndices = (CeedInt*) ::malloc(bytes);
::memcpy(hostIndices, indices_h, bytes);
std::memcpy(hostIndices, indices_h, bytes);
}

if (hostIndices) {
Expand Down Expand Up @@ -102,7 +101,7 @@ namespace ceed {
const CeedInt elementEntryCount = ceedElementCount * ceedElementSize;

bool *indexIsUsed = new bool[ceedLVectorSize];
::memset(indexIsUsed, 0, ceedLVectorSize * sizeof(bool));
std::memset(indexIsUsed, 0, ceedLVectorSize * sizeof(bool));

for (CeedInt i = 0; i < elementEntryCount; ++i) {
indexIsUsed[hostIndices[i]] = true;
Expand All @@ -119,7 +118,7 @@ namespace ceed {
CeedInt *transposeDofOffsets_h = new CeedInt[dofOffsetCount];
CeedInt *transposeDofIndices_h = new CeedInt[elementEntryCount];

::memset(transposeDofOffsets_h, 0, dofOffsetCount * sizeof(CeedInt));
std::memset(transposeDofOffsets_h, 0, dofOffsetCount * sizeof(CeedInt));

// Compute ids
CeedInt offsetId = 0;
Expand Down Expand Up @@ -175,23 +174,22 @@ namespace ceed {
delete [] transposeDofIndices_h;
}

void ElemRestriction::setupKernelBuilders() {
::occa::properties kernelProps;
kernelProps["defines/CeedInt"] = ::occa::dtype::get<CeedInt>().name();
kernelProps["defines/CeedScalar"] = ::occa::dtype::get<CeedScalar>().name();

kernelProps["defines/COMPONENT_COUNT"] = ceedComponentCount;
kernelProps["defines/ELEMENT_SIZE"] = ceedElementSize;
kernelProps["defines/TILE_SIZE"] = 64;
kernelProps["defines/USES_INDICES"] = usesIndices();

applyKernelBuilder = ::occa::kernelBuilder::fromString(
occa_elem_restriction_source, "applyRestriction", kernelProps
);

applyTransposeKernelBuilder = ::occa::kernelBuilder::fromString(
occa_elem_restriction_source, "applyRestrictionTranspose", kernelProps
);
void ElemRestriction::setKernelProperties() {
kernelProperties["defines/CeedInt"] = ::occa::dtype::get<CeedInt>().name();
kernelProperties["defines/CeedScalar"] = ::occa::dtype::get<CeedScalar>().name();
kernelProperties["defines/COMPONENT_COUNT"] = ceedComponentCount;
kernelProperties["defines/ELEMENT_SIZE"] = ceedElementSize;
kernelProperties["defines/TILE_SIZE"] = 64;
kernelProperties["defines/USES_INDICES"] = usesIndices();
kernelProperties["defines/USER_STRIDES"] = StrideType::USER_STRIDES;
kernelProperties["defines/NOT_STRIDED"] = StrideType::NOT_STRIDED;
kernelProperties["defines/BACKEND_STRIDES"] = StrideType::BACKEND_STRIDES;
kernelProperties["defines/STRIDE_TYPE"] = ceedStrideType;
kernelProperties["defines/NODE_COUNT"] = transposeQuadIndices.length();
kernelProperties["defines/NODE_STRIDE"] = ceedNodeStride;
kernelProperties["defines/COMPONENT_STRIDE"] = ceedComponentStride;
kernelProperties["defines/ELEMENT_STRIDE"] = ceedElementStride;
kernelProperties["defines/UNSTRIDED_COMPONENT_STRIDE"] = ceedUnstridedComponentStride;
}

ElemRestriction* ElemRestriction::getElemRestriction(CeedElemRestriction r,
Expand Down Expand Up @@ -300,42 +298,34 @@ namespace ceed {
Vector &v) {
const bool rIsTransposed = (rTransposeMode != CEED_NOTRANSPOSE);

::occa::properties kernelProps;
kernelProps["defines/USER_STRIDES"] = StrideType::USER_STRIDES;
kernelProps["defines/NOT_STRIDED"] = StrideType::NOT_STRIDED;
kernelProps["defines/BACKEND_STRIDES"] = StrideType::BACKEND_STRIDES;
kernelProps["defines/STRIDE_TYPE"] = ceedStrideType;

kernelProps["defines/NODE_COUNT"] = transposeQuadIndices.length();
kernelProps["defines/NODE_STRIDE"] = ceedNodeStride;
kernelProps["defines/COMPONENT_STRIDE"] = ceedComponentStride;
kernelProps["defines/ELEMENT_STRIDE"] = ceedElementStride;
kernelProps["defines/UNSTRIDED_COMPONENT_STRIDE"] = ceedUnstridedComponentStride;

// Todo: refactor
if (rIsTransposed) {
::occa::kernel applyTranspose = applyTransposeKernelBuilder.build(
getDevice(),
kernelProps
);

applyTranspose(ceedElementCount,
transposeQuadIndices,
transposeDofOffsets,
transposeDofIndices,
u.getConstKernelArg(),
v.getKernelArg());
if(!restrictionTransposeKernel.isInitialized()) {
setKernelProperties();
restrictionTransposeKernel = getDevice().buildKernelFromString(
occa_elem_restriction_source,
"applyRestrictionTranspose",
kernelProperties);
}
restrictionTransposeKernel(ceedElementCount,
transposeQuadIndices,
transposeDofOffsets,
transposeDofIndices,
u.getConstKernelArg(),
v.getKernelArg());
} else {
::occa::kernel apply = applyKernelBuilder.build(
getDevice(),
kernelProps
);

apply(ceedElementCount,
if(!restrictionKernel.isInitialized()) {
setKernelProperties();
restrictionKernel = getDevice().buildKernelFromString(
occa_elem_restriction_source,
"applyRestriction",
kernelProperties);
}
restrictionKernel(ceedElementCount,
indices,
u.getConstKernelArg(),
v.getKernelArg());
}

return CEED_ERROR_SUCCESS;
}

Expand Down
7 changes: 4 additions & 3 deletions backends/occa/ceed-occa-elem-restriction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,9 @@ namespace ceed {
::occa::memory transposeDofOffsets;
::occa::memory transposeDofIndices;

::occa::kernelBuilder applyKernelBuilder;
::occa::kernelBuilder applyTransposeKernelBuilder;
::occa::json kernelProperties;
::occa::kernel restrictionKernel;
::occa::kernel restrictionTransposeKernel;

ElemRestriction();

Expand All @@ -74,7 +75,7 @@ namespace ceed {

void setupTransposeIndices();

void setupKernelBuilders();
void setKernelProperties();

static ElemRestriction* getElemRestriction(CeedElemRestriction r,
const bool assertValid = true);
Expand Down
1 change: 1 addition & 0 deletions backends/occa/ceed-occa-kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#define CEED_OCCA_KERNELS_HEADER

#include "./kernels/elem-restriction.hpp"
#include "./kernels/set-value.hpp"
#include "./kernels/simplex-basis.hpp"
#include "./kernels/tensor-basis.hpp"

Expand Down
6 changes: 6 additions & 0 deletions backends/occa/ceed-occa-operator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ namespace ceed {
ierr = CeedOperatorSetData(op, operator_); CeedChk(ierr);

CeedOccaRegisterFunction(op, "LinearAssembleQFunction", Operator::ceedLinearAssembleQFunction);
// Register the Update variant with its own stub so that the reported error
// names LinearAssembleQFunctionUpdate rather than LinearAssembleQFunction.
CeedOccaRegisterFunction(op, "LinearAssembleQFunctionUpdate", Operator::ceedLinearAssembleQFunctionUpdate);
CeedOccaRegisterFunction(op, "LinearAssembleAddDiagonal", Operator::ceedLinearAssembleAddDiagonal);
CeedOccaRegisterFunction(op, "LinearAssembleAddPointBlockDiagonal", Operator::ceedLinearAssembleAddPointBlockDiagonal);
CeedOccaRegisterFunction(op, "CreateFDMElementInverse", Operator::ceedCreateFDMElementInverse);
Expand All @@ -143,10 +144,15 @@ namespace ceed {
return staticCeedError("(OCCA) Backend does not implement LinearAssembleQFunction");
}

// Stub for LinearAssembleQFunctionUpdate: performs no assembly and returns
// whatever staticCeedError yields for the "not implemented" message.
int Operator::ceedLinearAssembleQFunctionUpdate(CeedOperator op) {
  const int status = staticCeedError("(OCCA) Backend does not implement LinearAssembleQFunctionUpdate");
  return status;
}

// Stub for LinearAssembleAddDiagonal: performs no assembly and returns
// whatever staticCeedError yields for the "not implemented" message.
int Operator::ceedLinearAssembleAddDiagonal(CeedOperator op) {
  const int status = staticCeedError("(OCCA) Backend does not implement LinearAssembleDiagonal");
  return status;
}


// Stub for LinearAssembleAddPointBlockDiagonal: performs no assembly and
// returns whatever staticCeedError yields for the "not implemented" message.
int Operator::ceedLinearAssembleAddPointBlockDiagonal(CeedOperator op) {
  const int status = staticCeedError("(OCCA) Backend does not implement LinearAssemblePointBlockDiagonal");
  return status;
}
Expand Down
1 change: 1 addition & 0 deletions backends/occa/ceed-occa-operator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ namespace ceed {
static int ceedCreateComposite(CeedOperator op);

static int ceedLinearAssembleQFunction(CeedOperator op);
static int ceedLinearAssembleQFunctionUpdate(CeedOperator op);
static int ceedLinearAssembleAddDiagonal(CeedOperator op);
static int ceedLinearAssembleAddPointBlockDiagonal(CeedOperator op);
static int ceedCreateFDMElementInverse(CeedOperator op);
Expand Down
30 changes: 16 additions & 14 deletions backends/occa/ceed-occa-qfunction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,19 @@
// testbed platforms, in support of the nation's exascale computing imperative.

#include <sstream>
#include <string>

#include "ceed-occa-qfunction.hpp"
#include "ceed-occa-qfunctioncontext.hpp"
#include "ceed-occa-vector.hpp"

namespace ceed {
namespace occa {
QFunction::QFunction(const std::string &source) :
QFunction::QFunction(const std::string &source,
const std::string& function_name) :
ceedIsIdentity(false) {

const size_t colonIndex = source.find(':');
filename = source.substr(0, colonIndex);
qFunctionName = source.substr(colonIndex + 1);
filename = source;
qFunctionName = function_name;
}

QFunction* QFunction::getQFunction(CeedQFunction qf,
Expand Down Expand Up @@ -116,7 +116,7 @@ namespace ceed {
// Properties only used in the QFunction kernel source
props["defines/OCCA_Q"] = Q;

const std::string kernelName = "qFunctionKernel";
const std::string kernelName = "qf_" + qFunctionName;

qFunctionKernel = (
getDevice().buildKernelFromString(getKernelSource(kernelName, Q),
Expand Down Expand Up @@ -154,8 +154,8 @@ namespace ceed {
// Set and define in for the q point
for (int i = 0; i < args.inputCount(); ++i) {
const CeedInt fieldSize = args.getQfInput(i).size;
const std::string qIn_i = "qIn" + ::occa::toString(i);
const std::string in_i = "in" + ::occa::toString(i);
const std::string qIn_i = "qIn" + std::to_string(i);
const std::string in_i = "in" + std::to_string(i);

ss << " CeedScalar " << qIn_i << "[" << fieldSize << "];" << std::endl
<< " in[" << i << "] = " << qIn_i << ";" << std::endl
Expand All @@ -168,7 +168,7 @@ namespace ceed {
// Set out for the q point
for (int i = 0; i < args.outputCount(); ++i) {
const CeedInt fieldSize = args.getQfOutput(i).size;
const std::string qOut_i = "qOut" + ::occa::toString(i);
const std::string qOut_i = "qOut" + std::to_string(i);

ss << " CeedScalar " << qOut_i << "[" << fieldSize << "];" << std::endl
<< " out[" << i << "] = " << qOut_i << ";" << std::endl;
Expand All @@ -179,8 +179,8 @@ namespace ceed {
// Copy out for the q point
for (int i = 0; i < args.outputCount(); ++i) {
const CeedInt fieldSize = args.getQfOutput(i).size;
const std::string qOut_i = "qOut" + ::occa::toString(i);
const std::string out_i = "out" + ::occa::toString(i);
const std::string qOut_i = "qOut" + std::to_string(i);
const std::string out_i = "out" + std::to_string(i);

ss << " for (int qi = 0; qi < " << fieldSize << "; ++qi) {" << std::endl
<< " " << out_i << "[q + (OCCA_Q * qi)] = " << qOut_i << "[qi];" << std::endl
Expand All @@ -204,15 +204,15 @@ namespace ceed {
for (CeedInt i = 0; i < args.inputCount(); i++) {
Vector *u = Vector::from(U[i]);
if (!u) {
return ceedError("Incorrect qFunction input field: U[" + ::occa::toString(i) + "]");
return ceedError("Incorrect qFunction input field: U[" + std::to_string(i) + "]");
}
qFunctionKernel.pushArg(u->getConstKernelArg());
}

for (CeedInt i = 0; i < args.outputCount(); i++) {
Vector *v = Vector::from(V[i]);
if (!v) {
return ceedError("Incorrect qFunction output field: V[" + ::occa::toString(i) + "]");
return ceedError("Incorrect qFunction output field: V[" + std::to_string(i) + "]");
}
qFunctionKernel.pushArg(v->getKernelArg());
}
Expand Down Expand Up @@ -242,8 +242,10 @@ namespace ceed {
ierr = CeedGetData(ceed, &context); CeedChk(ierr);
char *source;
ierr = CeedQFunctionGetSourcePath(qf, &source); CeedChk(ierr);
char *function_name;
ierr = CeedQFunctionGetKernelName(qf,&function_name); CeedChk(ierr);

QFunction *qFunction = new QFunction(source);
QFunction *qFunction = new QFunction(source,function_name);
ierr = CeedQFunctionSetData(qf, qFunction); CeedChk(ierr);

CeedOccaRegisterFunction(qf, "Apply", QFunction::ceedApply);
Expand Down
3 changes: 2 additions & 1 deletion backends/occa/ceed-occa-qfunction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ namespace ceed {
CeedQFunctionContext qFunctionContext;
QFunctionArgs args;

QFunction(const std::string &source);
QFunction(const std::string &source,
const std::string &function_name);

static QFunction* getQFunction(CeedQFunction qf,
const bool assertValid = true);
Expand Down
Loading

0 comments on commit d8f39b6

Please sign in to comment.