Skip to content

Commit

Permalink
Merge branch 'develop' into acreyes/par_reduce-flatloops
Browse files Browse the repository at this point in the history
  • Loading branch information
pgrete authored Jul 4, 2024
2 parents 56c422d + 2e0e981 commit 521419c
Show file tree
Hide file tree
Showing 9 changed files with 114 additions and 20 deletions.
58 changes: 58 additions & 0 deletions .github/workflows/ci-extended.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ env:
CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
MACHINE_CFG: cmake/machinecfg/CI.cmake
OMPI_MCA_mpi_common_cuda_event_max: 1000
# https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
OMPI_MCA_btl_vader_single_copy_mechanism: none

jobs:
perf-and-regression:
Expand Down Expand Up @@ -121,3 +123,59 @@ jobs:
example/advection/ascent_render_57.png
retention-days: 3

perf-and-regression-amdgpu:
strategy:
matrix:
parallel: ['serial', 'mpi']
runs-on: [self-hosted, navi1030]
container:
image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
# Map to the local user id on the CI machine to allow writing to the build cache,
# and forward device handles so the AMD GPU is accessible within the container
options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined
env:
CMAKE_GENERATOR: Ninja
CMAKE_BUILD_PARALLEL_LEVEL: 8 # num threads for build
steps:
- uses: actions/checkout@v3
with:
submodules: 'true'

- name: Setup cache for gold standard
uses: actions/cache@v3
with:
path: tst/regression/gold_standard/
key: gold-standard

- name: Configure
run: |
cmake -B build \
-DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DMACHINE_VARIANT=hip-${{ matrix.parallel }} \
-DCMAKE_CXX_COMPILER=hipcc
- name: Build
run: cmake --build build

# run performance "unit" tests (none use MPI)
- name: Performance tests
if: ${{ matrix.parallel == 'serial' }}
run: |
cd build
ctest -L performance -LE perf-reg
# run regression tests
- name: Regression tests
run: |
cd build
ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600
- uses: actions/upload-artifact@v3
with:
name: log-and-convergence-${{ matrix.parallel }}
path: |
build/CMakeFiles/CMakeOutput.log
build/tst/regression/outputs/advection_convergence*/advection-errors.dat
build/tst/regression/outputs/advection_convergence*/advection-errors.png
retention-days: 3
43 changes: 43 additions & 0 deletions .github/workflows/ci-short.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ env:
CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
MACHINE_CFG: cmake/machinecfg/CI.cmake
OMPI_MCA_mpi_common_cuda_event_max: 1000
# https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
OMPI_MCA_btl_vader_single_copy_mechanism: none

jobs:
style:
Expand Down Expand Up @@ -130,3 +132,44 @@ jobs:
build/profile.txt
retention-days: 3

integration-amdgpu:
runs-on: [self-hosted, navi1030]
container:
image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
# Map to the local user id on the CI machine to allow writing to the build cache,
# and forward device handles so the AMD GPU is accessible within the container
options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined
env:
CMAKE_GENERATOR: Ninja
CMAKE_BUILD_PARALLEL_LEVEL: 8 # num threads for build
steps:
- uses: actions/checkout@v3
with:
submodules: 'true'
- name: Configure
run: |
cmake -B build \
-DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DMACHINE_VARIANT=hip-mpi \
-DCMAKE_CXX_COMPILER=hipcc
# Test example with "variables" and output
- name: advection
run: |
cmake --build build -t advection-example
cd build
ctest -R regression_mpi_test:output_hdf5
# Test example with swarms
- name: particle-leapfrog
run: |
cmake --build build -t particle-leapfrog
cd build
ctest -R regression_mpi_test:particle_leapfrog
- uses: actions/upload-artifact@v3
with:
name: configure-log-integration-amdgpu
path: |
build/CMakeFiles/CMakeOutput.log
retention-days: 3

2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
- [[PR 1004]](https://github.com/parthenon-hpc-lab/parthenon/pull/1004) Allow parameter modification from an input file for restarts

### Fixed (not changing behavior/API/variables/...)
- [[PR 1131]](https://github.com/parthenon-hpc-lab/parthenon/pull/1131) Make deallocation of fine and sparse fields work
- [[PR 1127]](https://github.com/parthenon-hpc-lab/parthenon/pull/1127) Add WithFluxes to IsRefined check
- [[PR 1111]](https://github.com/parthenon-hpc-lab/parthenon/pull/1111) Fix undefined behavior due to bitshift of negative number in LogicalLocation
- [[PR 1092]](https://github.com/parthenon-hpc-lab/parthenon/pull/1092) Updates to DataCollection and MeshData to remove requirement of predefining MeshBlockData
Expand All @@ -57,6 +58,7 @@
- [[PR 1031]](https://github.com/parthenon-hpc-lab/parthenon/pull/1031) Fix bug in non-cell centered AMR

### Infrastructure (changes irrelevant to downstream codes)
- [[PR 1117]](https://github.com/parthenon-hpc-lab/parthenon/pull/1117) Enable CI pipelines on AMD GPUs with ROCM/HIP
- [[PR 1114]](https://github.com/parthenon-hpc-lab/parthenon/pull/1114) Enable sanitizers for extended CI host build
- [[PR 1123]](https://github.com/parthenon-hpc-lab/parthenon/pull/1123) Default initialize ProResInfo.dir
- [[PR 1121]](https://github.com/parthenon-hpc-lab/parthenon/pull/1121) Default initialize BndInfo.dir
Expand Down
2 changes: 1 addition & 1 deletion cmake/TestSetup.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ function(setup_test_parallel nproc dir arg extra_labels)
list(APPEND labels "${extra_labels}")

if(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_HIP)
set(PARTHENON_KOKKOS_TEST_ARGS "--kokkos-num-devices=${NUM_GPU_DEVICES_PER_NODE}")
set(PARTHENON_KOKKOS_TEST_ARGS "--kokkos-map-device-id-by=mpi_rank")
list(APPEND labels "cuda")
endif()
if (Kokkos_ENABLE_OPENMP)
Expand Down
5 changes: 3 additions & 2 deletions cmake/machinecfg/GitHubActions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,10 @@ if (${MACHINE_VARIANT} MATCHES "cuda")
set(MACHINE_CXX_FLAGS "${MACHINE_CXX_FLAGS} -Wno-unknown-cuda-version")
endif()
elseif (${MACHINE_VARIANT} MATCHES "hip")
# using an arbitrary arch as GitHub Action runners don't have GPUs
set(Kokkos_ARCH_VEGA908 ON CACHE BOOL "GPU architecture")
# using an arch that matches Hamilton at Hamburg Obs
set(Kokkos_ARCH_NAVI1030 ON CACHE BOOL "GPU architecture")
set(Kokkos_ENABLE_HIP ON CACHE BOOL "Enable HIP")
set(Kokkos_ENABLE_ZEN3 ON CACHE BOOL "Enable Zen3")
else()
set(MACHINE_CXX_FLAGS "${MACHINE_CXX_FLAGS} -fopenmp-simd")
endif()
Expand Down
3 changes: 3 additions & 0 deletions scripts/docker/Dockerfile.hip-rocm
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,6 @@ RUN cd /tmp && \
ENV LDFLAGS="-lopen-pal"

RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10

# uid 1000 matches the user that runs the container on the CI host
RUN useradd --create-home --shell /bin/bash -u 1000 -G render ci
9 changes: 0 additions & 9 deletions src/interface/metadata.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,15 +263,6 @@ bool Metadata::IsValid(bool throw_on_fail) const {
}
}

// Limitations on fine fields
if (CountSet({Fine, Sparse}) > 1) {
valid = false;
if (throw_on_fail) {
PARTHENON_THROW(
"Sparse deallocation routine is not written to handle fine fields.");
}
}

return valid;
}

Expand Down
8 changes: 3 additions & 5 deletions src/interface/update.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,11 @@ TaskStatus SparseDealloc(MeshData<Real> *md) {
const auto &var = pack(b, v);
const Real threshold = var.deallocation_threshold;
bool all_zero = true;
const auto &var_raw = var.data();
Kokkos::parallel_reduce(
Kokkos::TeamThreadRange<>(team_member, NkNjNi),
Kokkos::TeamThreadRange<>(team_member, var.size()),
[&](const int idx, bool &lall_zero) {
const int k = kb.s + idx / NjNi;
const int j = jb.s + (idx % NjNi) / Ni;
const int i = ib.s + idx % Ni;
if (std::abs(var(k, j, i)) > threshold) {
if (std::abs(var_raw[idx]) > threshold) {
lall_zero = false;
return;
}
Expand Down
4 changes: 1 addition & 3 deletions tst/regression/utils/test_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,7 @@ def __init__(self, run_test_path, **kwargs):
try:
parthenon_path = os.path.realpath(__file__)
idx = parthenon_path.rindex("/parthenon/")
self.parameters.parthenon_path = os.path.join(
parthenon_path[:idx], "parthenon"
)
self.parameters.parthenon_path = parthenon_path[: idx + 10]
except ValueError:
baseDir = os.path.dirname(__file__)
self.parameters.parthenon_path = os.path.abspath(baseDir + "/../../../")
Expand Down

0 comments on commit 521419c

Please sign in to comment.