From 7b26d314f4d1f386051dfa3a0fcf823471a1b163 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 25 Jun 2024 17:35:27 -0600 Subject: [PATCH 1/3] allow size of inner loop to change --- src/interface/metadata.cpp | 9 --------- src/interface/update.cpp | 8 +++----- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/src/interface/metadata.cpp b/src/interface/metadata.cpp index ed87a00f8345..fce88334c652 100644 --- a/src/interface/metadata.cpp +++ b/src/interface/metadata.cpp @@ -263,15 +263,6 @@ bool Metadata::IsValid(bool throw_on_fail) const { } } - // Limitations on fine fields - if (CountSet({Fine, Sparse}) > 1) { - valid = false; - if (throw_on_fail) { - PARTHENON_THROW( - "Sparse deallocation routine is not written to handle fine fields."); - } - } - return valid; } diff --git a/src/interface/update.cpp b/src/interface/update.cpp index 508071c951ca..cf79d9980cba 100644 --- a/src/interface/update.cpp +++ b/src/interface/update.cpp @@ -174,13 +174,11 @@ TaskStatus SparseDealloc(MeshData *md) { const auto &var = pack(b, v); const Real threshold = var.deallocation_threshold; bool all_zero = true; + const auto &var_raw = var.data(); Kokkos::parallel_reduce( - Kokkos::TeamThreadRange<>(team_member, NkNjNi), + Kokkos::TeamThreadRange<>(team_member, var.size()), [&](const int idx, bool &lall_zero) { - const int k = kb.s + idx / NjNi; - const int j = jb.s + (idx % NjNi) / Ni; - const int i = ib.s + idx % Ni; - if (std::abs(var(k, j, i)) > threshold) { + if (std::abs(var_raw[idx]) > threshold) { lall_zero = false; return; } From 864c3a171acf55a80c0e4ce3b59bb7755079bc57 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 25 Jun 2024 17:40:05 -0600 Subject: [PATCH 2/3] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 68b3b1121818..829511724d61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,7 @@ - [[PR 1004]](https://github.com/parthenon-hpc-lab/parthenon/pull/1004) Allow parameter 
modification from an input file for restarts ### Fixed (not changing behavior/API/variables/...) +- [[PR 1131]](https://github.com/parthenon-hpc-lab/parthenon/pull/1131) Make deallocation of fine and sparse fields work - [[PR 1127]](https://github.com/parthenon-hpc-lab/parthenon/pull/1127) Add WithFluxes to IsRefined check - [[PR 1111]](https://github.com/parthenon-hpc-lab/parthenon/pull/1111) Fix undefined behavior due to bitshift of negative number in LogicalLocation - [[PR 1092]](https://github.com/parthenon-hpc-lab/parthenon/pull/1092) Updates to DataCollection and MeshData to remove requirement of predefining MeshBlockData From 2e0e981e4253feba2efb8e3bbcc4ff346e30fd43 Mon Sep 17 00:00:00 2001 From: Philipp Grete Date: Thu, 4 Jul 2024 09:06:01 +0200 Subject: [PATCH 3/3] Add CI on AMD GPUs (#1117) * Update env vars for amd ci * Fix edge case if parthenon is located at /parthenon * First attempt at AMD GPU CI * Fix formatting * Fix userid * Attempt to debug GPU visibility in container * Update deprecated Kokkos option * use local user in docker img * Enable extended pipelines * Change default user only for custom runners --- .github/workflows/ci-extended.yml | 58 ++++++++++++++++++++++++++++ .github/workflows/ci-short.yml | 43 +++++++++++++++++++++ CHANGELOG.md | 1 + cmake/TestSetup.cmake | 2 +- cmake/machinecfg/GitHubActions.cmake | 5 ++- scripts/docker/Dockerfile.hip-rocm | 3 ++ tst/regression/utils/test_case.py | 4 +- 7 files changed, 110 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml index bfbba99ed1ca..8ca646cfc2eb 100644 --- a/.github/workflows/ci-extended.yml +++ b/.github/workflows/ci-extended.yml @@ -21,6 +21,8 @@ env: CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build MACHINE_CFG: cmake/machinecfg/CI.cmake OMPI_MCA_mpi_common_cuda_event_max: 1000 + # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231 + OMPI_MCA_btl_vader_single_copy_mechanism: none jobs: 
perf-and-regression: @@ -121,3 +123,59 @@ jobs: example/advection/ascent_render_57.png retention-days: 3 + perf-and-regression-amdgpu: + strategy: + matrix: + parallel: ['serial', 'mpi'] + runs-on: [self-hosted, navi1030] + container: + image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5 + # Map to local user id on CI machine to allow writing to build cache and + # forward device handles to access AMD GPU within container + options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined + env: + CMAKE_GENERATOR: Ninja + CMAKE_BUILD_PARALLEL_LEVEL: 8 # num threads for build + steps: + - uses: actions/checkout@v3 + with: + submodules: 'true' + + - name: Setup cache for gold standard + uses: actions/cache@v3 + with: + path: tst/regression/gold_standard/ + key: gold-standard + + - name: Configure + run: | + cmake -B build \ + -DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMACHINE_VARIANT=hip-${{ matrix.parallel }} \ + -DCMAKE_CXX_COMPILER=hipcc + + - name: Build + run: cmake --build build + + # run performance "unit" tests (none use MPI) + - name: Performance tests + if: ${{ matrix.parallel == 'serial' }} + run: | + cd build + ctest -L performance -LE perf-reg + + # run regression tests + - name: Regression tests + run: | + cd build + ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600 + + - uses: actions/upload-artifact@v3 + with: + name: log-and-convergence-${{ matrix.parallel }} + path: | + build/CMakeFiles/CMakeOutput.log + build/tst/regression/outputs/advection_convergence*/advection-errors.dat + build/tst/regression/outputs/advection_convergence*/advection-errors.png + retention-days: 3 diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml index adbb56287f6e..ecb4052411ee 100644 --- a/.github/workflows/ci-short.yml +++ b/.github/workflows/ci-short.yml @@ -13,6 +13,8 @@ env: CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build 
MACHINE_CFG: cmake/machinecfg/CI.cmake OMPI_MCA_mpi_common_cuda_event_max: 1000 + # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231 + OMPI_MCA_btl_vader_single_copy_mechanism: none jobs: style: @@ -130,3 +132,44 @@ jobs: build/profile.txt retention-days: 3 + integration-amdgpu: + runs-on: [self-hosted, navi1030] + container: + image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5 + # Map to local user id on CI machine to allow writing to build cache and + # forward device handles to access AMD GPU within container + options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined + env: + CMAKE_GENERATOR: Ninja + CMAKE_BUILD_PARALLEL_LEVEL: 8 # num threads for build + steps: + - uses: actions/checkout@v3 + with: + submodules: 'true' + - name: Configure + run: | + cmake -B build \ + -DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DMACHINE_VARIANT=hip-mpi \ + -DCMAKE_CXX_COMPILER=hipcc + # Test example with "variables" and output + - name: advection + run: | + cmake --build build -t advection-example + cd build + ctest -R regression_mpi_test:output_hdf5 + # Test example with swarms + - name: particle-leapfrog + run: | + cmake --build build -t particle-leapfrog + cd build + ctest -R regression_mpi_test:particle_leapfrog + + - uses: actions/upload-artifact@v3 + with: + name: configure-log-integration-amdgpu + path: | + build/CMakeFiles/CMakeOutput.log + retention-days: 3 + diff --git a/CHANGELOG.md b/CHANGELOG.md index 976bf37d3454..3e391c6d746d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -57,6 +57,7 @@ - [[PR 1031]](https://github.com/parthenon-hpc-lab/parthenon/pull/1031) Fix bug in non-cell centered AMR ### Infrastructure (changes irrelevant to downstream codes) +- [[PR 1117]](https://github.com/parthenon-hpc-lab/parthenon/pull/1117) Enable CI pipelines on AMD GPUs with ROCM/HIP - [[PR 1114]](https://github.com/parthenon-hpc-lab/parthenon/pull/1114) Enable 
sanitizers for extended CI host build - [[PR 1123]](https://github.com/parthenon-hpc-lab/parthenon/pull/1123) Default initialize ProResInfo.dir - [[PR 1121]](https://github.com/parthenon-hpc-lab/parthenon/pull/1121) Default initialize BndInfo.dir diff --git a/cmake/TestSetup.cmake b/cmake/TestSetup.cmake index dd2f8b05ec4b..005756d3ade5 100644 --- a/cmake/TestSetup.cmake +++ b/cmake/TestSetup.cmake @@ -152,7 +152,7 @@ function(setup_test_parallel nproc dir arg extra_labels) list(APPEND labels "${extra_labels}") if(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_HIP) - set(PARTHENON_KOKKOS_TEST_ARGS "--kokkos-num-devices=${NUM_GPU_DEVICES_PER_NODE}") + set(PARTHENON_KOKKOS_TEST_ARGS "--kokkos-map-device-id-by=mpi_rank") list(APPEND labels "cuda") endif() if (Kokkos_ENABLE_OPENMP) diff --git a/cmake/machinecfg/GitHubActions.cmake b/cmake/machinecfg/GitHubActions.cmake index fc91643a1a74..663dcb38d682 100644 --- a/cmake/machinecfg/GitHubActions.cmake +++ b/cmake/machinecfg/GitHubActions.cmake @@ -29,9 +29,10 @@ if (${MACHINE_VARIANT} MATCHES "cuda") set(MACHINE_CXX_FLAGS "${MACHINE_CXX_FLAGS} -Wno-unknown-cuda-version") endif() elseif (${MACHINE_VARIANT} MATCHES "hip") - # using an arbitrary arch as GitHub Action runners don't have GPUs - set(Kokkos_ARCH_VEGA908 ON CACHE BOOL "GPU architecture") + # using archs that match Hamilton at Hamburg Obs (Navi1030 GPU, Zen3 CPU); arch selection uses Kokkos_ARCH_*, not Kokkos_ENABLE_* + set(Kokkos_ARCH_NAVI1030 ON CACHE BOOL "GPU architecture") set(Kokkos_ENABLE_HIP ON CACHE BOOL "Enable HIP") + set(Kokkos_ARCH_ZEN3 ON CACHE BOOL "CPU architecture") else() set(MACHINE_CXX_FLAGS "${MACHINE_CXX_FLAGS} -fopenmp-simd") endif() diff --git a/scripts/docker/Dockerfile.hip-rocm b/scripts/docker/Dockerfile.hip-rocm index f586ade42104..5d9d5c765b6a 100644 --- a/scripts/docker/Dockerfile.hip-rocm +++ b/scripts/docker/Dockerfile.hip-rocm @@ -20,3 +20,6 @@ RUN cd /tmp && \ ENV LDFLAGS="-lopen-pal" RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10 + +# uid 1000 maps to the one running the container on the 
CI host +RUN useradd --create-home --shell /bin/bash -u 1000 -G render ci diff --git a/tst/regression/utils/test_case.py b/tst/regression/utils/test_case.py index 12302568fbd5..7b09d00358d5 100644 --- a/tst/regression/utils/test_case.py +++ b/tst/regression/utils/test_case.py @@ -89,9 +89,7 @@ def __init__(self, run_test_path, **kwargs): try: parthenon_path = os.path.realpath(__file__) idx = parthenon_path.rindex("/parthenon/") - self.parameters.parthenon_path = os.path.join( - parthenon_path[:idx], "parthenon" - ) + self.parameters.parthenon_path = parthenon_path[: idx + 10] except ValueError: baseDir = os.path.dirname(__file__) self.parameters.parthenon_path = os.path.abspath(baseDir + "/../../../")