From 7b26d314f4d1f386051dfa3a0fcf823471a1b163 Mon Sep 17 00:00:00 2001
From: Luke Roberts <lfroberts@lanl.gov>
Date: Tue, 25 Jun 2024 17:35:27 -0600
Subject: [PATCH 1/3] allow size of inner loop to change

---
 src/interface/metadata.cpp | 9 ---------
 src/interface/update.cpp   | 8 +++-----
 2 files changed, 3 insertions(+), 14 deletions(-)
diff --git a/src/interface/metadata.cpp b/src/interface/metadata.cpp
index ed87a00f8345..fce88334c652 100644
--- a/src/interface/metadata.cpp
+++ b/src/interface/metadata.cpp
@@ -263,15 +263,6 @@ bool Metadata::IsValid(bool throw_on_fail) const {
     }
   }
 
-  // Limitations on fine fields
-  if (CountSet({Fine, Sparse}) > 1) {
-    valid = false;
-    if (throw_on_fail) {
-      PARTHENON_THROW(
-          "Sparse deallocation routine is not written to handle fine fields.");
-    }
-  }
-
   return valid;
 }
 
diff --git a/src/interface/update.cpp b/src/interface/update.cpp
index 508071c951ca..cf79d9980cba 100644
--- a/src/interface/update.cpp
+++ b/src/interface/update.cpp
@@ -174,13 +174,11 @@ TaskStatus SparseDealloc(MeshData<Real> *md) {
           const auto &var = pack(b, v);
           const Real threshold = var.deallocation_threshold;
           bool all_zero = true;
+          const auto &var_raw = var.data();
           Kokkos::parallel_reduce(
-              Kokkos::TeamThreadRange<>(team_member, NkNjNi),
+              Kokkos::TeamThreadRange<>(team_member, var.size()),
               [&](const int idx, bool &lall_zero) {
-                const int k = kb.s + idx / NjNi;
-                const int j = jb.s + (idx % NjNi) / Ni;
-                const int i = ib.s + idx % Ni;
-                if (std::abs(var(k, j, i)) > threshold) {
+                if (std::abs(var_raw[idx]) > threshold) {
                   lall_zero = false;
                   return;
                 }

From 864c3a171acf55a80c0e4ce3b59bb7755079bc57 Mon Sep 17 00:00:00 2001
From: Luke Roberts <lfroberts@lanl.gov>
Date: Tue, 25 Jun 2024 17:40:05 -0600
Subject: [PATCH 2/3] changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 68b3b1121818..829511724d61 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -30,6 +30,7 @@
 - [[PR 1004]](https://github.com/parthenon-hpc-lab/parthenon/pull/1004) Allow parameter modification from an input file for restarts
 
 ### Fixed (not changing behavior/API/variables/...)
+- [[PR 1131]](https://github.com/parthenon-hpc-lab/parthenon/pull/1131) Make deallocation of fine and sparse fields work
 - [[PR 1127]](https://github.com/parthenon-hpc-lab/parthenon/pull/1127) Add WithFluxes to IsRefined check
 - [[PR 1111]](https://github.com/parthenon-hpc-lab/parthenon/pull/1111) Fix undefined behavior due to bitshift of negative number in LogicalLocation
 - [[PR 1092]](https://github.com/parthenon-hpc-lab/parthenon/pull/1092) Updates to DataCollection and MeshData to remove requirement of predefining MeshBlockData

From 2e0e981e4253feba2efb8e3bbcc4ff346e30fd43 Mon Sep 17 00:00:00 2001
From: Philipp Grete <pgrete@hs.uni-hamburg.de>
Date: Thu, 4 Jul 2024 09:06:01 +0200
Subject: [PATCH 3/3] Add CI on AMD GPUs (#1117)

* Update env vars for amd ci

* Fix edge case if parthenon is located at /parthenon

* First attempt at AMD GPU CI

* Fix formatting

* Fix userid

* Attempt to debug GPU visibility in container

* Update deprecated Kokkos option

* use local user in docker img

* Enable extended pipelines

* Change default user only for custom runners
---
 .github/workflows/ci-extended.yml    | 58 ++++++++++++++++++++++++++++
 .github/workflows/ci-short.yml       | 43 +++++++++++++++++++++
 CHANGELOG.md                         |  1 +
 cmake/TestSetup.cmake                |  2 +-
 cmake/machinecfg/GitHubActions.cmake |  5 ++-
 scripts/docker/Dockerfile.hip-rocm   |  3 ++
 tst/regression/utils/test_case.py    |  4 +-
 7 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml
index bfbba99ed1ca..8ca646cfc2eb 100644
--- a/.github/workflows/ci-extended.yml
+++ b/.github/workflows/ci-extended.yml
@@ -21,6 +21,8 @@ env:
   CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
   MACHINE_CFG: cmake/machinecfg/CI.cmake
   OMPI_MCA_mpi_common_cuda_event_max: 1000
+  # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
+  OMPI_MCA_btl_vader_single_copy_mechanism: none
 
 jobs:
   perf-and-regression:
@@ -121,3 +123,59 @@ jobs:
             example/advection/ascent_render_57.png
           retention-days: 3
 
+  perf-and-regression-amdgpu:
+    strategy:
+      matrix:
+        parallel: ['serial', 'mpi']
+    runs-on: [self-hosted, navi1030]
+    container:
+      image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
+      # Map to local user id on CI machine to allow writing to build cache and
+      # forward device handles to access AMD GPU within container
+      options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined
+    env:
+      CMAKE_GENERATOR: Ninja
+      CMAKE_BUILD_PARALLEL_LEVEL: 8 # num threads for build
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: 'true'
+
+      - name: Setup cache for gold standard
+        uses: actions/cache@v3
+        with:
+          path: tst/regression/gold_standard/
+          key: gold-standard
+
+      - name: Configure
+        run: |
+          cmake -B build \
+            -DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DMACHINE_VARIANT=hip-${{ matrix.parallel }} \
+            -DCMAKE_CXX_COMPILER=hipcc
+
+      - name: Build
+        run: cmake --build build
+
+      # run performance "unit" tests (none use MPI)
+      - name: Performance tests
+        if: ${{ matrix.parallel == 'serial' }}
+        run: |
+          cd build
+          ctest -L performance -LE perf-reg
+
+      # run regression tests
+      - name: Regression tests
+        run: |
+          cd build
+          ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: log-and-convergence-${{ matrix.parallel }}
+          path: |
+            build/CMakeFiles/CMakeOutput.log
+            build/tst/regression/outputs/advection_convergence*/advection-errors.dat
+            build/tst/regression/outputs/advection_convergence*/advection-errors.png
+          retention-days: 3
diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml
index adbb56287f6e..ecb4052411ee 100644
--- a/.github/workflows/ci-short.yml
+++ b/.github/workflows/ci-short.yml
@@ -13,6 +13,8 @@ env:
   CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
   MACHINE_CFG: cmake/machinecfg/CI.cmake
   OMPI_MCA_mpi_common_cuda_event_max: 1000
+  # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
+  OMPI_MCA_btl_vader_single_copy_mechanism: none
 
 jobs:
   style:
@@ -130,3 +132,44 @@ jobs:
             build/profile.txt
           retention-days: 3
 
+  integration-amdgpu:
+    runs-on: [self-hosted, navi1030]
+    container:
+      image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
+      # Map to local user id on CI machine to allow writing to build cache and
+      # forward device handles to access AMD GPU within container
+      options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined
+    env:
+      CMAKE_GENERATOR: Ninja
+      CMAKE_BUILD_PARALLEL_LEVEL: 8 # num threads for build
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: 'true'
+      - name: Configure
+        run: |
+          cmake -B build \
+            -DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DMACHINE_VARIANT=hip-mpi \
+            -DCMAKE_CXX_COMPILER=hipcc
+      # Test example with "variables" and output
+      - name: advection
+        run: |
+          cmake --build build -t advection-example
+          cd build
+          ctest -R regression_mpi_test:output_hdf5
+      # Test example with swarms
+      - name: particle-leapfrog
+        run: |
+          cmake --build build -t particle-leapfrog
+          cd build
+          ctest -R regression_mpi_test:particle_leapfrog
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: configure-log-integration-amdgpu
+          path: |
+            build/CMakeFiles/CMakeOutput.log
+          retention-days: 3
+
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 976bf37d3454..3e391c6d746d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -57,6 +57,7 @@
 - [[PR 1031]](https://github.com/parthenon-hpc-lab/parthenon/pull/1031) Fix bug in non-cell centered AMR
 
 ### Infrastructure (changes irrelevant to downstream codes)
+- [[PR 1117]](https://github.com/parthenon-hpc-lab/parthenon/pull/1117) Enable CI pipelines on AMD GPUs with ROCM/HIP
 - [[PR 1114]](https://github.com/parthenon-hpc-lab/parthenon/pull/1114) Enable sanitizers for extended CI host build
 - [[PR 1123]](https://github.com/parthenon-hpc-lab/parthenon/pull/1123) Default initialize ProResInfo.dir
 - [[PR 1121]](https://github.com/parthenon-hpc-lab/parthenon/pull/1121) Default initialize BndInfo.dir
diff --git a/cmake/TestSetup.cmake b/cmake/TestSetup.cmake
index dd2f8b05ec4b..005756d3ade5 100644
--- a/cmake/TestSetup.cmake
+++ b/cmake/TestSetup.cmake
@@ -152,7 +152,7 @@ function(setup_test_parallel nproc dir arg extra_labels)
     list(APPEND labels "${extra_labels}")
 
     if(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_HIP)
-      set(PARTHENON_KOKKOS_TEST_ARGS "--kokkos-num-devices=${NUM_GPU_DEVICES_PER_NODE}")
+      set(PARTHENON_KOKKOS_TEST_ARGS "--kokkos-map-device-id-by=mpi_rank")
       list(APPEND labels "cuda")
     endif()
     if (Kokkos_ENABLE_OPENMP)
diff --git a/cmake/machinecfg/GitHubActions.cmake b/cmake/machinecfg/GitHubActions.cmake
index fc91643a1a74..663dcb38d682 100644
--- a/cmake/machinecfg/GitHubActions.cmake
+++ b/cmake/machinecfg/GitHubActions.cmake
@@ -29,9 +29,10 @@ if (${MACHINE_VARIANT} MATCHES "cuda")
     set(MACHINE_CXX_FLAGS "${MACHINE_CXX_FLAGS} -Wno-unknown-cuda-version")
   endif()
 elseif (${MACHINE_VARIANT} MATCHES "hip")
-  # using an arbitrary arch as GitHub Action runners don't have GPUs
-  set(Kokkos_ARCH_VEGA908 ON CACHE BOOL "GPU architecture")
+  # using an arch that matches Hamilton at Hamburg Obs
+  set(Kokkos_ARCH_NAVI1030 ON CACHE BOOL "GPU architecture")
   set(Kokkos_ENABLE_HIP ON CACHE BOOL "Enable HIP")
+  set(Kokkos_ARCH_ZEN3 ON CACHE BOOL "CPU architecture") # Kokkos arch options use the Kokkos_ARCH_ prefix
 else()
   set(MACHINE_CXX_FLAGS "${MACHINE_CXX_FLAGS} -fopenmp-simd")
 endif()
diff --git a/scripts/docker/Dockerfile.hip-rocm b/scripts/docker/Dockerfile.hip-rocm
index f586ade42104..5d9d5c765b6a 100644
--- a/scripts/docker/Dockerfile.hip-rocm
+++ b/scripts/docker/Dockerfile.hip-rocm
@@ -20,3 +20,6 @@ RUN cd /tmp && \
 ENV LDFLAGS="-lopen-pal"
 
 RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10
+
+# uid 1000 maps to the one running the container on the CI host
+RUN useradd --create-home --shell /bin/bash -u 1000 -G render ci
diff --git a/tst/regression/utils/test_case.py b/tst/regression/utils/test_case.py
index 12302568fbd5..7b09d00358d5 100644
--- a/tst/regression/utils/test_case.py
+++ b/tst/regression/utils/test_case.py
@@ -89,9 +89,7 @@ def __init__(self, run_test_path, **kwargs):
         try:
             parthenon_path = os.path.realpath(__file__)
             idx = parthenon_path.rindex("/parthenon/")
-            self.parameters.parthenon_path = os.path.join(
-                parthenon_path[:idx], "parthenon"
-            )
+            self.parameters.parthenon_path = parthenon_path[: idx + 10]
         except ValueError:
             baseDir = os.path.dirname(__file__)
             self.parameters.parthenon_path = os.path.abspath(baseDir + "/../../../")