From a626fd0b8efe5a4b726dbe66945ddc49ec5c2a0b Mon Sep 17 00:00:00 2001
From: Felixrccs
Date: Fri, 1 Dec 2023 10:19:46 +0100
Subject: [PATCH 1/2] change to universe logic in MPI distribution

---
 src/ML-MACE/pair_mace.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index a9290034254..d6873d6d070 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -25,6 +25,7 @@
 #include "memory.h"
 #include "neigh_list.h"
 #include "neighbor.h"
+#include "universe.h"
 
 #include
 #include
@@ -308,7 +309,7 @@ void PairMACE::coeff(int narg, char **arg)
     //int worldrank;
     //MPI_Comm_rank(world, &worldrank);
     MPI_Comm local;
-    MPI_Comm_split_type(world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local);
+    MPI_Comm_split_type(universe->uworld, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local);
     int localrank;
     MPI_Comm_rank(local, &localrank);
     device = c10::Device(torch::kCUDA,localrank);

From bd3d6a49cc0613f0a583897f27a95a58feae4b1c Mon Sep 17 00:00:00 2001
From: Felixrccs
Date: Mon, 4 Dec 2023 16:06:15 +0100
Subject: [PATCH 2/2] final MPI over gpu distribution debug

---
 src/ML-MACE/pair_mace.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index d6873d6d070..f8888cd66af 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -27,6 +27,7 @@
 #include "neighbor.h"
 #include "universe.h"
 
+#include
 #include
 #include
 #include
@@ -312,7 +313,9 @@ void PairMACE::coeff(int narg, char **arg)
     MPI_Comm_split_type(universe->uworld, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local);
     int localrank;
     MPI_Comm_rank(local, &localrank);
-    device = c10::Device(torch::kCUDA,localrank);
+    int nDevices;
+    cudaGetDeviceCount(&nDevices);
+    device = c10::Device(torch::kCUDA,localrank % nDevices);
   }
 
   std::cout << "Loading MACE model from \"" << arg[2] << "\" ...";