From 829f02130ee8922adaa40866cdfd96156fe96482 Mon Sep 17 00:00:00 2001
From: viclafargue <viclafargue@nvidia.com>
Date: Wed, 22 Jan 2025 18:31:13 +0100
Subject: [PATCH] Introduction of the raft::device_resources_snmg type

---
 cpp/include/raft/comms/nccl_clique.hpp        | 156 -------------
 .../raft/core/device_resources_snmg.hpp       | 217 ++++++++++++++++++
 .../raft/core/resource/nccl_clique.hpp        |  66 ------
 cpp/include/raft/core/resources.hpp           |   3 +-
 docs/source/cpp_api/core_resources.rst        |  17 ++
 5 files changed, 236 insertions(+), 223 deletions(-)
 delete mode 100644 cpp/include/raft/comms/nccl_clique.hpp
 create mode 100644 cpp/include/raft/core/device_resources_snmg.hpp
 delete mode 100644 cpp/include/raft/core/resource/nccl_clique.hpp
diff --git a/cpp/include/raft/comms/nccl_clique.hpp b/cpp/include/raft/comms/nccl_clique.hpp
deleted file mode 100644
index c6520af753..0000000000
--- a/cpp/include/raft/comms/nccl_clique.hpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/device_resources.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/mr/device/pool_memory_resource.hpp>
-
-#include <nccl.h>
-
-/**
- * @brief Error checking macro for NCCL runtime API functions.
- *
- * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an
- * exception detailing the NCCL error that occurred
- */
-#define RAFT_NCCL_TRY(call)                        \
-  do {                                             \
-    ncclResult_t const status = (call);            \
-    if (ncclSuccess != status) {                   \
-      std::string msg{};                           \
-      SET_ERROR_MSG(msg,                           \
-                    "NCCL error encountered at: ", \
-                    "call='%s', Reason=%d:%s",     \
-                    #call,                         \
-                    status,                        \
-                    ncclGetErrorString(status));   \
-      throw raft::logic_error(msg);                \
-    }                                              \
-  } while (0);
-
-namespace raft::comms {
-void build_comms_nccl_only(raft::resources* handle, ncclComm_t nccl_comm, int num_ranks, int rank);
-}
-
-namespace raft::comms {
-
-struct nccl_clique {
-  using pool_mr = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
-
-  /**
-   * Instantiates a NCCL clique with all available GPUs
-   *
-   * @param[in] percent_of_free_memory percentage of device memory to pre-allocate as memory pool
-   *
-   */
-  nccl_clique(int percent_of_free_memory = 80)
-    : root_rank_(0),
-      percent_of_free_memory_(percent_of_free_memory),
-      per_device_pools_(0),
-      device_resources_(0)
-  {
-    cudaGetDeviceCount(&num_ranks_);
-    device_ids_.resize(num_ranks_);
-    std::iota(device_ids_.begin(), device_ids_.end(), 0);
-    nccl_comms_.resize(num_ranks_);
-    nccl_clique_init();
-  }
-
-  /**
-   * Instantiates a NCCL clique
-   *
-   * Usage example:
-   * @code{.cpp}
-   * int n_devices;
-   * cudaGetDeviceCount(&n_devices);
-   * std::vector<int> device_ids(n_devices);
-   * std::iota(device_ids.begin(), device_ids.end(), 0);
-   * cuvs::neighbors::mg::nccl_clique& clique(device_ids); // first device is the root rank
-   * @endcode
-   *
-   * @param[in] device_ids list of device IDs to be used to initiate the clique
-   * @param[in] percent_of_free_memory percentage of device memory to pre-allocate as memory pool
-   *
-   */
-  nccl_clique(const std::vector<int>& device_ids, int percent_of_free_memory = 80)
-    : root_rank_(0),
-      num_ranks_(device_ids.size()),
-      percent_of_free_memory_(percent_of_free_memory),
-      device_ids_(device_ids),
-      nccl_comms_(device_ids.size()),
-      per_device_pools_(0),
-      device_resources_(0)
-  {
-    nccl_clique_init();
-  }
-
-  void nccl_clique_init()
-  {
-    RAFT_NCCL_TRY(ncclCommInitAll(nccl_comms_.data(), num_ranks_, device_ids_.data()));
-
-    for (int rank = 0; rank < num_ranks_; rank++) {
-      RAFT_CUDA_TRY(cudaSetDevice(device_ids_[rank]));
-
-      // create a pool memory resource for each device
-      auto old_mr = rmm::mr::get_current_device_resource();
-      per_device_pools_.push_back(std::make_unique<pool_mr>(
-        old_mr, rmm::percent_of_free_device_memory(percent_of_free_memory_)));
-      rmm::cuda_device_id id(device_ids_[rank]);
-      rmm::mr::set_per_device_resource(id, per_device_pools_.back().get());
-
-      // create a device resource handle for each device
-      device_resources_.emplace_back();
-
-      // add NCCL communications to the device resource handle
-      raft::comms::build_comms_nccl_only(
-        &device_resources_[rank], nccl_comms_[rank], num_ranks_, rank);
-    }
-
-    for (int rank = 0; rank < num_ranks_; rank++) {
-      RAFT_CUDA_TRY(cudaSetDevice(device_ids_[rank]));
-      raft::resource::sync_stream(device_resources_[rank]);
-    }
-  }
-
-  const raft::device_resources& set_current_device_to_root_rank() const
-  {
-    int root_device_id = device_ids_[root_rank_];
-    RAFT_CUDA_TRY(cudaSetDevice(root_device_id));
-    return device_resources_[root_rank_];
-  }
-
-  ~nccl_clique()
-  {
-#pragma omp parallel for  // necessary to avoid hangs
-    for (int rank = 0; rank < num_ranks_; rank++) {
-      cudaSetDevice(device_ids_[rank]);
-      ncclCommDestroy(nccl_comms_[rank]);
-      rmm::cuda_device_id id(device_ids_[rank]);
-      rmm::mr::set_per_device_resource(id, nullptr);
-    }
-  }
-
-  int root_rank_;
-  int num_ranks_;
-  int percent_of_free_memory_;
-  std::vector<int> device_ids_;
-  std::vector<ncclComm_t> nccl_comms_;
-  std::vector<std::shared_ptr<pool_mr>> per_device_pools_;
-  std::vector<raft::device_resources> device_resources_;
-};
-
-}  // namespace raft::comms
diff --git a/cpp/include/raft/core/device_resources_snmg.hpp b/cpp/include/raft/core/device_resources_snmg.hpp
new file mode 100644
index 0000000000..f20a81a1c6
--- /dev/null
+++ b/cpp/include/raft/core/device_resources_snmg.hpp
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_resources.hpp>
+
+#include <nccl.h>
+#include <omp.h>
+
+#include <memory>
+#include <vector>
+
+/**
+ * @brief Error checking macro for NCCL runtime API functions.
+ *
+ * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an
+ * exception detailing the NCCL error that occurred
+ */
+#define RAFT_NCCL_TRY(call)                        \
+  do {                                             \
+    ncclResult_t const status = (call);            \
+    if (ncclSuccess != status) {                   \
+      std::string msg{};                           \
+      SET_ERROR_MSG(msg,                           \
+                    "NCCL error encountered at: ", \
+                    "call='%s', Reason=%d:%s",     \
+                    #call,                         \
+                    status,                        \
+                    ncclGetErrorString(status));   \
+      throw raft::logic_error(msg);                \
+    }                                              \
+  } while (0);
+
+namespace raft {
+
+/**
+ * @brief SNMG (single-node multi-GPU) resource container object that stores a NCCL clique and all
+ * necessary resources used for calling device functions, cuda kernels, libraries and/or NCCL
+ * communications on each GPU. Note the `device_resources_snmg` object can also be used as a classic
+ * `device_resources` object. The associated resources will be the ones of the GPU used during
+ * object instantiation and a GPU switch operation will be ordered during the retrieval of said
+ * resources.
+ *
+ * The `device_resources_snmg` class is intended to be used in a single process to manage several
+ * GPUs. Please note that NCCL communications are the responsibility of the user. Blocking NCCL
+ * calls will sometimes require the use of several threads to avoid hangs.
+ */
+class device_resources_snmg : public device_resources {
+ public:
+  /**
+   * @brief Construct a SNMG resources instance with all available GPUs
+   */
+  device_resources_snmg() : device_resources(), root_rank_(0)
+  {
+    cudaGetDevice(&main_gpu_id_);
+
+    int num_ranks;
+    RAFT_CUDA_TRY(cudaGetDeviceCount(&num_ranks));
+    device_ids_.resize(num_ranks);
+    std::iota(device_ids_.begin(), device_ids_.end(), 0);
+    nccl_comms_.resize(num_ranks);
+    initialize();
+  }
+
+  /**
+   * @brief Construct a SNMG resources instance with a subset of available GPUs
+   *
+   * @param[in] device_ids List of device IDs to be used by the NCCL clique
+   */
+  device_resources_snmg(const std::vector<int>& device_ids)
+    : device_resources(), root_rank_(0), device_ids_(device_ids), nccl_comms_(device_ids.size())
+  {
+    cudaGetDevice(&main_gpu_id_);
+
+    initialize();
+  }
+
+  /**
+   * @brief SNMG resources instance copy constructor
+   *
+   * @param[in] clique A SNMG resources instance
+   */
+  device_resources_snmg(const device_resources_snmg& clique)
+    : device_resources(clique),
+      root_rank_(clique.root_rank_),
+      main_gpu_id_(clique.main_gpu_id_),
+      device_ids_(clique.device_ids_),
+      nccl_comms_(clique.nccl_comms_),
+      device_resources_(clique.device_resources_)
+  {
+  }
+
+  device_resources_snmg(device_resources_snmg&&)            = delete;
+  device_resources_snmg& operator=(device_resources_snmg&&) = delete;
+
+  /**
+   * @brief Set root rank of NCCL clique
+   */
+  inline int set_root_rank(int rank) { this->root_rank_ = rank; }
+
+  /**
+   * @brief Get root rank of NCCL clique
+   */
+  inline int get_root_rank() const { return this->root_rank_; }
+
+  /**
+   * @brief Get number of ranks in NCCL clique
+   */
+  inline int get_num_ranks() const { return this->device_ids_.size(); }
+
+  /**
+   * @brief Get device ID of rank in NCCL clique
+   */
+  inline int get_device_id(int rank) const { return this->device_ids_[rank]; }
+
+  /**
+   * @brief Get NCCL comm object of rank in NCCL clique
+   */
+  inline ncclComm_t get_nccl_comm(int rank) const { return this->nccl_comms_[rank]; }
+
+  /**
+   * @brief Get raft::device_resources object of rank in NCCL clique
+   */
+  inline const raft::device_resources& get_device_resources(int rank) const
+  {
+    return this->device_resources_[rank];
+  }
+
+  /**
+   * @brief Set current device ID to root rank and return its raft::device_resources object
+   */
+  inline const raft::device_resources& set_current_device_to_root_rank() const
+  {
+    return set_current_device_to_rank(get_root_rank());
+  }
+
+  /**
+   * @brief Set current device ID to rank and return its raft::device_resources object
+   */
+  inline const raft::device_resources& set_current_device_to_rank(int rank) const
+  {
+    RAFT_CUDA_TRY(cudaSetDevice(get_device_id(rank)));
+    return get_device_resources(rank);
+  }
+
+  /**
+   * @brief Set a memory pool on all GPUs of the clique
+   */
+  void set_memory_pool(int percent_of_free_memory) const
+  {
+    for (int rank = 0; rank < get_num_ranks(); rank++) {
+      RAFT_CUDA_TRY(cudaSetDevice(get_device_id(rank)));
+      size_t limit =
+        rmm::percent_of_free_device_memory(percent_of_free_memory);  // check limit for each device
+      raft::resource::set_workspace_to_pool_resource(get_device_resources(rank), limit);
+    }
+    cudaSetDevice(this->main_gpu_id_);
+  }
+
+  bool has_resource_factory(resource::resource_type resource_type) const override
+  {
+    cudaSetDevice(this->main_gpu_id_);
+    return raft::resources::has_resource_factory(resource_type);
+  }
+
+  /** Destroys all held-up resources */
+  ~device_resources_snmg()
+  {
+#pragma omp parallel for  // necessary to avoid hangs
+    for (int rank = 0; rank < get_num_ranks(); rank++) {
+      RAFT_CUDA_TRY(cudaSetDevice(get_device_id(rank)));
+      RAFT_NCCL_TRY(ncclCommDestroy(get_nccl_comm(rank)));
+    }
+    cudaSetDevice(this->main_gpu_id_);
+  }
+
+ private:
+  /**
+   * @brief Initializes the NCCL clique and raft::device_resources objects
+   */
+  void initialize()
+  {
+    RAFT_NCCL_TRY(ncclCommInitAll(nccl_comms_.data(), get_num_ranks(), device_ids_.data()));
+
+    for (int rank = 0; rank < get_num_ranks(); rank++) {
+      RAFT_CUDA_TRY(cudaSetDevice(get_device_id(rank)));
+      device_resources_.emplace_back();
+
+      // ideally add the ncclComm_t to the device_resources object with
+      // raft::comms::build_comms_nccl_only
+    }
+    cudaSetDevice(this->main_gpu_id_);
+  }
+
+  int root_rank_;
+  int main_gpu_id_;
+  std::vector<int> device_ids_;
+  std::vector<ncclComm_t> nccl_comms_;
+  std::vector<raft::device_resources> device_resources_;
+
+};  // class device_resources_snmg
+
+}  // namespace raft
diff --git a/cpp/include/raft/core/resource/nccl_clique.hpp b/cpp/include/raft/core/resource/nccl_clique.hpp
deleted file mode 100644
index edda5043ae..0000000000
--- a/cpp/include/raft/core/resource/nccl_clique.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <raft/comms/nccl_clique.hpp>
-#include <raft/core/resource/resource_types.hpp>
-#include <raft/core/resources.hpp>
-
-#include <memory>
-
-namespace raft::resource {
-
-class nccl_clique_resource : public resource {
- public:
-  nccl_clique_resource() : clique_(std::make_unique<raft::comms::nccl_clique>()) {}
-  ~nccl_clique_resource() override {}
-  void* get_resource() override { return clique_.get(); }
-
- private:
-  std::unique_ptr<raft::comms::nccl_clique> clique_;
-};
-
-/** Factory that knows how to construct a specific raft::resource to populate the res_t. */
-class nccl_clique_resource_factory : public resource_factory {
- public:
-  resource_type get_resource_type() override { return resource_type::NCCL_CLIQUE; }
-  resource* make_resource() override { return new nccl_clique_resource(); }
-};
-
-/**
- * @defgroup nccl_clique_resource resource functions
- * @{
- */
-
-/**
- * Retrieves a NCCL clique from raft res if it exists, otherwise initializes it and return it.
- *
- * @param[in] res the raft resources object
- * @return NCCL clique
- */
-inline const raft::comms::nccl_clique& get_nccl_clique(resources const& res)
-{
-  if (!res.has_resource_factory(resource_type::NCCL_CLIQUE)) {
-    res.add_resource_factory(std::make_shared<nccl_clique_resource_factory>());
-  }
-  return *res.get_resource<raft::comms::nccl_clique>(resource_type::NCCL_CLIQUE);
-};
-
-/**
- * @}
- */
-
-}  // namespace raft::resource
diff --git a/cpp/include/raft/core/resources.hpp b/cpp/include/raft/core/resources.hpp
index b0827d8e11..44525edb23 100644
--- a/cpp/include/raft/core/resources.hpp
+++ b/cpp/include/raft/core/resources.hpp
@@ -72,6 +72,7 @@ class resources {
   resources(const resources& res) : factories_(res.factories_), resources_(res.resources_) {}
   resources(resources&&)            = delete;
   resources& operator=(resources&&) = delete;
+  virtual ~resources() {}
 
   /**
    * @brief Returns true if a resource_factory has been registered for the
@@ -79,7 +80,7 @@ class resources {
    * @param resource_type resource type to check
    * @return true if resource_factory is registered for the given resource_type
    */
-  bool has_resource_factory(resource::resource_type resource_type) const
+  virtual bool has_resource_factory(resource::resource_type resource_type) const
   {
     std::lock_guard<std::mutex> _(mutex_);
     return factories_.at(resource_type).first != resource::resource_type::LAST_KEY;
diff --git a/docs/source/cpp_api/core_resources.rst b/docs/source/cpp_api/core_resources.rst
index 0da11acae6..3c242af848 100644
--- a/docs/source/cpp_api/core_resources.rst
+++ b/docs/source/cpp_api/core_resources.rst
@@ -55,6 +55,23 @@ namespace *raft::core*
     :project: RAFT
     :members:
 
+SNMG Device Resources
+---------------------
+
+The `raft::device_resources_snmg` provides a convenient way to design SNMG
+(single-node multi-GPU) algorithms. It initiates device-related resources
+for a set of devices forming clique. This includes NCCL communications.
+GPUs can be addressed and exchanges be made over multiple threads
+for performance or convenience.
+
+``#include <raft/core/device_resources_snmg.hpp>``
+
+namespace *raft::core*
+
+.. doxygenclass:: raft::device_resources_snmg
+    :project: RAFT
+    :members:
+
 Resource Functions
 ------------------