Skip to content

Commit

Permalink
HNSW GPU hierarchy (#616)
Browse files Browse the repository at this point in the history
Authors:
  - Divye Gala (https://github.com/divyegala)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #616
  • Loading branch information
divyegala authored Feb 6, 2025
1 parent bf86cde commit 47f5368
Show file tree
Hide file tree
Showing 7 changed files with 261 additions and 30 deletions.
2 changes: 2 additions & 0 deletions cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ void parse_build_param(const nlohmann::json& conf,
param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::NONE;
} else if (conf.at("hierarchy") == "cpu") {
param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::CPU;
} else if (conf.at("hierarchy") == "gpu") {
param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::GPU;
} else {
THROW("Invalid value for hierarchy: %s", conf.at("hierarchy").get<std::string>().c_str());
}
Expand Down
13 changes: 9 additions & 4 deletions cpp/include/cuvs/neighbors/hnsw.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,22 @@ enum cuvsHnswHierarchy {
/* Flat hierarchy, search is base-layer only */
NONE,
/* Full hierarchy is built using the CPU */
CPU
CPU,
/* Full hierarchy is built using the GPU */
GPU
};

struct cuvsHnswIndexParams {
/* hierarchy of the hnsw index */
enum cuvsHnswHierarchy hierarchy;
/** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/
int ef_construction;
/** Number of host threads to use to construct hierarchy when hierarchy is `CPU`
When the value is 0, the number of threads is automatically determined to the maximum
number of threads available.
/** Number of host threads to use to construct hierarchy when hierarchy is `CPU` or `GPU`.
When the value is 0, the number of threads is automatically determined to the
maximum number of threads available.
NOTE: When hierarchy is `GPU`, while the majority of the work is done on the GPU,
initialization of the HNSW index itself and some other work
is parallelized with the help of CPU threads.
*/
int num_threads;
};
Expand Down
8 changes: 6 additions & 2 deletions cpp/include/cuvs/neighbors/hnsw.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,21 @@ namespace cuvs::neighbors::hnsw {
*/
enum class HnswHierarchy {
NONE, // base-layer-only index
CPU // full index with CPU-built hierarchy
CPU, // full index with CPU-built hierarchy
GPU // full index with GPU-built hierarchy
};

struct index_params : cuvs::neighbors::index_params {
/** Hierarchy build type for HNSW index when converting from CAGRA index */
HnswHierarchy hierarchy = HnswHierarchy::NONE;
/** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/
int ef_construction = 200;
/** Number of host threads to use to construct hierarchy when hierarchy is `CPU`
/** Number of host threads to use to construct hierarchy when hierarchy is `CPU` or `GPU`.
When the value is 0, the number of threads is automatically determined to the
maximum number of threads available.
NOTE: When hierarchy is `GPU`, while the majority of the work is done on the GPU,
initialization of the HNSW index itself and some other work
is parallelized with the help of CPU threads.
*/
int num_threads = 0;
};
Expand Down
245 changes: 226 additions & 19 deletions cpp/src/neighbors/detail/hnsw.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#pragma once

#include <cuvs/neighbors/brute_force.hpp>
#include <cuvs/neighbors/hnsw.hpp>
#include <filesystem>
#include <hnswlib/hnswalg.h>
Expand All @@ -28,6 +29,11 @@

namespace cuvs::neighbors::hnsw::detail {

// This is needed as hnswlib hardcodes the distance type to float
// or int32_t in certain places. However, we can solve uint8 or int8
// natively with the patch cuVS applies. We could potentially remove
// all the hardcodes and propagate templates throughout hnswlib, but
// as of now it's not needed.
template <typename T>
struct hnsw_dist_t {
using type = void;
Expand Down Expand Up @@ -136,7 +142,6 @@ std::enable_if_t<hierarchy == HnswHierarchy::CPU, std::unique_ptr<index<T>>> fro
const cuvs::neighbors::cagra::index<T, uint32_t>& cagra_index,
std::optional<raft::host_matrix_view<const T, int64_t, raft::row_major>> dataset)
{
// auto host_dataset = raft::make_host_matrix<T, int64_t>(dataset.extent(0), dataset.extent(1));
auto host_dataset = raft::make_host_matrix<T, int64_t>(0, 0);
raft::host_matrix_view<const T, int64_t, raft::row_major> host_dataset_view(
host_dataset.data_handle(), host_dataset.extent(0), host_dataset.extent(1));
Expand Down Expand Up @@ -179,24 +184,29 @@ std::enable_if_t<hierarchy == HnswHierarchy::CPU, std::unique_ptr<index<T>>> fro
}
appr_algo->base_layer_init = true; // reset to true to allow addition of new points

// move cagra graph to host
auto graph = cagra_index.graph();
auto host_graph =
raft::make_host_matrix<uint32_t, int64_t, raft::row_major>(graph.extent(0), graph.extent(1));
raft::copy(host_graph.data_handle(),
graph.data_handle(),
graph.size(),
raft::resource::get_cuda_stream(res));
raft::resource::sync_stream(res);
// move cagra graph to host or access it from host if available
auto host_graph_view = cagra_index.graph();
auto host_graph = raft::make_host_matrix<uint32_t, int64_t>(0, 0);
if (!raft::is_host_accessible(raft::memory_type_from_pointer(host_graph_view.data_handle()))) {
// copy cagra graph to host
host_graph = raft::make_host_matrix<uint32_t, int64_t>(host_graph_view.extent(0),
host_graph_view.extent(1));
raft::copy(host_graph.data_handle(),
host_graph_view.data_handle(),
host_graph_view.size(),
raft::resource::get_cuda_stream(res));
raft::resource::sync_stream(res);
host_graph_view = host_graph.view();
}

// copy cagra graph to hnswlib base layer
#pragma omp parallel for
for (size_t i = 0; i < static_cast<size_t>(host_graph.extent(0)); ++i) {
#pragma omp parallel for num_threads(num_threads)
for (size_t i = 0; i < static_cast<size_t>(host_graph_view.extent(0)); ++i) {
auto hnsw_internal_id = appr_algo->label_lookup_.find(i)->second;
auto ll_i = appr_algo->get_linklist0(hnsw_internal_id);
appr_algo->setListCount(ll_i, host_graph.extent(1));
appr_algo->setListCount(ll_i, host_graph_view.extent(1));
auto* data = (uint32_t*)(ll_i + 1);
for (size_t j = 0; j < static_cast<size_t>(host_graph.extent(1)); ++j) {
for (size_t j = 0; j < static_cast<size_t>(host_graph_view.extent(1)); ++j) {
auto neighbor_internal_id = appr_algo->label_lookup_.find(host_graph(i, j))->second;
data[j] = neighbor_internal_id;
}
Expand All @@ -206,6 +216,202 @@ std::enable_if_t<hierarchy == HnswHierarchy::CPU, std::unique_ptr<index<T>>> fro
return hnsw_index;
}

/**
 * @brief Manually initialize one point in an hnswlib index without linking it.
 *
 * Reserves the next internal slot, zeroes its level-0 element block, copies in
 * the external label (`real_index`) and the point's vector data, and allocates
 * zeroed upper-layer link lists when the point lives above the base layer.
 * Edges are filled in later by the caller.
 *
 * @param appr_algo  hnswlib index being populated
 * @param dataset    host dataset; row `real_index` is copied into the index
 * @param real_index row of `dataset`, also used as the external label
 * @param curlevel   highest layer this point belongs to (0 == base layer only)
 * @return the hnswlib-internal id assigned to the point
 *
 * NOTE(review): the caller below invokes this from an OpenMP parallel-for;
 * `cur_element_count++` looks unsynchronized here — presumably cuVS's patched
 * hnswlib makes this safe; verify against the applied patch.
 */
template <typename T, typename DistT>
int initialize_point_in_hnsw(hnswlib::HierarchicalNSW<DistT>* appr_algo,
                             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,
                             int64_t real_index,
                             int curlevel)
{
  // Claim the next internal slot (post-increment: cur_c is this point's id).
  auto cur_c                        = appr_algo->cur_element_count++;
  appr_algo->element_levels_[cur_c] = curlevel;
  // Zero the whole level-0 element block (links + data + label).
  memset(appr_algo->data_level0_memory_ + cur_c * appr_algo->size_data_per_element_ +
           appr_algo->offsetLevel0_,
         0,
         appr_algo->size_data_per_element_);

  // Initialisation of the data and label
  // (getExternalLabeLp is hnswlib's own (sic) spelling.)
  memcpy(appr_algo->getExternalLabeLp(cur_c), &real_index, sizeof(hnswlib::labeltype));
  memcpy(appr_algo->getDataByInternalId(cur_c),
         dataset.data_handle() + real_index * dataset.extent(1),
         appr_algo->data_size_);

  if (curlevel) {
    // Allocate zeroed link lists for layers 1..curlevel; the `+ 1` appears to
    // mirror hnswlib's own addPoint allocation.
    appr_algo->linkLists_[cur_c] = (char*)malloc(appr_algo->size_links_per_element_ * curlevel + 1);
    if (appr_algo->linkLists_[cur_c] == nullptr)
      throw std::runtime_error("Not enough memory: addPoint failed to allocate linklist");
    memset(appr_algo->linkLists_[cur_c], 0, appr_algo->size_links_per_element_ * curlevel + 1);
  }
  return cur_c;
}

/**
 * @brief Build a k-NN graph over `dataset` on the GPU via NN-Descent, writing
 * the resulting neighbor ids into the caller-provided `neighbors` matrix.
 *
 * @param res       raft resources handle
 * @param dataset   host matrix of input points (one row per point)
 * @param neighbors output matrix; its extent(1) determines the graph degree
 * @param metric    distance metric used to build the graph
 */
template <typename T>
void all_neighbors_graph(raft::resources const& res,
                         raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,
                         raft::host_matrix_view<uint32_t, int64_t, raft::row_major> neighbors,
                         cuvs::distance::DistanceType metric)
{
  nn_descent::index_params nn_params;
  nn_params.graph_degree = neighbors.extent(1);
  // give NN-Descent headroom: intermediate degree is twice the final degree
  nn_params.intermediate_graph_degree = neighbors.extent(1) * 2;
  nn_params.metric                    = metric;
  nn_params.return_distances          = false;  // only the neighbor ids are needed
  // The returned index is not read: `build` writes the graph directly into
  // `neighbors` (the local exists only to own the result until scope exit).
  [[maybe_unused]] auto nn_index = nn_descent::build(res, nn_params, dataset, neighbors);
}

/**
 * @brief Convert a CAGRA index to an HNSW index with a GPU-built hierarchy.
 *
 * Points are assigned random levels, bucket-sorted by level, and the upper
 * layers are linked using GPU-built k-NN graphs (see `all_neighbors_graph`);
 * the CAGRA graph itself becomes the HNSW base layer.
 *
 * @param res         raft resources handle
 * @param params      HNSW conversion parameters (ef_construction, num_threads)
 * @param cagra_index source CAGRA index (graph + optionally dataset)
 * @param dataset     optional host dataset; if absent, the dataset is copied
 *                    out of `cagra_index`
 * @return the populated HNSW index
 */
template <typename T, HnswHierarchy hierarchy>
std::enable_if_t<hierarchy == HnswHierarchy::GPU, std::unique_ptr<index<T>>> from_cagra(
  raft::resources const& res,
  const index_params& params,
  const cuvs::neighbors::cagra::index<T, uint32_t>& cagra_index,
  std::optional<raft::host_matrix_view<const T, int64_t, raft::row_major>> dataset)
{
  // Obtain a host-resident, padding-free view of the dataset: either the
  // caller-provided one, or a copy of the (possibly padded) CAGRA dataset.
  auto host_dataset = raft::make_host_matrix<T, int64_t>(0, 0);
  raft::host_matrix_view<const T, int64_t, raft::row_major> host_dataset_view(
    host_dataset.data_handle(), host_dataset.extent(0), host_dataset.extent(1));
  if (dataset.has_value()) {
    host_dataset_view = dataset.value();
  } else {
    // move dataset to host, remove padding
    auto cagra_dataset = cagra_index.dataset();
    host_dataset =
      raft::make_host_matrix<T, int64_t>(cagra_dataset.extent(0), cagra_dataset.extent(1));
    // 2D copy strips the row padding (stride(0) may exceed extent(1))
    RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(),
                                    sizeof(T) * host_dataset.extent(1),
                                    cagra_dataset.data_handle(),
                                    sizeof(T) * cagra_dataset.stride(0),
                                    sizeof(T) * host_dataset.extent(1),
                                    cagra_dataset.extent(0),
                                    cudaMemcpyDefault,
                                    raft::resource::get_cuda_stream(res)));
    raft::resource::sync_stream(res);
    host_dataset_view = host_dataset.view();
  }

  // initialize hnsw index
  auto hnsw_index =
    std::make_unique<index_impl<T>>(host_dataset_view.extent(1), cagra_index.metric(), hierarchy);
  // M = cagra_degree / 2 — presumably so hnswlib's base layer (2*M links)
  // matches the CAGRA graph degree; confirm against hnswlib's ctor contract.
  auto appr_algo = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
    hnsw_index->get_space(),
    host_dataset_view.extent(0),
    cagra_index.graph().extent(1) / 2,
    params.ef_construction);

  // assign a level to each point and initialize the points in hnsw
  std::vector<size_t> levels(host_dataset_view.extent(0));
  std::vector<uint32_t> hnsw_internal_ids(host_dataset_view.extent(0));

  auto num_threads = params.num_threads == 0 ? omp_get_max_threads() : params.num_threads;
  // NOTE(review): getRandomLevel and initialize_point_in_hnsw mutate shared
  // hnswlib state from multiple threads — presumably safe with cuVS's patched
  // hnswlib; verify against the applied patch.
#pragma omp parallel for num_threads(num_threads)
  for (int64_t i = 0; i < host_dataset_view.extent(0); i++) {
    // levels are stored 1-based here; hnswlib levels are 0-based
    levels[i] = appr_algo->getRandomLevel(appr_algo->mult_) + 1;
    hnsw_internal_ids[i] =
      initialize_point_in_hnsw(appr_algo.get(), host_dataset_view, i, levels[i] - 1);
  }

  // sort the points by levels
  // build histogram: hist[l] = number of points whose top layer is l
  std::vector<size_t> hist;
  std::vector<size_t> order(host_dataset_view.extent(0));
  for (int64_t i = 0; i < host_dataset_view.extent(0); i++) {
    auto pt_level = levels[i] - 1;
    while (pt_level >= hist.size())
      hist.push_back(0);
    hist[pt_level]++;
  }

  // accumulate: offsets[l] = start of the level-l bucket in `order`
  // NOTE(review): assumes a non-empty dataset (hist.size() >= 1);
  // hist.size() - 1 would wrap on an empty input.
  std::vector<size_t> offsets(hist.size() + 1, 0);
  for (size_t i = 0; i < hist.size() - 1; i++) {
    offsets[i + 1] = offsets[i] + hist[i];
  }

  // bucket sort; afterwards offsets[l] points at the END of bucket l,
  // i.e. offsets[l - 1] is the start of bucket l
  for (int64_t i = 0; i < host_dataset_view.extent(0); i++) {
    auto pt_level = levels[i] - 1;
    order[offsets[pt_level]++] = i;
  }

  // set last point of the highest level as the entry point
  appr_algo->enterpoint_node_ = hnsw_internal_ids[order.back()];
  appr_algo->maxlevel_        = hist.size() - 1;

  // iterate over the points in the descending order of their levels;
  // each iteration links every point whose top layer is >= pt_level
  for (size_t pt_level = hist.size() - 1; pt_level >= 1; pt_level--) {
    auto start_idx     = offsets[pt_level - 1];     // start of bucket pt_level (post-sort)
    auto end_idx       = offsets[hist.size() - 1];  // end of the last bucket
    auto num_pts       = end_idx - start_idx;
    auto neighbor_size = num_pts > appr_algo->M_ ? appr_algo->M_ : num_pts - 1;
    if (num_pts <= 1) {
      // this means only 1 point in the level
      continue;
    }

    // gather points from dataset to form query set on host
    auto host_query_set = raft::make_host_matrix<T, int64_t>(num_pts, host_dataset_view.extent(1));
    // TODO: Use `raft::matrix::gather` when available as a public API
    // Issue: https://github.com/rapidsai/raft/issues/2572
#pragma omp parallel for num_threads(num_threads)
    for (auto i = start_idx; i < end_idx; i++) {
      auto pt_id = order[i];
      std::copy(&host_dataset_view(pt_id, 0),
                &host_dataset_view(pt_id + 1, 0),
                &host_query_set(i - start_idx, 0));
    }

    // find neighbors of the query set (GPU NN-Descent)
    auto host_neighbors = raft::make_host_matrix<uint32_t, int64_t>(num_pts, neighbor_size);
    all_neighbors_graph(res,
                        raft::make_const_mdspan(host_query_set.view()),
                        host_neighbors.view(),
                        cagra_index.metric());

    // add points to the HNSW index upper layers
#pragma omp parallel for num_threads(num_threads)
    for (auto i = start_idx; i < end_idx; i++) {
      auto pt_id       = order[i];
      auto internal_id = hnsw_internal_ids[pt_id];
      auto ll_cur      = appr_algo->get_linklist(internal_id, pt_level);
      appr_algo->setListCount(ll_cur, host_neighbors.extent(1));
      auto* data     = (uint32_t*)(ll_cur + 1);
      auto neighbors = &host_neighbors(i - start_idx, 0);
      for (auto j = 0; j < host_neighbors.extent(1); j++) {
        // neighbors[j] indexes the query subset; map back through `order`
        // to a dataset row id, then to an hnswlib-internal id
        auto neighbor_id          = order[neighbors[j] + start_idx];
        auto neighbor_internal_id = hnsw_internal_ids[neighbor_id];
        data[j]                   = neighbor_internal_id;
      }
    }
  }

  // move cagra graph to host or access it from host if available
  auto host_graph_view = cagra_index.graph();
  auto host_graph      = raft::make_host_matrix<uint32_t, int64_t>(0, 0);
  if (!raft::is_host_accessible(raft::memory_type_from_pointer(host_graph_view.data_handle()))) {
    // copy cagra graph to host
    host_graph = raft::make_host_matrix<uint32_t, int64_t>(host_graph_view.extent(0),
                                                           host_graph_view.extent(1));
    raft::copy(host_graph.data_handle(),
               host_graph_view.data_handle(),
               host_graph_view.size(),
               raft::resource::get_cuda_stream(res));
    raft::resource::sync_stream(res);
    host_graph_view = host_graph.view();
  }

  // copy cagra graph to hnswlib base layer, remapping CAGRA row ids
  // to hnswlib-internal ids
#pragma omp parallel for num_threads(num_threads)
  for (size_t i = 0; i < static_cast<size_t>(host_graph_view.extent(0)); ++i) {
    auto ll_i = appr_algo->get_linklist0(hnsw_internal_ids[i]);
    appr_algo->setListCount(ll_i, host_graph_view.extent(1));
    auto* data = (uint32_t*)(ll_i + 1);
    for (size_t j = 0; j < static_cast<size_t>(host_graph_view.extent(1)); ++j) {
      data[j] = hnsw_internal_ids[host_graph_view(i, j)];
    }
  }

  hnsw_index->set_index(std::move(appr_algo));
  return hnsw_index;
}

template <typename T>
std::unique_ptr<index<T>> from_cagra(
raft::resources const& res,
Expand All @@ -217,8 +423,9 @@ std::unique_ptr<index<T>> from_cagra(
return from_cagra<T, HnswHierarchy::NONE>(res, params, cagra_index, dataset);
} else if (params.hierarchy == HnswHierarchy::CPU) {
return from_cagra<T, HnswHierarchy::CPU>(res, params, cagra_index, dataset);
}
{
} else if (params.hierarchy == HnswHierarchy::GPU) {
return from_cagra<T, HnswHierarchy::GPU>(res, params, cagra_index, dataset);
} else {
RAFT_FAIL("Unsupported hierarchy type");
}
}
Expand Down Expand Up @@ -269,9 +476,9 @@ void search(raft::resources const& res,
raft::host_matrix_view<uint64_t, int64_t, raft::row_major> neighbors,
raft::host_matrix_view<float, int64_t, raft::row_major> distances)
{
RAFT_EXPECTS(
queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
"Number of rows in output neighbors and distances matrices must equal the number of queries.");
RAFT_EXPECTS(queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
"Number of rows in output neighbors and distances matrices must equal the number of "
"queries.");

RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1),
"Number of columns in output neighbors and distances matrices must equal k");
Expand Down
1 change: 1 addition & 0 deletions python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil:
ctypedef enum cuvsHnswHierarchy:
NONE
CPU
GPU

ctypedef struct cuvsHnswIndexParams:
cuvsHnswHierarchy hierarchy
Expand Down
Loading

0 comments on commit 47f5368

Please sign in to comment.