Skip to content

Commit

Permalink
HNSW GPU hierarchy (#616)
Browse files Browse the repository at this point in the history
Authors:
  - Divye Gala (https://github.com/divyegala)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #616
  • Loading branch information
divyegala authored Feb 6, 2025
1 parent bf86cde commit 47f5368
Show file tree
Hide file tree
Showing 7 changed files with 261 additions and 30 deletions.
2 changes: 2 additions & 0 deletions cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ void parse_build_param(const nlohmann::json& conf,
param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::NONE;
} else if (conf.at("hierarchy") == "cpu") {
param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::CPU;
} else if (conf.at("hierarchy") == "gpu") {
param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::GPU;
} else {
THROW("Invalid value for hierarchy: %s", conf.at("hierarchy").get<std::string>().c_str());
}
Expand Down
13 changes: 9 additions & 4 deletions cpp/include/cuvs/neighbors/hnsw.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,22 @@ enum cuvsHnswHierarchy {
/* Flat hierarchy, search is base-layer only */
NONE,
/* Full hierarchy is built using the CPU */
CPU
CPU,
/* Full hierarchy is built using the GPU */
GPU
};

struct cuvsHnswIndexParams {
/* hierarchy of the hnsw index */
enum cuvsHnswHierarchy hierarchy;
/** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/
int ef_construction;
/** Number of host threads to use to construct hierarchy when hierarchy is `CPU`
When the value is 0, the number of threads is automatically determined to the maximum
number of threads available.
/** Number of host threads to use to construct hierarchy when hierarchy is `CPU` or `GPU`.
When the value is 0, the number of threads is automatically determined to the
maximum number of threads available.
NOTE: When hierarchy is `GPU`, while the majority of the work is done on the GPU,
initialization of the HNSW index itself and some other work
is parallelized with the help of CPU threads.
*/
int num_threads;
};
Expand Down
8 changes: 6 additions & 2 deletions cpp/include/cuvs/neighbors/hnsw.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,21 @@ namespace cuvs::neighbors::hnsw {
*/
enum class HnswHierarchy {
NONE, // base-layer-only index
CPU // full index with CPU-built hierarchy
CPU, // full index with CPU-built hierarchy
GPU // full index with GPU-built hierarchy
};

struct index_params : cuvs::neighbors::index_params {
/** Hierarchy build type for HNSW index when converting from CAGRA index */
HnswHierarchy hierarchy = HnswHierarchy::NONE;
/** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/
int ef_construction = 200;
/** Number of host threads to use to construct hierarchy when hierarchy is `CPU`
/** Number of host threads to use to construct hierarchy when hierarchy is `CPU` or `GPU`.
When the value is 0, the number of threads is automatically determined to the
maximum number of threads available.
NOTE: When hierarchy is `GPU`, while the majority of the work is done on the GPU,
initialization of the HNSW index itself and some other work
is parallelized with the help of CPU threads.
*/
int num_threads = 0;
};
Expand Down
245 changes: 226 additions & 19 deletions cpp/src/neighbors/detail/hnsw.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#pragma once

#include <cuvs/neighbors/brute_force.hpp>
#include <cuvs/neighbors/hnsw.hpp>
#include <filesystem>
#include <hnswlib/hnswalg.h>
Expand All @@ -28,6 +29,11 @@

namespace cuvs::neighbors::hnsw::detail {

// This is needed as hnswlib hardcodes the distance type to float
// or int32_t in certain places. However, we can solve uint8 or int8
// natively with the patch cuVS applies. We could potentially remove
// all the hardcodes and propagate templates throughout hnswlib, but
// as of now it's not needed.
template <typename T>
struct hnsw_dist_t {
using type = void;
Expand Down Expand Up @@ -136,7 +142,6 @@ std::enable_if_t<hierarchy == HnswHierarchy::CPU, std::unique_ptr<index<T>>> fro
const cuvs::neighbors::cagra::index<T, uint32_t>& cagra_index,
std::optional<raft::host_matrix_view<const T, int64_t, raft::row_major>> dataset)
{
// auto host_dataset = raft::make_host_matrix<T, int64_t>(dataset.extent(0), dataset.extent(1));
auto host_dataset = raft::make_host_matrix<T, int64_t>(0, 0);
raft::host_matrix_view<const T, int64_t, raft::row_major> host_dataset_view(
host_dataset.data_handle(), host_dataset.extent(0), host_dataset.extent(1));
Expand Down Expand Up @@ -179,24 +184,29 @@ std::enable_if_t<hierarchy == HnswHierarchy::CPU, std::unique_ptr<index<T>>> fro
}
appr_algo->base_layer_init = true; // reset to true to allow addition of new points

// move cagra graph to host
auto graph = cagra_index.graph();
auto host_graph =
raft::make_host_matrix<uint32_t, int64_t, raft::row_major>(graph.extent(0), graph.extent(1));
raft::copy(host_graph.data_handle(),
graph.data_handle(),
graph.size(),
raft::resource::get_cuda_stream(res));
raft::resource::sync_stream(res);
// move cagra graph to host or access it from host if available
auto host_graph_view = cagra_index.graph();
auto host_graph = raft::make_host_matrix<uint32_t, int64_t>(0, 0);
if (!raft::is_host_accessible(raft::memory_type_from_pointer(host_graph_view.data_handle()))) {
// copy cagra graph to host
host_graph = raft::make_host_matrix<uint32_t, int64_t>(host_graph_view.extent(0),
host_graph_view.extent(1));
raft::copy(host_graph.data_handle(),
host_graph_view.data_handle(),
host_graph_view.size(),
raft::resource::get_cuda_stream(res));
raft::resource::sync_stream(res);
host_graph_view = host_graph.view();
}

// copy cagra graph to hnswlib base layer
#pragma omp parallel for
for (size_t i = 0; i < static_cast<size_t>(host_graph.extent(0)); ++i) {
#pragma omp parallel for num_threads(num_threads)
for (size_t i = 0; i < static_cast<size_t>(host_graph_view.extent(0)); ++i) {
auto hnsw_internal_id = appr_algo->label_lookup_.find(i)->second;
auto ll_i = appr_algo->get_linklist0(hnsw_internal_id);
appr_algo->setListCount(ll_i, host_graph.extent(1));
appr_algo->setListCount(ll_i, host_graph_view.extent(1));
auto* data = (uint32_t*)(ll_i + 1);
for (size_t j = 0; j < static_cast<size_t>(host_graph.extent(1)); ++j) {
for (size_t j = 0; j < static_cast<size_t>(host_graph_view.extent(1)); ++j) {
auto neighbor_internal_id = appr_algo->label_lookup_.find(host_graph(i, j))->second;
data[j] = neighbor_internal_id;
}
Expand All @@ -206,6 +216,202 @@ std::enable_if_t<hierarchy == HnswHierarchy::CPU, std::unique_ptr<index<T>>> fro
return hnsw_index;
}

/**
 * @brief Manually initialize one point in an hnswlib index without linking it.
 *
 * Reserves the next internal slot, zeroes its level-0 element block, copies in
 * the external label (`real_index`) and the point's vector data, and allocates
 * zeroed upper-layer link lists when the point lives above the base layer.
 * Edges are filled in later by the caller.
 *
 * @param appr_algo  hnswlib index being populated
 * @param dataset    host dataset; row `real_index` is copied into the index
 * @param real_index row of `dataset`, also used as the external label
 * @param curlevel   highest layer this point belongs to (0 == base layer only)
 * @return the hnswlib-internal id assigned to the point
 *
 * NOTE(review): the caller below invokes this from an OpenMP parallel-for;
 * `cur_element_count++` looks unsynchronized here — presumably cuVS's patched
 * hnswlib makes this safe; verify against the applied patch.
 */
template <typename T, typename DistT>
int initialize_point_in_hnsw(hnswlib::HierarchicalNSW<DistT>* appr_algo,
                             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,
                             int64_t real_index,
                             int curlevel)
{
  // Claim the next internal slot (post-increment: cur_c is this point's id).
  auto cur_c                        = appr_algo->cur_element_count++;
  appr_algo->element_levels_[cur_c] = curlevel;
  // Zero the whole level-0 element block (links + data + label).
  memset(appr_algo->data_level0_memory_ + cur_c * appr_algo->size_data_per_element_ +
           appr_algo->offsetLevel0_,
         0,
         appr_algo->size_data_per_element_);

  // Initialisation of the data and label
  // (getExternalLabeLp is hnswlib's own (sic) spelling.)
  memcpy(appr_algo->getExternalLabeLp(cur_c), &real_index, sizeof(hnswlib::labeltype));
  memcpy(appr_algo->getDataByInternalId(cur_c),
         dataset.data_handle() + real_index * dataset.extent(1),
         appr_algo->data_size_);

  if (curlevel) {
    // Allocate zeroed link lists for layers 1..curlevel; the `+ 1` appears to
    // mirror hnswlib's own addPoint allocation.
    appr_algo->linkLists_[cur_c] = (char*)malloc(appr_algo->size_links_per_element_ * curlevel + 1);
    if (appr_algo->linkLists_[cur_c] == nullptr)
      throw std::runtime_error("Not enough memory: addPoint failed to allocate linklist");
    memset(appr_algo->linkLists_[cur_c], 0, appr_algo->size_links_per_element_ * curlevel + 1);
  }
  return cur_c;
}

/**
 * @brief Build a k-NN graph over `dataset` on the GPU via NN-Descent, writing
 * the resulting neighbor ids into the caller-provided `neighbors` matrix.
 *
 * @param res       raft resources handle
 * @param dataset   host matrix of input points (one row per point)
 * @param neighbors output matrix; its extent(1) determines the graph degree
 * @param metric    distance metric used to build the graph
 */
template <typename T>
void all_neighbors_graph(raft::resources const& res,
                         raft::host_matrix_view<const T, int64_t, raft::row_major> dataset,
                         raft::host_matrix_view<uint32_t, int64_t, raft::row_major> neighbors,
                         cuvs::distance::DistanceType metric)
{
  nn_descent::index_params nn_params;
  nn_params.graph_degree = neighbors.extent(1);
  // give NN-Descent headroom: intermediate degree is twice the final degree
  nn_params.intermediate_graph_degree = neighbors.extent(1) * 2;
  nn_params.metric                    = metric;
  nn_params.return_distances          = false;  // only the neighbor ids are needed
  // The returned index is not read: `build` writes the graph directly into
  // `neighbors` (the local exists only to own the result until scope exit).
  [[maybe_unused]] auto nn_index = nn_descent::build(res, nn_params, dataset, neighbors);
}

/**
 * @brief Convert a CAGRA index to an HNSW index with a GPU-built hierarchy.
 *
 * Points are assigned random levels, bucket-sorted by level, and the upper
 * layers are linked using GPU-built k-NN graphs (see `all_neighbors_graph`);
 * the CAGRA graph itself becomes the HNSW base layer.
 *
 * @param res         raft resources handle
 * @param params      HNSW conversion parameters (ef_construction, num_threads)
 * @param cagra_index source CAGRA index (graph + optionally dataset)
 * @param dataset     optional host dataset; if absent, the dataset is copied
 *                    out of `cagra_index`
 * @return the populated HNSW index
 */
template <typename T, HnswHierarchy hierarchy>
std::enable_if_t<hierarchy == HnswHierarchy::GPU, std::unique_ptr<index<T>>> from_cagra(
  raft::resources const& res,
  const index_params& params,
  const cuvs::neighbors::cagra::index<T, uint32_t>& cagra_index,
  std::optional<raft::host_matrix_view<const T, int64_t, raft::row_major>> dataset)
{
  // Obtain a host-resident, padding-free view of the dataset: either the
  // caller-provided one, or a copy of the (possibly padded) CAGRA dataset.
  auto host_dataset = raft::make_host_matrix<T, int64_t>(0, 0);
  raft::host_matrix_view<const T, int64_t, raft::row_major> host_dataset_view(
    host_dataset.data_handle(), host_dataset.extent(0), host_dataset.extent(1));
  if (dataset.has_value()) {
    host_dataset_view = dataset.value();
  } else {
    // move dataset to host, remove padding
    auto cagra_dataset = cagra_index.dataset();
    host_dataset =
      raft::make_host_matrix<T, int64_t>(cagra_dataset.extent(0), cagra_dataset.extent(1));
    // 2D copy strips the row padding (stride(0) may exceed extent(1))
    RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(),
                                    sizeof(T) * host_dataset.extent(1),
                                    cagra_dataset.data_handle(),
                                    sizeof(T) * cagra_dataset.stride(0),
                                    sizeof(T) * host_dataset.extent(1),
                                    cagra_dataset.extent(0),
                                    cudaMemcpyDefault,
                                    raft::resource::get_cuda_stream(res)));
    raft::resource::sync_stream(res);
    host_dataset_view = host_dataset.view();
  }

  // initialize hnsw index
  auto hnsw_index =
    std::make_unique<index_impl<T>>(host_dataset_view.extent(1), cagra_index.metric(), hierarchy);
  // M = cagra_degree / 2 — presumably so hnswlib's base layer (2*M links)
  // matches the CAGRA graph degree; confirm against hnswlib's ctor contract.
  auto appr_algo = std::make_unique<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>>(
    hnsw_index->get_space(),
    host_dataset_view.extent(0),
    cagra_index.graph().extent(1) / 2,
    params.ef_construction);

  // assign a level to each point and initialize the points in hnsw
  std::vector<size_t> levels(host_dataset_view.extent(0));
  std::vector<uint32_t> hnsw_internal_ids(host_dataset_view.extent(0));

  auto num_threads = params.num_threads == 0 ? omp_get_max_threads() : params.num_threads;
  // NOTE(review): getRandomLevel and initialize_point_in_hnsw mutate shared
  // hnswlib state from multiple threads — presumably safe with cuVS's patched
  // hnswlib; verify against the applied patch.
#pragma omp parallel for num_threads(num_threads)
  for (int64_t i = 0; i < host_dataset_view.extent(0); i++) {
    // levels are stored 1-based here; hnswlib levels are 0-based
    levels[i] = appr_algo->getRandomLevel(appr_algo->mult_) + 1;
    hnsw_internal_ids[i] =
      initialize_point_in_hnsw(appr_algo.get(), host_dataset_view, i, levels[i] - 1);
  }

  // sort the points by levels
  // build histogram: hist[l] = number of points whose top layer is l
  std::vector<size_t> hist;
  std::vector<size_t> order(host_dataset_view.extent(0));
  for (int64_t i = 0; i < host_dataset_view.extent(0); i++) {
    auto pt_level = levels[i] - 1;
    while (pt_level >= hist.size())
      hist.push_back(0);
    hist[pt_level]++;
  }

  // accumulate: offsets[l] = start of the level-l bucket in `order`
  // NOTE(review): assumes a non-empty dataset (hist.size() >= 1);
  // hist.size() - 1 would wrap on an empty input.
  std::vector<size_t> offsets(hist.size() + 1, 0);
  for (size_t i = 0; i < hist.size() - 1; i++) {
    offsets[i + 1] = offsets[i] + hist[i];
  }

  // bucket sort; afterwards offsets[l] points at the END of bucket l,
  // i.e. offsets[l - 1] is the start of bucket l
  for (int64_t i = 0; i < host_dataset_view.extent(0); i++) {
    auto pt_level = levels[i] - 1;
    order[offsets[pt_level]++] = i;
  }

  // set last point of the highest level as the entry point
  appr_algo->enterpoint_node_ = hnsw_internal_ids[order.back()];
  appr_algo->maxlevel_        = hist.size() - 1;

  // iterate over the points in the descending order of their levels;
  // each iteration links every point whose top layer is >= pt_level
  for (size_t pt_level = hist.size() - 1; pt_level >= 1; pt_level--) {
    auto start_idx     = offsets[pt_level - 1];     // start of bucket pt_level (post-sort)
    auto end_idx       = offsets[hist.size() - 1];  // end of the last bucket
    auto num_pts       = end_idx - start_idx;
    auto neighbor_size = num_pts > appr_algo->M_ ? appr_algo->M_ : num_pts - 1;
    if (num_pts <= 1) {
      // this means only 1 point in the level
      continue;
    }

    // gather points from dataset to form query set on host
    auto host_query_set = raft::make_host_matrix<T, int64_t>(num_pts, host_dataset_view.extent(1));
    // TODO: Use `raft::matrix::gather` when available as a public API
    // Issue: https://github.com/rapidsai/raft/issues/2572
#pragma omp parallel for num_threads(num_threads)
    for (auto i = start_idx; i < end_idx; i++) {
      auto pt_id = order[i];
      std::copy(&host_dataset_view(pt_id, 0),
                &host_dataset_view(pt_id + 1, 0),
                &host_query_set(i - start_idx, 0));
    }

    // find neighbors of the query set (GPU NN-Descent)
    auto host_neighbors = raft::make_host_matrix<uint32_t, int64_t>(num_pts, neighbor_size);
    all_neighbors_graph(res,
                        raft::make_const_mdspan(host_query_set.view()),
                        host_neighbors.view(),
                        cagra_index.metric());

    // add points to the HNSW index upper layers
#pragma omp parallel for num_threads(num_threads)
    for (auto i = start_idx; i < end_idx; i++) {
      auto pt_id       = order[i];
      auto internal_id = hnsw_internal_ids[pt_id];
      auto ll_cur      = appr_algo->get_linklist(internal_id, pt_level);
      appr_algo->setListCount(ll_cur, host_neighbors.extent(1));
      auto* data     = (uint32_t*)(ll_cur + 1);
      auto neighbors = &host_neighbors(i - start_idx, 0);
      for (auto j = 0; j < host_neighbors.extent(1); j++) {
        // neighbors[j] indexes the query subset; map back through `order`
        // to a dataset row id, then to an hnswlib-internal id
        auto neighbor_id          = order[neighbors[j] + start_idx];
        auto neighbor_internal_id = hnsw_internal_ids[neighbor_id];
        data[j]                   = neighbor_internal_id;
      }
    }
  }

  // move cagra graph to host or access it from host if available
  auto host_graph_view = cagra_index.graph();
  auto host_graph      = raft::make_host_matrix<uint32_t, int64_t>(0, 0);
  if (!raft::is_host_accessible(raft::memory_type_from_pointer(host_graph_view.data_handle()))) {
    // copy cagra graph to host
    host_graph = raft::make_host_matrix<uint32_t, int64_t>(host_graph_view.extent(0),
                                                           host_graph_view.extent(1));
    raft::copy(host_graph.data_handle(),
               host_graph_view.data_handle(),
               host_graph_view.size(),
               raft::resource::get_cuda_stream(res));
    raft::resource::sync_stream(res);
    host_graph_view = host_graph.view();
  }

  // copy cagra graph to hnswlib base layer, remapping CAGRA row ids
  // to hnswlib-internal ids
#pragma omp parallel for num_threads(num_threads)
  for (size_t i = 0; i < static_cast<size_t>(host_graph_view.extent(0)); ++i) {
    auto ll_i = appr_algo->get_linklist0(hnsw_internal_ids[i]);
    appr_algo->setListCount(ll_i, host_graph_view.extent(1));
    auto* data = (uint32_t*)(ll_i + 1);
    for (size_t j = 0; j < static_cast<size_t>(host_graph_view.extent(1)); ++j) {
      data[j] = hnsw_internal_ids[host_graph_view(i, j)];
    }
  }

  hnsw_index->set_index(std::move(appr_algo));
  return hnsw_index;
}

template <typename T>
std::unique_ptr<index<T>> from_cagra(
raft::resources const& res,
Expand All @@ -217,8 +423,9 @@ std::unique_ptr<index<T>> from_cagra(
return from_cagra<T, HnswHierarchy::NONE>(res, params, cagra_index, dataset);
} else if (params.hierarchy == HnswHierarchy::CPU) {
return from_cagra<T, HnswHierarchy::CPU>(res, params, cagra_index, dataset);
}
{
} else if (params.hierarchy == HnswHierarchy::GPU) {
return from_cagra<T, HnswHierarchy::GPU>(res, params, cagra_index, dataset);
} else {
RAFT_FAIL("Unsupported hierarchy type");
}
}
Expand Down Expand Up @@ -269,9 +476,9 @@ void search(raft::resources const& res,
raft::host_matrix_view<uint64_t, int64_t, raft::row_major> neighbors,
raft::host_matrix_view<float, int64_t, raft::row_major> distances)
{
RAFT_EXPECTS(
queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
"Number of rows in output neighbors and distances matrices must equal the number of queries.");
RAFT_EXPECTS(queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
"Number of rows in output neighbors and distances matrices must equal the number of "
"queries.");

RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1),
"Number of columns in output neighbors and distances matrices must equal k");
Expand Down
1 change: 1 addition & 0 deletions python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil:
ctypedef enum cuvsHnswHierarchy:
NONE
CPU
GPU

ctypedef struct cuvsHnswIndexParams:
cuvsHnswHierarchy hierarchy
Expand Down
Loading

0 comments on commit 47f5368

Please sign in to comment.