From 1b62c4117a35b11ce3c830daae248e32ebf75e3f Mon Sep 17 00:00:00 2001
From: Victor Lafargue <viclafargue@nvidia.com>
Date: Fri, 10 Jan 2025 22:29:03 +0100
Subject: [PATCH 1/2] Fix lanczos solver integer overflow (#2536)

Partially answers https://github.com/rapidsai/cuml/issues/6204

Authors:
  - Victor Lafargue (https://github.com/viclafargue)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/raft/pull/2536
---
 cpp/include/raft/sparse/detail/coo.cuh        | 14 ++++++++-----
 .../raft/sparse/solver/detail/lanczos.cuh     | 21 +++++++++++--------
 .../raft/spectral/detail/matrix_wrappers.hpp  |  8 +++----
 3 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/cpp/include/raft/sparse/detail/coo.cuh b/cpp/include/raft/sparse/detail/coo.cuh
index 91ba363168..9a38c11a07 100644
--- a/cpp/include/raft/sparse/detail/coo.cuh
+++ b/cpp/include/raft/sparse/detail/coo.cuh
@@ -182,7 +182,7 @@ class COO {
    * @param n_rows: number of rows in the dense matrix
    * @param n_cols: number of columns in the dense matrix
    */
-  void setSize(int n_rows, int n_cols)
+  void setSize(Index_Type n_rows, Index_Type n_cols)
   {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
@@ -192,7 +192,7 @@ class COO {
    * @brief Set the number of rows and cols for a square dense matrix
    * @param n: number of rows and cols
    */
-  void setSize(int n)
+  void setSize(Index_Type n)
   {
     this->n_rows = n;
     this->n_cols = n;
@@ -204,7 +204,10 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: CUDA stream to use
    */
-  void allocate(int nnz, bool init, cudaStream_t stream) { this->allocate(nnz, 0, init, stream); }
+  void allocate(Index_Type nnz, bool init, cudaStream_t stream)
+  {
+    this->allocate(nnz, 0, init, stream);
+  }
 
   /**
    * @brief Allocate the underlying arrays
@@ -213,7 +216,7 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: CUDA stream to use
    */
-  void allocate(int nnz, int size, bool init, cudaStream_t stream)
+  void allocate(Index_Type nnz, Index_Type size, bool init, cudaStream_t stream)
   {
     this->allocate(nnz, size, size, init, stream);
   }
@@ -226,7 +229,8 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: stream to use for init
    */
-  void allocate(int nnz, int n_rows, int n_cols, bool init, cudaStream_t stream)
+  void allocate(
+    Index_Type nnz, Index_Type n_rows, Index_Type n_cols, bool init, cudaStream_t stream)
   {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
index 6f03f77bc0..ddfa01731a 100644
--- a/cpp/include/raft/sparse/solver/detail/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
@@ -624,7 +624,7 @@ static int lanczosRestart(raft::resources const& handle,
   value_type_t* shifts_host;
 
   // Orthonormal matrix for similarity transform
-  value_type_t* V_dev = work_dev + n * iter;
+  value_type_t* V_dev = work_dev + (size_t)n * (size_t)iter;
 
   // -------------------------------------------------------
   // Implementation
@@ -641,7 +641,7 @@ static int lanczosRestart(raft::resources const& handle,
   // std::cout <<std::endl;
 
   // Initialize similarity transform with identity matrix
-  memset(V_host, 0, iter * iter * sizeof(value_type_t));
+  memset(V_host, 0, (size_t)iter * (size_t)iter * (size_t)sizeof(value_type_t));
   for (i = 0; i < iter; ++i)
     V_host[IDX(i, i, iter)] = 1;
 
@@ -679,8 +679,11 @@ static int lanczosRestart(raft::resources const& handle,
       WARNING("error in implicitly shifted QR algorithm");
 
   // Obtain new residual
-  RAFT_CUDA_TRY(cudaMemcpyAsync(
-    V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream));
+  RAFT_CUDA_TRY(cudaMemcpyAsync(V_dev,
+                                V_host,
+                                (size_t)iter * (size_t)iter * (size_t)sizeof(value_type_t),
+                                cudaMemcpyHostToDevice,
+                                stream));
 
   beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
   RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_h,
@@ -716,7 +719,7 @@ static int lanczosRestart(raft::resources const& handle,
 
   RAFT_CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev,
                                 work_dev,
-                                n * iter_new * sizeof(value_type_t),
+                                (size_t)n * (size_t)iter_new * (size_t)sizeof(value_type_t),
                                 cudaMemcpyDeviceToDevice,
                                 stream));
 
@@ -1045,10 +1048,10 @@ int computeSmallestEigenvectors(
   unsigned long long seed = 1234567)
 {
   // Matrix dimension
-  index_type_t n = A.nrows_;
+  size_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && (size_t)nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
@@ -1395,10 +1398,10 @@ int computeLargestEigenvectors(
   unsigned long long seed = 123456)
 {
   // Matrix dimension
-  index_type_t n = A.nrows_;
+  size_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && (size_t)nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
index 1fe078bd32..db8a5dc9ef 100644
--- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
@@ -39,14 +39,14 @@
 // =========================================================
 
 // Get index of matrix entry
-#define IDX(i, j, lda) ((i) + (j) * (lda))
+#define IDX(i, j, lda) ((size_t)(i) + (j) * (lda))
 
 namespace raft {
 namespace spectral {
 namespace matrix {
 namespace detail {
 
-using size_type = int;  // for now; TODO: move it in appropriate header
+using size_type = size_t;  // for now; TODO: move it in appropriate header
 
 // Apply diagonal matrix to vector:
 //
@@ -326,7 +326,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
         raft_handle, row_offsets, col_indices, values, nrows, nnz),
       diagonal_(raft_handle, nrows)
   {
-    vector_t<value_type> ones{raft_handle, nrows};
+    vector_t<value_type> ones{raft_handle, (size_t)nrows};
     ones.fill(1.0);
     sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
   }
@@ -341,7 +341,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
                                               csr_m.nnz_),
       diagonal_(raft_handle, csr_m.nrows_)
   {
-    vector_t<value_type> ones{raft_handle, csr_m.nrows_};
+    vector_t<value_type> ones{raft_handle, (size_t)csr_m.nrows_};
     ones.fill(1.0);
     sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
   }

From 5c826d7320486852c30a18f6e039d0cda83c5c62 Mon Sep 17 00:00:00 2001
From: Micka <mide@nvidia.com>
Date: Tue, 14 Jan 2025 00:34:20 +0100
Subject: [PATCH 2/2] Add support for different data type of bitset (#2535)

This PR is useful for Milvus.
Previously the `bitset_view` object only supported the data type used to create the bitset. With the proposed modifications, a `bitset_view` object can support any data type used to create the bitset by specifying the `original_nbits` parameter in the class constructor.

Authors:
  - Micka (https://github.com/lowener)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - rhdong (https://github.com/rhdong)

URL: https://github.com/rapidsai/raft/pull/2535
---
 cpp/include/raft/core/bitmap.hpp | 24 ++++++--
 cpp/include/raft/core/bitset.cuh | 53 ++++++++++++++---
 cpp/include/raft/core/bitset.hpp | 34 +++++++++--
 cpp/test/core/bitset.cu          | 98 ++++++++++++++++++++++++++++++--
 4 files changed, 188 insertions(+), 21 deletions(-)

diff --git a/cpp/include/raft/core/bitmap.hpp b/cpp/include/raft/core/bitmap.hpp
index 86b2d77478..5a6656f572 100644
--- a/cpp/include/raft/core/bitmap.hpp
+++ b/cpp/include/raft/core/bitmap.hpp
@@ -53,9 +53,18 @@ struct bitmap_view : public bitset_view<bitmap_t, index_t> {
    * @param bitmap_ptr Device raw pointer
    * @param rows Number of row in the matrix.
    * @param cols Number of col in the matrix.
+   * @param original_nbits Original number of bits used when the bitmap was created, to handle
+   * potential mismatches of data types. This is useful for using ANN indexes when a bitmap was
+   * originally created with a different data type than the ones currently supported in cuVS ANN
+   * indexes.
    */
-  _RAFT_HOST_DEVICE bitmap_view(bitmap_t* bitmap_ptr, index_t rows, index_t cols)
-    : bitset_view<bitmap_t, index_t>(bitmap_ptr, rows * cols), rows_(rows), cols_(cols)
+  _RAFT_HOST_DEVICE bitmap_view(bitmap_t* bitmap_ptr,
+                                index_t rows,
+                                index_t cols,
+                                index_t original_nbits = 0)
+    : bitset_view<bitmap_t, index_t>(bitmap_ptr, rows * cols, original_nbits),
+      rows_(rows),
+      cols_(cols)
   {
   }
 
@@ -65,11 +74,18 @@ struct bitmap_view : public bitset_view<bitmap_t, index_t> {
    * @param bitmap_span Device vector view of the bitmap
    * @param rows Number of row in the matrix.
    * @param cols Number of col in the matrix.
+   * @param original_nbits Original number of bits used when the bitmap was created, to handle
+   * potential mismatches of data types. This is useful for using ANN indexes when a bitmap was
+   * originally created with a different data type than the ones currently supported in cuVS ANN
+   * indexes.
    */
   _RAFT_HOST_DEVICE bitmap_view(raft::device_vector_view<bitmap_t, index_t> bitmap_span,
                                 index_t rows,
-                                index_t cols)
-    : bitset_view<bitmap_t, index_t>(bitmap_span, rows * cols), rows_(rows), cols_(cols)
+                                index_t cols,
+                                index_t original_nbits = 0)
+    : bitset_view<bitmap_t, index_t>(bitmap_span, rows * cols, original_nbits),
+      rows_(rows),
+      cols_(cols)
   {
   }
 
diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh
index d1bffdb81e..feaef1a172 100644
--- a/cpp/include/raft/core/bitset.cuh
+++ b/cpp/include/raft/core/bitset.cuh
@@ -32,12 +32,41 @@
 
 namespace raft::core {
 
+template <typename index_t>
+_RAFT_HOST_DEVICE void inline compute_original_nbits_position(const index_t original_nbits,
+                                                              const index_t nbits,
+                                                              const index_t sample_index,
+                                                              index_t& new_bit_index,
+                                                              index_t& new_bit_offset)
+{
+  const index_t original_bit_index  = sample_index / original_nbits;
+  const index_t original_bit_offset = sample_index % original_nbits;
+  new_bit_index                     = original_bit_index * original_nbits / nbits;
+  new_bit_offset                    = 0;
+  if (original_nbits > nbits) {
+    new_bit_index += original_bit_offset / nbits;
+    new_bit_offset = original_bit_offset % nbits;
+  } else {
+    index_t ratio = nbits / original_nbits;
+    new_bit_offset += (original_bit_index % ratio) * original_nbits;
+    new_bit_offset += original_bit_offset % nbits;
+  }
+}
+
 template <typename bitset_t, typename index_t>
 _RAFT_HOST_DEVICE inline bool bitset_view<bitset_t, index_t>::test(const index_t sample_index) const
 {
-  const bitset_t bit_element = bitset_ptr_[sample_index / bitset_element_size];
-  const index_t bit_index    = sample_index % bitset_element_size;
-  const bool is_bit_set      = (bit_element & (bitset_t{1} << bit_index)) != 0;
+  const index_t nbits = sizeof(bitset_t) * 8;
+  index_t bit_index   = 0;
+  index_t bit_offset  = 0;
+  if (original_nbits_ == 0 || nbits == original_nbits_) {
+    bit_index  = sample_index / bitset_element_size;
+    bit_offset = sample_index % bitset_element_size;
+  } else {
+    compute_original_nbits_position(original_nbits_, nbits, sample_index, bit_index, bit_offset);
+  }
+  const bitset_t bit_element = bitset_ptr_[bit_index];
+  const bool is_bit_set      = (bit_element & (bitset_t{1} << bit_offset)) != 0;
   return is_bit_set;
 }
 
@@ -51,14 +80,22 @@ template <typename bitset_t, typename index_t>
 _RAFT_DEVICE void bitset_view<bitset_t, index_t>::set(const index_t sample_index,
                                                       bool set_value) const
 {
-  const index_t bit_element = sample_index / bitset_element_size;
-  const index_t bit_index   = sample_index % bitset_element_size;
-  const bitset_t bitmask    = bitset_t{1} << bit_index;
+  const index_t nbits = sizeof(bitset_t) * 8;
+  index_t bit_index   = 0;
+  index_t bit_offset  = 0;
+
+  if (original_nbits_ == 0 || nbits == original_nbits_) {
+    bit_index  = sample_index / bitset_element_size;
+    bit_offset = sample_index % bitset_element_size;
+  } else {
+    compute_original_nbits_position(original_nbits_, nbits, sample_index, bit_index, bit_offset);
+  }
+  const bitset_t bitmask = bitset_t{1} << bit_offset;
   if (set_value) {
-    atomicOr(bitset_ptr_ + bit_element, bitmask);
+    atomicOr(bitset_ptr_ + bit_index, bitmask);
   } else {
     const bitset_t bitmask2 = ~bitmask;
-    atomicAnd(bitset_ptr_ + bit_element, bitmask2);
+    atomicAnd(bitset_ptr_ + bit_index, bitmask2);
   }
 }
 
diff --git a/cpp/include/raft/core/bitset.hpp b/cpp/include/raft/core/bitset.hpp
index be828def87..e4bea2c0c5 100644
--- a/cpp/include/raft/core/bitset.hpp
+++ b/cpp/include/raft/core/bitset.hpp
@@ -42,8 +42,20 @@ template <typename bitset_t = uint32_t, typename index_t = uint32_t>
 struct bitset_view {
   static constexpr index_t bitset_element_size = sizeof(bitset_t) * 8;
 
-  _RAFT_HOST_DEVICE bitset_view(bitset_t* bitset_ptr, index_t bitset_len)
-    : bitset_ptr_{bitset_ptr}, bitset_len_{bitset_len}
+  /**
+   * @brief Create a bitset view from a device pointer to the bitset.
+   *
+   * @param bitset_ptr Device pointer to the bitset
+   * @param bitset_len Number of bits in the bitset
+   * @param original_nbits Original number of bits used when the bitset was created, to handle
+   * potential mismatches of data types. This is useful for using ANN indexes when a bitset was
+   * originally created with a different data type than the ones currently supported in cuVS ANN
+   * indexes.
+   */
+  _RAFT_HOST_DEVICE bitset_view(bitset_t* bitset_ptr,
+                                index_t bitset_len,
+                                index_t original_nbits = 0)
+    : bitset_ptr_{bitset_ptr}, bitset_len_{bitset_len}, original_nbits_{original_nbits}
   {
   }
   /**
@@ -51,10 +63,17 @@ struct bitset_view {
    *
    * @param bitset_span Device vector view of the bitset
    * @param bitset_len Number of bits in the bitset
+   * @param original_nbits Original number of bits used when the bitset was created, to handle
+   * potential mismatches of data types. This is useful for using ANN indexes when a bitset was
+   * originally created with a different data type than the ones currently supported in cuVS ANN
+   * indexes.
    */
   _RAFT_HOST_DEVICE bitset_view(raft::device_vector_view<bitset_t, index_t> bitset_span,
-                                index_t bitset_len)
-    : bitset_ptr_{bitset_span.data_handle()}, bitset_len_{bitset_len}
+                                index_t bitset_len,
+                                index_t original_nbits = 0)
+    : bitset_ptr_{bitset_span.data_handle()},
+      bitset_len_{bitset_len},
+      original_nbits_{original_nbits}
   {
   }
   /**
@@ -180,9 +199,16 @@ struct bitset_view {
     return (bitset_len + bits_per_element - 1) / bits_per_element;
   }
 
+  /**
+   * @brief Get the original number of bits of the bitset.
+   */
+  auto get_original_nbits() const -> index_t { return original_nbits_; }
+  void set_original_nbits(index_t original_nbits) { original_nbits_ = original_nbits; }
+
  private:
   bitset_t* bitset_ptr_;
   index_t bitset_len_;
+  index_t original_nbits_;
 };
 
 /**
diff --git a/cpp/test/core/bitset.cu b/cpp/test/core/bitset.cu
index ac601274c1..f094f60ded 100644
--- a/cpp/test/core/bitset.cu
+++ b/cpp/test/core/bitset.cu
@@ -24,6 +24,8 @@
 #include <gtest/gtest.h>
 
 #include <algorithm>
+#include <cstdint>
+#include <cstdlib>
 #include <numeric>
 
 namespace raft::core {
@@ -73,6 +75,40 @@ void test_cpu_bitset(const std::vector<bitset_t>& bitset,
   }
 }
 
+template <typename bitset_t, typename index_t>
+void test_cpu_bitset_nbits(const bitset_t* bitset,
+                           const std::vector<index_t>& queries,
+                           std::vector<uint8_t>& result,
+                           unsigned original_nbits_)
+{
+  constexpr size_t nbits = sizeof(bitset_t) * 8;
+  if (original_nbits_ == nbits) {
+    for (size_t i = 0; i < queries.size(); i++) {
+      result[i] =
+        uint8_t((bitset[queries[i] / nbits] & (bitset_t{1} << (queries[i] % nbits))) != 0);
+    }
+  }
+  for (size_t i = 0; i < queries.size(); i++) {
+    const index_t sample_index        = queries[i];
+    const index_t original_bit_index  = sample_index / original_nbits_;
+    const index_t original_bit_offset = sample_index % original_nbits_;
+    index_t new_bit_index             = original_bit_index * original_nbits_ / nbits;
+    index_t new_bit_offset            = 0;
+    if (original_nbits_ > nbits) {
+      new_bit_index += original_bit_offset / nbits;
+      new_bit_offset = original_bit_offset % nbits;
+    } else {
+      index_t ratio = nbits / original_nbits_;
+      new_bit_offset += (original_bit_index % ratio) * original_nbits_;
+      new_bit_offset += original_bit_offset % nbits;
+    }
+    const bitset_t bit_element = bitset[new_bit_index];
+    const bool is_bit_set      = (bit_element & (bitset_t{1} << new_bit_offset)) != 0;
+
+    result[i] = uint8_t(is_bit_set);
+  }
+}
+
 template <typename bitset_t>
 void flip_cpu_bitset(std::vector<bitset_t>& bitset)
 {
@@ -168,11 +204,12 @@ class BitsetTest : public testing::TestWithParam<test_spec_bitset> {
     resource::sync_stream(res, stream);
     ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
 
-    auto query_device  = raft::make_device_vector<index_t, index_t>(res, spec.query_len);
-    auto result_device = raft::make_device_vector<uint8_t, index_t>(res, spec.query_len);
-    auto query_cpu     = std::vector<index_t>(spec.query_len);
-    auto result_cpu    = std::vector<uint8_t>(spec.query_len);
-    auto result_ref    = std::vector<uint8_t>(spec.query_len);
+    auto query_device     = raft::make_device_vector<index_t, index_t>(res, spec.query_len);
+    auto result_device    = raft::make_device_vector<uint8_t, index_t>(res, spec.query_len);
+    auto query_cpu        = std::vector<index_t>(spec.query_len);
+    auto result_cpu       = std::vector<uint8_t>(spec.query_len);
+    auto result_ref_nbits = std::vector<uint8_t>(spec.query_len);
+    auto result_ref       = std::vector<uint8_t>(spec.query_len);
 
     // Create queries and verify the test results
     raft::random::uniformInt(res, rng, query_device.view(), index_t(0), index_t(spec.bitset_len));
@@ -194,6 +231,57 @@ class BitsetTest : public testing::TestWithParam<test_spec_bitset> {
     resource::sync_stream(res, stream);
     ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
 
+    // Reinterpret the bitset as uint8_t, uint32 then uint64_t
+    {
+      // Test CPU logic
+      test_cpu_bitset(bitset_ref, query_cpu, result_ref);
+      uint8_t* bitset_cpu_uint8 = (uint8_t*)std::malloc(sizeof(bitset_t) * bitset_ref.size());
+      std::memcpy(bitset_cpu_uint8, bitset_ref.data(), sizeof(bitset_t) * bitset_ref.size());
+      test_cpu_bitset_nbits(bitset_cpu_uint8, query_cpu, result_ref_nbits, sizeof(bitset_t) * 8);
+      ASSERT_TRUE(hostVecMatch(result_ref, result_ref_nbits, raft::Compare<uint8_t>()));
+      std::free(bitset_cpu_uint8);
+
+      // Test GPU uint8_t, uint32_t, uint64_t
+      auto my_bitset_view_uint8_t = raft::core::bitset_view<uint8_t, uint32_t>(
+        reinterpret_cast<uint8_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
+      raft::linalg::map(
+        res,
+        result_device.view(),
+        [my_bitset_view_uint8_t] __device__(index_t query) {
+          return my_bitset_view_uint8_t.test(query);
+        },
+        raft::make_const_mdspan(query_device.view()));
+      update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
+      resource::sync_stream(res, stream);
+      ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
+
+      auto my_bitset_view_uint32_t = raft::core::bitset_view<uint32_t, uint32_t>(
+        reinterpret_cast<uint32_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
+      raft::linalg::map(
+        res,
+        result_device.view(),
+        [my_bitset_view_uint32_t] __device__(index_t query) {
+          return my_bitset_view_uint32_t.test(query);
+        },
+        raft::make_const_mdspan(query_device.view()));
+      update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
+      resource::sync_stream(res, stream);
+      ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
+
+      auto my_bitset_view_uint64_t = raft::core::bitset_view<uint64_t, uint32_t>(
+        reinterpret_cast<uint64_t*>(my_bitset.data()), my_bitset.size(), sizeof(bitset_t) * 8);
+      raft::linalg::map(
+        res,
+        result_device.view(),
+        [my_bitset_view_uint64_t] __device__(index_t query) {
+          return my_bitset_view_uint64_t.test(query);
+        },
+        raft::make_const_mdspan(query_device.view()));
+      update_host(result_cpu.data(), result_device.data_handle(), result_device.extent(0), stream);
+      resource::sync_stream(res, stream);
+      ASSERT_TRUE(hostVecMatch(result_ref, result_cpu, Compare<uint8_t>()));
+    }
+
     // test sparsity, repeat and eval_n_elements
     {
       auto my_bitset_view  = my_bitset.view();