diff --git a/benchmarks/cugraph-dgl/pytest-based/bench_cugraph_dgl_uniform_neighbor_sample.py b/benchmarks/cugraph-dgl/pytest-based/bench_cugraph_dgl_uniform_neighbor_sample.py index dc961223cb1..a22938dcdbe 100644 --- a/benchmarks/cugraph-dgl/pytest-based/bench_cugraph_dgl_uniform_neighbor_sample.py +++ b/benchmarks/cugraph-dgl/pytest-based/bench_cugraph_dgl_uniform_neighbor_sample.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,10 +22,6 @@ import numpy as np import cupy as cp -# Facing issues with rapids-pytest-benchmark plugin -# pytest-benchmark. -import pytest_benchmark - from cugraph.generators import rmat from cugraph.experimental import datasets from cugraph_benchmarking import params diff --git a/benchmarks/cugraph-service/pytest-based/bench_cgs_uniform_neighbor_sample.py b/benchmarks/cugraph-service/pytest-based/bench_cgs_uniform_neighbor_sample.py index 28849a34c0d..e24fdfc5bac 100644 --- a/benchmarks/cugraph-service/pytest-based/bench_cgs_uniform_neighbor_sample.py +++ b/benchmarks/cugraph-service/pytest-based/bench_cgs_uniform_neighbor_sample.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,18 +18,6 @@ import pytest import numpy as np -# If the rapids-pytest-benchmark plugin is installed, the "gpubenchmark" -# fixture will be available automatically. Check that this fixture is available -# by trying to import rapids_pytest_benchmark, and if that fails, set -# "gpubenchmark" to the standard "benchmark" fixture provided by -# pytest-benchmark. -try: - import rapids_pytest_benchmark # noqa: F401 -except ImportError: - import pytest_benchmark - - gpubenchmark = pytest_benchmark.plugin.benchmark - from cugraph_service_client import CugraphServiceClient from cugraph_service_client.exceptions import CugraphServiceError from cugraph_service_client import RemoteGraph @@ -178,7 +166,7 @@ def remote_graph_objs(request): "with_replacement", [False], ids=lambda v: f"with_replacement={v}" ) def bench_cgs_uniform_neighbor_sample( - gpubenchmark, remote_graph_objs, batch_size, fanout, with_replacement + benchmark, remote_graph_objs, batch_size, fanout, with_replacement ): (G, num_verts, uniform_neighbor_sample_func) = remote_graph_objs @@ -188,7 +176,7 @@ def bench_cgs_uniform_neighbor_sample( ) # print(f"\n{uns_args}") # FIXME: uniform_neighbor_sample cannot take a np.ndarray for start_list - result = gpubenchmark( + result = benchmark( uniform_neighbor_sample_func, G, start_list=uns_args["start_list"], diff --git a/benchmarks/cugraph/pytest-based/README.md b/benchmarks/cugraph/pytest-based/README.md index fe4acfd4de0..be765d8f6d4 100644 --- a/benchmarks/cugraph/pytest-based/README.md +++ b/benchmarks/cugraph/pytest-based/README.md @@ -13,9 +13,6 @@ directory under the root of the `cuGraph` source tree. * cugraph built and installed (or `cugraph` sources and built C++ extensions available on `PYTHONPATH`) -* rapids-pytest-benchmark pytest plugin (`conda install -c rapidsai - rapids-pytest-benchmark`) - * The benchmark datasets downloaded and installed in /datasets. 
Run the script below from the /datasets directory: ``` @@ -25,8 +22,7 @@ cd /datasets ## Usage (Python) ### Python -* Run `pytest --help` (with the rapids-pytest-benchmark plugin installed) for - the full list of options +* Run `pytest --help` for the full list of options * See also the `pytest.ini` file in this directory for examples of how to enable options by default and define marks @@ -44,9 +40,9 @@ _**NOTE: these commands must be run from the `/benchmarks` directo (rapids) user@machine:/cugraph/benchmarks> pytest -x ``` -* Run all the benchmarks but do not reinit RMM with different configurations +* Run all the benchmarks and allow RMM to reinit with different configurations ``` -(rapids) user@machine:/cugraph/benchmarks> pytest --no-rmm-reinit +(rapids) user@machine:/cugraph/benchmarks> pytest --allow-rmm-reinit ``` * Show what benchmarks would be run with the given options, but do not run them diff --git a/benchmarks/cugraph/pytest-based/bench_algos.py b/benchmarks/cugraph/pytest-based/bench_algos.py index 1c988ea636a..58a5d0a08b5 100644 --- a/benchmarks/cugraph/pytest-based/bench_algos.py +++ b/benchmarks/cugraph/pytest-based/bench_algos.py @@ -13,25 +13,6 @@ import pytest import numpy as np -import pytest_benchmark - -# FIXME: Remove this when rapids_pytest_benchmark.gpubenchmark is available -# everywhere -try: - from rapids_pytest_benchmark import setFixtureParamNames -except ImportError: - print( - "\n\nWARNING: rapids_pytest_benchmark is not installed, " - "falling back to pytest_benchmark fixtures.\n" - ) - - # if rapids_pytest_benchmark is not available, just perfrom time-only - # benchmarking and replace the util functions with nops - gpubenchmark = pytest_benchmark.plugin.benchmark - - def setFixtureParamNames(*args, **kwargs): - pass - import rmm import dask_cudf @@ -50,6 +31,7 @@ def setFixtureParamNames(*args, **kwargs): pool_allocator, ) + # duck-type compatible Dataset for RMAT data class RmatDataset: def __init__(self, scale=4, edgefactor=2, mg=False): @@ -198,11 +180,6 @@ def reinitRMM(managed_mem, pool_alloc): @pytest.fixture(scope="module", params=rmm_fixture_params) def rmm_config(request): - # Since parameterized fixtures do not assign param names to param values, - # manually call the helper to do so. Ensure the order of the name list - # passed to it matches if there are >1 params. - # If the request only contains n params, only the first n names are set. - setFixtureParamNames(request, ["managed_mem", "pool_allocator"]) reinitRMM(request.param[0], request.param[1]) @@ -215,7 +192,6 @@ def dataset(request, rmm_config): tests/fixtures are done with the Dataset, it has the Dask cluster and client torn down (if MG) and all data loaded is freed. """ - setFixtureParamNames(request, ["dataset"]) dataset = request.param[0] client = cluster = None # For now, only RmatDataset instanaces support MG and have a "mg" attr. @@ -283,8 +259,8 @@ def get_vertex_pairs(G, num_vertices=10): ############################################################################### # Benchmarks -def bench_create_graph(gpubenchmark, edgelist): - gpubenchmark( +def bench_create_graph(benchmark, edgelist): + benchmark( cugraph.from_cudf_edgelist, edgelist, source="src", @@ -298,8 +274,8 @@ def bench_create_graph(gpubenchmark, edgelist): # results in thousands of rounds before the default threshold is met, so lower # the max_time for this benchmark. 
@pytest.mark.benchmark(warmup=True, warmup_iterations=10, max_time=0.005) -def bench_create_digraph(gpubenchmark, edgelist): - gpubenchmark( +def bench_create_digraph(benchmark, edgelist): + benchmark( cugraph.from_cudf_edgelist, edgelist, source="src", @@ -309,26 +285,26 @@ def bench_create_digraph(gpubenchmark, edgelist): ) -def bench_renumber(gpubenchmark, edgelist): - gpubenchmark(NumberMap.renumber, edgelist, "src", "dst") +def bench_renumber(benchmark, edgelist): + benchmark(NumberMap.renumber, edgelist, "src", "dst") -def bench_pagerank(gpubenchmark, transposed_graph): +def bench_pagerank(benchmark, transposed_graph): pagerank = ( dask_cugraph.pagerank if is_graph_distributed(transposed_graph) else cugraph.pagerank ) - gpubenchmark(pagerank, transposed_graph) + benchmark(pagerank, transposed_graph) -def bench_bfs(gpubenchmark, graph): +def bench_bfs(benchmark, graph): bfs = dask_cugraph.bfs if is_graph_distributed(graph) else cugraph.bfs start = graph.edgelist.edgelist_df["src"][0] - gpubenchmark(bfs, graph, start) + benchmark(bfs, graph, start) -def bench_sssp(gpubenchmark, graph): +def bench_sssp(benchmark, graph): if not graph.is_weighted(): pytest.skip("Skipping: Unweighted Graphs are not supported by SSSP") @@ -340,102 +316,102 @@ def bench_sssp(gpubenchmark, graph): start = start_col.to_arrow().to_pylist()[0] - gpubenchmark(sssp, graph, start) + benchmark(sssp, graph, start) -def bench_jaccard(gpubenchmark, unweighted_graph): +def bench_jaccard(benchmark, unweighted_graph): G = unweighted_graph # algo cannot compute neighbors on all nodes without running into OOM # this is why we will call jaccard on a subset of nodes vert_pairs = get_vertex_pairs(G) jaccard = dask_cugraph.jaccard if is_graph_distributed(G) else cugraph.jaccard - gpubenchmark(jaccard, G, vert_pairs) + benchmark(jaccard, G, vert_pairs) -def bench_sorensen(gpubenchmark, unweighted_graph): +def bench_sorensen(benchmark, unweighted_graph): G = unweighted_graph # algo cannot compute neighbors on all nodes without running into OOM # this is why we will call sorensen on a subset of nodes vert_pairs = get_vertex_pairs(G) sorensen = dask_cugraph.sorensen if is_graph_distributed(G) else cugraph.sorensen - gpubenchmark(sorensen, G, vert_pairs) + benchmark(sorensen, G, vert_pairs) -def bench_louvain(gpubenchmark, graph): +def bench_louvain(benchmark, graph): louvain = dask_cugraph.louvain if is_graph_distributed(graph) else cugraph.louvain - gpubenchmark(louvain, graph) + benchmark(louvain, graph) -def bench_weakly_connected_components(gpubenchmark, graph): +def bench_weakly_connected_components(benchmark, graph): if is_graph_distributed(graph): pytest.skip("distributed graphs are not supported") if graph.is_directed(): G = graph.to_undirected() else: G = graph - gpubenchmark(cugraph.weakly_connected_components, G) + benchmark(cugraph.weakly_connected_components, G) -def bench_overlap(gpubenchmark, unweighted_graph): +def bench_overlap(benchmark, unweighted_graph): G = unweighted_graph # algo cannot compute neighbors on all nodes without running into OOM # this is why we will call sorensen on a subset of nodes vertex_pairs = get_vertex_pairs(G) overlap = dask_cugraph.overlap if is_graph_distributed(G) else cugraph.overlap - gpubenchmark(overlap, G, vertex_pairs) + benchmark(overlap, G, vertex_pairs) -def bench_triangle_count(gpubenchmark, graph): +def bench_triangle_count(benchmark, graph): tc = ( dask_cugraph.triangle_count if is_graph_distributed(graph) else cugraph.triangle_count ) - gpubenchmark(tc, graph) + 
benchmark(tc, graph) -def bench_spectralBalancedCutClustering(gpubenchmark, graph): +def bench_spectralBalancedCutClustering(benchmark, graph): if is_graph_distributed(graph): pytest.skip("distributed graphs are not supported") - gpubenchmark(cugraph.spectralBalancedCutClustering, graph, 2) + benchmark(cugraph.spectralBalancedCutClustering, graph, 2) @pytest.mark.skip(reason="Need to guarantee graph has weights, " "not doing that yet") -def bench_spectralModularityMaximizationClustering(gpubenchmark, graph): +def bench_spectralModularityMaximizationClustering(benchmark, graph): smmc = ( dask_cugraph.spectralModularityMaximizationClustering if is_graph_distributed(graph) else cugraph.spectralModularityMaximizationClustering ) - gpubenchmark(smmc, graph, 2) + benchmark(smmc, graph, 2) -def bench_graph_degree(gpubenchmark, graph): - gpubenchmark(graph.degree) +def bench_graph_degree(benchmark, graph): + benchmark(graph.degree) -def bench_graph_degrees(gpubenchmark, graph): +def bench_graph_degrees(benchmark, graph): if is_graph_distributed(graph): pytest.skip("distributed graphs are not supported") - gpubenchmark(graph.degrees) + benchmark(graph.degrees) -def bench_betweenness_centrality(gpubenchmark, graph): +def bench_betweenness_centrality(benchmark, graph): bc = ( dask_cugraph.betweenness_centrality if is_graph_distributed(graph) else cugraph.betweenness_centrality ) - gpubenchmark(bc, graph, k=10, random_state=123) + benchmark(bc, graph, k=10, random_state=123) -def bench_edge_betweenness_centrality(gpubenchmark, graph): +def bench_edge_betweenness_centrality(benchmark, graph): if is_graph_distributed(graph): pytest.skip("distributed graphs are not supported") - gpubenchmark(cugraph.edge_betweenness_centrality, graph, k=10, seed=123) + benchmark(cugraph.edge_betweenness_centrality, graph, k=10, seed=123) -def bench_uniform_neighbor_sample(gpubenchmark, graph): +def bench_uniform_neighbor_sample(benchmark, graph): uns = ( dask_cugraph.uniform_neighbor_sample if is_graph_distributed(graph) @@ -455,13 +431,13 @@ def bench_uniform_neighbor_sample(gpubenchmark, graph): start_list = start_list.compute() fanout_vals = [5, 5, 5] - gpubenchmark(uns, graph, start_list=start_list, fanout_vals=fanout_vals) + benchmark(uns, graph, start_list=start_list, fanout_vals=fanout_vals) -def bench_egonet(gpubenchmark, graph): +def bench_egonet(benchmark, graph): egonet = ( dask_cugraph.ego_graph if is_graph_distributed(graph) else cugraph.ego_graph ) n = 1 radius = 2 - gpubenchmark(egonet, graph, n, radius=radius) + benchmark(egonet, graph, n, radius=radius) diff --git a/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py b/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py index 083acdde2f4..12d3046ddbc 100644 --- a/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py +++ b/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,18 +22,6 @@ import dask_cudf import rmm -# If the rapids-pytest-benchmark plugin is installed, the "gpubenchmark" -# fixture will be available automatically. Check that this fixture is available -# by trying to import rapids_pytest_benchmark, and if that fails, set -# "gpubenchmark" to the standard "benchmark" fixture provided by -# pytest-benchmark. 
-try: - import rapids_pytest_benchmark # noqa: F401 -except ImportError: - import pytest_benchmark - - gpubenchmark = pytest_benchmark.plugin.benchmark - from cugraph import ( MultiGraph, uniform_neighbor_sample, @@ -271,7 +259,7 @@ def uns_func(*args, **kwargs): "with_replacement", [False], ids=lambda v: f"with_replacement={v}" ) def bench_cugraph_uniform_neighbor_sample( - gpubenchmark, graph_objs, batch_size, fanout, with_replacement + benchmark, graph_objs, batch_size, fanout, with_replacement ): (G, num_verts, uniform_neighbor_sample_func) = graph_objs @@ -281,7 +269,7 @@ def bench_cugraph_uniform_neighbor_sample( ) # print(f"\n{uns_args}") # FIXME: uniform_neighbor_sample cannot take a np.ndarray for start_list - result = gpubenchmark( + result = benchmark( uniform_neighbor_sample_func, G, start_list=uns_args["start_list"], diff --git a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py index 2604642b748..66c34cc1276 100644 --- a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py +++ b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -36,9 +36,7 @@ def init_pytorch_worker(rank: int, use_rmm_torch_allocator: bool = False) -> None: import cupy import rmm - from pynvml.smi import nvidia_smi - smi = nvidia_smi.getInstance() pool_size = 16e9 # FIXME calculate this rmm.reinitialize( diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py index d6205901b68..72a530fc6fe 100644 --- a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py +++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -201,10 +201,6 @@ def train(self): ) logger.info(f"total time: {total_time_iter}") - # from pynvml.smi import nvidia_smi - # mem_info = nvidia_smi.getInstance().DeviceQuery('memory.free, memory.total')['gpu'][self.rank % 8]['fb_memory_usage'] - # logger.info(f"rank {self.rank} memory: {mem_info}") - y_true = data.y y_true = y_true.reshape((y_true.shape[0],)) x = data.x.to(torch.float32) diff --git a/benchmarks/dgl/pytest-based/dgl_benchmark.py b/benchmarks/dgl/pytest-based/dgl_benchmark.py index 456fa8fedc6..6b4585c84ac 100644 --- a/benchmarks/dgl/pytest-based/dgl_benchmark.py +++ b/benchmarks/dgl/pytest-based/dgl_benchmark.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -13,7 +13,6 @@ import pandas as pd import os -import pytest_benchmark import pytest import torch import dgl diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index 5a0a835c617..a2afb161ebc 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -51,9 +51,6 @@ /** @defgroup traversal_cpp C++ traversal algorithms */ -/** @defgroup labeling_cpp C++ labeling algorithms - */ - /** @defgroup linear_cpp C++ linear assignment algorithms */ @@ -63,7 +60,7 @@ /** @defgroup layout_cpp C++ layout algorithms */ -/** @defgroup component_cpp C++ component algorithms +/** @defgroup components_cpp C++ component algorithms */ /** @defgroup tree_cpp C++ tree algorithms @@ -127,7 +124,7 @@ void jaccard_list(legacy::GraphCSRView const& graph, WT* result); /** -.* @ingroup similarity_cpp + * @ingroup similarity_cpp * @brief Compute overlap coefficient for all vertices in the graph * * Computes the Overlap Coefficient for every pair of vertices in the graph which are diff --git a/cpp/include/cugraph/detail/collect_comm_wrapper.hpp b/cpp/include/cugraph/detail/collect_comm_wrapper.hpp index e547bdb3552..f2307705ac0 100644 --- a/cpp/include/cugraph/detail/collect_comm_wrapper.hpp +++ b/cpp/include/cugraph/detail/collect_comm_wrapper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,13 @@ namespace cugraph { namespace detail { +/** @defgroup collect_comm_wrapper_cpp C++ Shuffle Wrappers + */ + +/** @ingroup collect_comm_wrapper_cpp + * @{ + */ + /** * @brief Gather the span of data from all ranks and broadcast the combined data to all ranks. * @@ -42,3 +49,6 @@ rmm::device_uvector device_allgatherv(raft::handle_t const& handle, } // namespace detail } // namespace cugraph +/** + * @} + */ diff --git a/cpp/include/cugraph/detail/shuffle_wrappers.hpp b/cpp/include/cugraph/detail/shuffle_wrappers.hpp index 7dffcce298a..e0d8e7f0275 100644 --- a/cpp/include/cugraph/detail/shuffle_wrappers.hpp +++ b/cpp/include/cugraph/detail/shuffle_wrappers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,13 @@ namespace cugraph { namespace detail { +/** @defgroup shuffle_wrappers_cpp C++ Shuffle Wrappers + */ + +/** @ingroup shuffle_wrappers_cpp + * @{ + */ + /** * @brief Shuffle external (i.e. before renumbering) vertex pairs (which can be edge end points) to * their local GPUs based on edge partitioning. @@ -276,3 +283,7 @@ rmm::device_uvector collect_local_vertex_values_from_ext_vertex_value_p } // namespace detail } // namespace cugraph + +/** + * @} + */ diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 6a03b9a6454..e85959e164a 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,6 +30,9 @@ #include #include +/** @defgroup graph_functions_cpp C++ Graph Functions + */ + namespace cugraph { template @@ -51,6 +54,7 @@ struct renumber_meta_t }; /** + * @ingroup graph_functions_cpp * @brief renumber edgelist (multi-GPU) * * This function assumes that vertices are pre-shuffled to their target processes and edges are @@ -113,6 +117,7 @@ renumber_edgelist( bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief renumber edgelist (single-GPU) * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -151,6 +156,7 @@ renumber_edgelist(raft::handle_t const& handle, bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief Renumber external vertices to internal vertices based on the provided @p * renumber_map_labels. * @@ -182,6 +188,7 @@ void renumber_ext_vertices(raft::handle_t const& handle, bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief Unrenumber local internal vertices to external vertices based on the providied @p * renumber_map_labels. * @@ -213,6 +220,7 @@ void unrenumber_local_int_vertices( bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief Unrenumber (possibly non-local) internal vertices to external vertices based on the * providied @p renumber_map_labels. * @@ -241,6 +249,7 @@ void unrenumber_int_vertices(raft::handle_t const& handle, bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief Unrenumber local edges' internal source & destination IDs to external IDs based on the * provided @p renumber_map_labels (multi-GPU). * @@ -281,6 +290,7 @@ std::enable_if_t unrenumber_local_int_edges( bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief Unrenumber local edges' internal source & destination IDs to external IDs based on the * provided @p renumber_map_labels (single-GPU). * @@ -311,6 +321,7 @@ std::enable_if_t unrenumber_local_int_edges(raft::handle_t con bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief Renumber local external vertices to internal vertices based on the provided @p * renumber_map_labels. * @@ -341,6 +352,7 @@ void renumber_local_ext_vertices(raft::handle_t const& handle, vertex_t local_int_vertex_last, bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief Construct the edge list from the graph view object. * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -385,6 +397,7 @@ decompress_to_edgelist( bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief Symmetrize edgelist. * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -417,6 +430,7 @@ symmetrize_edgelist(raft::handle_t const& handle, bool reciprocal); /** + * @ingroup graph_functions_cpp * @brief Symmetrize the input graph. * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -459,6 +473,7 @@ symmetrize_graph( bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief Transpose the input graph. * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -498,6 +513,7 @@ transpose_graph( bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief Transpose the storage format (no change in an actual graph topology). * * In SG, convert between CSR and CSC.
In multi-GPU, currently convert between CSR + DCSR hybrid @@ -541,6 +557,7 @@ transpose_graph_storage( bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief Compute the coarsened graph. * * Aggregates the vertices with the same label to a new vertex in the output coarsened graph. @@ -588,6 +605,7 @@ coarsen_graph(raft::handle_t const& handle, bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief Relabel old labels to new labels. * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -621,6 +639,7 @@ void relabel(raft::handle_t const& handle, // FIXME: the first two elements of the returned tuple should be source & destination instead of // major & minor. Major & minor shouldn't be used in the non-detail public API. /** + * @ingroup graph_functions_cpp * @brief extract induced subgraph(s). * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -670,6 +689,7 @@ extract_induced_subgraphs( // implementation) to support different types (arithmetic types or thrust tuple of arithmetic types) // of edge properties. /** + * @ingroup graph_functions_cpp * @brief create a graph from (the optional vertex list and) the given edge list (with optional edge * IDs and types). * @@ -733,6 +753,7 @@ create_graph_from_edgelist(raft::handle_t const& handle, bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief create a graph from (the optional vertex list and) the given edge list (with optional edge * IDs and types). * @@ -799,6 +820,7 @@ create_graph_from_edgelist( bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief Find all 2-hop neighbors in the graph * * Find pairs of vertices in the input graph such that each pair is connected by @@ -825,6 +847,7 @@ std::tuple, rmm::device_uvector> get_two std::optional> start_vertices); /** + * @ingroup graph_functions_cpp * @brief Compute per-vertex incoming edge weight sums. * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -852,6 +875,7 @@ rmm::device_uvector compute_in_weight_sums( edge_property_view_t edge_weight_view); /** + * @ingroup graph_functions_cpp * @brief Compute per-vertex outgoing edge weight sums. * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -879,6 +903,7 @@ rmm::device_uvector compute_out_weight_sums( edge_property_view_t edge_weight_view); /** + * @ingroup graph_functions_cpp * @brief Compute maximum per-vertex incoming edge weight sums. * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -906,6 +931,7 @@ weight_t compute_max_in_weight_sum( edge_property_view_t edge_weight_view); /** + * @ingroup graph_functions_cpp * @brief Compute maximum per-vertex outgoing edge weight sums. * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -933,6 +959,7 @@ weight_t compute_max_out_weight_sum( edge_property_view_t edge_weight_view); /** + * @ingroup graph_functions_cpp * @brief Sum the weights of the entire set of edges. * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -959,6 +986,7 @@ weight_t compute_total_edge_weight( edge_property_view_t edge_weight_view); /** + * @ingroup graph_functions_cpp * @brief Select random vertices * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. 
@@ -991,6 +1019,7 @@ rmm::device_uvector select_random_vertices( bool do_expensive_check = false); /** + * @ingroup graph_functions_cpp * @brief Remove self loops from an edge list * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -1022,6 +1051,7 @@ remove_self_loops(raft::handle_t const& handle, std::optional>&& edgelist_edge_types); /** + * @ingroup graph_functions_cpp * @brief Remove all but one edge when a multi-edge exists. * * When a multi-edge exists, one of the edges will remain. If @p keep_min_value_edge is false, an @@ -1069,6 +1099,7 @@ remove_multi_edges(raft::handle_t const& handle, bool keep_min_value_edge = false); /** + * @ingroup graph_functions_cpp * @brief Shuffle external vertex ids to the proper GPU. * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -1083,6 +1114,7 @@ rmm::device_uvector shuffle_external_vertices(raft::handle_t const& ha rmm::device_uvector&& vertices); /** + * @ingroup graph_functions_cpp * @brief Shuffle external vertex ids and values to the proper GPU. * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -1102,6 +1134,7 @@ shuffle_external_vertex_value_pairs(raft::handle_t const& handle, rmm::device_uvector&& values); /** + * @ingroup graph_functions_cpp * @brief Shuffle external edges to the proper GPU. * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. diff --git a/cpp/include/cugraph/graph_generators.hpp b/cpp/include/cugraph/graph_generators.hpp index 5e8e97c51a2..7246f0b2fb7 100644 --- a/cpp/include/cugraph/graph_generators.hpp +++ b/cpp/include/cugraph/graph_generators.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,6 +24,13 @@ #include #include +/** @defgroup graph_generators_cpp C++ Graph Generators + */ + +/** @ingroup graph_generators_cpp + * @{ + */ + namespace cugraph { /** @@ -536,3 +543,7 @@ combine_edgelists(raft::handle_t const& handle, bool remove_multi_edges = true); } // namespace cugraph + +/** + * @} + */ diff --git a/cpp/include/cugraph/legacy/functions.hpp b/cpp/include/cugraph/legacy/functions.hpp index 51f05a6d26d..1e4156a68b3 100644 --- a/cpp/include/cugraph/legacy/functions.hpp +++ b/cpp/include/cugraph/legacy/functions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,6 +24,13 @@ namespace cugraph { +/** @defgroup legacy_functions_cpp C++ Shuffle Wrappers + */ + +/** @ingroup shuffle_wrappers_cpp + * @{ + */ + /** * @brief Convert COO to CSR * @@ -68,3 +75,7 @@ void comms_bcast(const raft::handle_t& handle, value_t* value, size_t count) } } // namespace cugraph + +/** + * @} + */ diff --git a/cpp/include/cugraph/legacy/graph.hpp b/cpp/include/cugraph/legacy/graph.hpp index 19cd5bbd6d0..18d57533d62 100644 --- a/cpp/include/cugraph/legacy/graph.hpp +++ b/cpp/include/cugraph/legacy/graph.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -48,6 +48,13 @@ enum class DegreeDirection { DEGREE_DIRECTION_COUNT }; +/** @defgroup legacy_graph_cpp C++ Legacy Graph + */ + +/** @ingroup legacy_graph_cpp + * @{ + */ + /** * @brief Base class graphs, all but vertices and edges * @@ -575,3 +582,7 @@ struct invalid_edge_id : invalid_idx {}; } // namespace cugraph #include "eidecl_graph.hpp" + +/** + * @} + */ diff --git a/cpp/include/cugraph/sampling_functions.hpp b/cpp/include/cugraph/sampling_functions.hpp index 981c42135f6..35c51c1ea6d 100644 --- a/cpp/include/cugraph/sampling_functions.hpp +++ b/cpp/include/cugraph/sampling_functions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,9 +27,13 @@ #include #include +/** @defgroup sampling_functions_cpp C++ Sampling Functions + */ + namespace cugraph { /** + * @ingroup sampling_functions_cpp * @brief Controls how we treat prior sources in sampling * * @param DEFAULT Add vertices encountered while sampling to the new frontier @@ -41,6 +45,7 @@ namespace cugraph { enum class prior_sources_behavior_t { DEFAULT = 0, CARRY_OVER, EXCLUDE }; /** + * @ingroup sampling_functions_cpp * @brief Uniform Neighborhood Sampling. * * @deprecated Replaced with homogeneous_uniform_neighbor_sample @@ -141,6 +146,7 @@ uniform_neighbor_sample( bool do_expensive_check = false); /** + * @ingroup sampling_functions_cpp * @brief Biased Neighborhood Sampling. * * @deprecated Replaced with homogeneous_biased_neighbor_sample @@ -273,6 +279,7 @@ struct sampling_flags_t { }; /** + * @ingroup sampling_functions_cpp * @brief Homogeneous Uniform Neighborhood Sampling. * * This function traverses from a set of starting vertices, traversing outgoing edges and @@ -347,6 +354,7 @@ homogeneous_uniform_neighbor_sample( bool do_expensive_check = false); /** + * @ingroup sampling_functions_cpp * @brief Homogeneous Biased Neighborhood Sampling. * * This function traverses from a set of starting vertices, traversing outgoing edges and @@ -428,6 +436,7 @@ homogeneous_biased_neighbor_sample( bool do_expensive_check = false); /** + * @ingroup sampling_functions_cpp * @brief Heterogeneous Uniform Neighborhood Sampling. * * This function traverses from a set of starting vertices, traversing outgoing edges and @@ -506,6 +515,7 @@ heterogeneous_uniform_neighbor_sample( bool do_expensive_check = false); /** + * @ingroup sampling_functions_cpp * @brief Heterogeneous Biased Neighborhood Sampling. * * This function traverses from a set of starting vertices, traversing outgoing edges and @@ -590,7 +600,8 @@ heterogeneous_biased_neighbor_sample( sampling_flags_t sampling_flags, bool do_expensive_check = false); -/* +/** + * @ingroup sampling_functions_cpp * @brief renumber sampled edge list and compress to the (D)CSR|(D)CSC format. * * This function renumbers sampling function (e.g. uniform_neighbor_sample) output edges fulfilling @@ -715,7 +726,8 @@ renumber_and_compress_sampled_edgelist( bool doubly_compress = false, bool do_expensive_check = false); -/* +/** + * @ingroup sampling_functions_cpp * @brief renumber sampled edge list and sort the renumbered edges. * * This function renumbers sampling function (e.g. 
uniform_neighbor_sample) output edges fulfilling @@ -815,7 +827,8 @@ renumber_and_sort_sampled_edgelist( bool src_is_major = true, bool do_expensive_check = false); -/* +/** + * @ingroup sampling_functions_cpp * @brief renumber sampled edge list (vertex & edge IDs) per vertex/edge type and sort the * renumbered edges. * @@ -957,7 +970,8 @@ heterogeneous_renumber_and_sort_sampled_edgelist( bool src_is_major = true, bool do_expensive_check = false); -/* +/** + * @ingroup sampling_functions_cpp * @brief sort sampled edge list. * * Sampled edges are sorted based on the following rules. @@ -1024,7 +1038,8 @@ sort_sampled_edgelist(raft::handle_t const& handle, size_t num_hops, bool src_is_major = true, bool do_expensive_check = false); -/* +/** + * @ingroup sampling_functions_cpp * @brief Build map to lookup source and destination using edge id and type * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. @@ -1047,7 +1062,8 @@ lookup_container_t build_edge_id_and_type_to_src_ edge_property_view_t edge_id_view, edge_property_view_t edge_type_view); -/* +/** + * @ingroup sampling_functions_cpp * @brief Lookup edge sources and destinations using edge ids and a single edge type. * Use this function to lookup endpoints of edges belonging to the same edge type. * @@ -1074,7 +1090,8 @@ lookup_endpoints_from_edge_ids_and_single_type( raft::device_span edge_ids_to_lookup, edge_type_t edge_type_to_lookup); -/* +/** + * @ingroup sampling_functions_cpp * @brief Lookup edge sources and destinations using edge ids and edge types. * Use this function to lookup endpoints of edges belonging to different edge types. * @@ -1104,6 +1121,7 @@ lookup_endpoints_from_edge_ids_and_types( raft::device_span edge_types_to_lookup); /** + * @ingroup sampling_functions_cpp * @brief Negative Sampling * * This function generates negative samples for graph. diff --git a/python/cugraph-service/pytest.ini b/python/cugraph-service/pytest.ini index f2ba9175f82..fedf8d286d4 100644 --- a/python/cugraph-service/pytest.ini +++ b/python/cugraph-service/pytest.ini @@ -17,8 +17,6 @@ addopts = --benchmark-warmup=off --benchmark-min-rounds=1 --benchmark-columns="min, max, mean, rounds" --tb=native - ## for use with rapids-pytest-benchmark plugin - #--benchmark-gpu-disable ## for use with pytest-cov plugin #--cov=cugraph #--cov-report term-missing:skip-covered diff --git a/python/cugraph/cugraph/tests/components/test_connectivity.py b/python/cugraph/cugraph/tests/components/test_connectivity.py index df45e055c5e..67684b386a4 100644 --- a/python/cugraph/cugraph/tests/components/test_connectivity.py +++ b/python/cugraph/cugraph/tests/components/test_connectivity.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -289,7 +289,7 @@ def single_dataset_nxresults_strong(request): # ============================================================================= @pytest.mark.sg @pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_DIR_INPUT_TYPES) -def test_weak_cc(gpubenchmark, dataset_nxresults_weak, cugraph_input_type): +def test_weak_cc(benchmark, dataset_nxresults_weak, cugraph_input_type): ( G, dataset_path, @@ -312,7 +312,7 @@ def test_weak_cc(gpubenchmark, dataset_nxresults_weak, cugraph_input_type): else: input_G_or_matrix = G cugraph_labels = cugraph_call( - gpubenchmark, cugraph.weakly_connected_components, input_G_or_matrix, directed + benchmark, cugraph.weakly_connected_components, input_G_or_matrix, directed ) # while cugraph returns a component label for each vertex; @@ -347,14 +347,14 @@ def test_weak_cc(gpubenchmark, dataset_nxresults_weak, cugraph_input_type): "cugraph_input_type", utils.NX_DIR_INPUT_TYPES + utils.MATRIX_INPUT_TYPES ) def test_weak_cc_nonnative_inputs( - gpubenchmark, single_dataset_nxresults_weak, cugraph_input_type + benchmark, single_dataset_nxresults_weak, cugraph_input_type ): - test_weak_cc(gpubenchmark, single_dataset_nxresults_weak, cugraph_input_type) + test_weak_cc(benchmark, single_dataset_nxresults_weak, cugraph_input_type) @pytest.mark.sg @pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_DIR_INPUT_TYPES) -def test_strong_cc(gpubenchmark, dataset_nxresults_strong, cugraph_input_type): +def test_strong_cc(benchmark, dataset_nxresults_strong, cugraph_input_type): # NetX returns a list of components, each component being a # collection (set{}) of vertex indices @@ -374,7 +374,7 @@ def test_strong_cc(gpubenchmark, dataset_nxresults_strong, cugraph_input_type): else: input_G_or_matrix = G cugraph_labels = cugraph_call( - gpubenchmark, cugraph.strongly_connected_components, input_G_or_matrix + benchmark, cugraph.strongly_connected_components, input_G_or_matrix ) if isinstance(cugraph_input_type, cugraph.Graph): @@ -413,9 +413,9 @@ def test_strong_cc(gpubenchmark, dataset_nxresults_strong, cugraph_input_type): "cugraph_input_type", utils.NX_DIR_INPUT_TYPES + utils.MATRIX_INPUT_TYPES ) def test_strong_cc_nonnative_inputs( - gpubenchmark, single_dataset_nxresults_strong, cugraph_input_type + benchmark, single_dataset_nxresults_strong, cugraph_input_type ): - test_strong_cc(gpubenchmark, single_dataset_nxresults_strong, cugraph_input_type) + test_strong_cc(benchmark, single_dataset_nxresults_strong, cugraph_input_type) @pytest.mark.sg diff --git a/python/cugraph/cugraph/tests/conftest.py b/python/cugraph/cugraph/tests/conftest.py index 101a4e6a192..79747f32eb7 100644 --- a/python/cugraph/cugraph/tests/conftest.py +++ b/python/cugraph/cugraph/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -17,7 +17,6 @@ stop_dask_client, ) -import os import tempfile # Avoid timeout during shutdown @@ -37,17 +36,6 @@ def pytest_sessionstart(session): # module-wide fixtures -# Spoof the gpubenchmark fixture if it's not available so that asvdb and -# rapids-pytest-benchmark do not need to be installed to run tests. 
-if "gpubenchmark" not in globals(): - - def benchmark_func(func, *args, **kwargs): - return func(*args, **kwargs) - - @pytest.fixture - def gpubenchmark(): - return benchmark_func - @pytest.fixture(scope="module") def dask_client(): @@ -82,9 +70,7 @@ def dask_client_non_p2p(): def scratch_dir(): # This should always be set if doing MG testing, since temporary # directories are only accessible from the current process. - tempdir_object = os.getenv( - "RAPIDS_PYTEST_SCRATCH_DIR", tempfile.TemporaryDirectory() - ) + tempdir_object = tempfile.TemporaryDirectory() if isinstance(tempdir_object, tempfile.TemporaryDirectory): yield tempdir_object.name diff --git a/python/cugraph/cugraph/tests/data_store/test_property_graph.py b/python/cugraph/cugraph/tests/data_store/test_property_graph.py index 50f08cdf3d0..b231fcaf0ba 100644 --- a/python/cugraph/cugraph/tests/data_store/test_property_graph.py +++ b/python/cugraph/cugraph/tests/data_store/test_property_graph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -27,18 +27,6 @@ from pylibcugraph.testing.utils import gen_fixture_params_product -# If the rapids-pytest-benchmark plugin is installed, the "gpubenchmark" -# fixture will be available automatically. Check that this fixture is available -# by trying to import rapids_pytest_benchmark, and if that fails, set -# "gpubenchmark" to the standard "benchmark" fixture provided by -# pytest-benchmark. -try: - import rapids_pytest_benchmark # noqa: F401 -except ImportError: - import pytest_benchmark - - gpubenchmark = pytest_benchmark.plugin.benchmark - # FIXME: remove when fully-migrated to pandas 1.5.0 try: # pandas 1.5.0 @@ -2513,19 +2501,19 @@ def test_types_from_numerals(): # ============================================================================= # Benchmarks # ============================================================================= -def bench_num_vertices(gpubenchmark, dataset1_PropertyGraph): +def bench_num_vertices(benchmark, dataset1_PropertyGraph): (pG, data) = dataset1_PropertyGraph - assert gpubenchmark(pG.get_num_vertices) == 9 + assert benchmark(pG.get_num_vertices) == 9 -def bench_get_vertices(gpubenchmark, dataset1_PropertyGraph): +def bench_get_vertices(benchmark, dataset1_PropertyGraph): (pG, data) = dataset1_PropertyGraph - gpubenchmark(pG.get_vertices) + benchmark(pG.get_vertices) -def bench_extract_subgraph_for_cyber(gpubenchmark, cyber_PropertyGraph): +def bench_extract_subgraph_for_cyber(benchmark, cyber_PropertyGraph): from cugraph.experimental import PropertyGraph pG = cyber_PropertyGraph @@ -2535,7 +2523,7 @@ def bench_extract_subgraph_for_cyber(gpubenchmark, cyber_PropertyGraph): # Create a Graph containing only specific src or dst vertices verts = ["10.40.182.3", "10.40.182.255", "59.166.0.9", "59.166.0.8"] selected_edges = pG.select_edges(f"{scn}.isin({verts}) | {dcn}.isin({verts})") - gpubenchmark( + benchmark( pG.extract_subgraph, create_using=cugraph.Graph(directed=True), selection=selected_edges, @@ -2545,7 +2533,7 @@ def bench_extract_subgraph_for_cyber(gpubenchmark, cyber_PropertyGraph): def bench_extract_subgraph_for_cyber_detect_duplicate_edges( - gpubenchmark, cyber_PropertyGraph + benchmark, cyber_PropertyGraph ): from cugraph.experimental import PropertyGraph @@ -2566,10 +2554,10 @@ def func(): 
check_multi_edges=True, ) - gpubenchmark(func) + benchmark(func) -def bench_extract_subgraph_for_rmat(gpubenchmark, rmat_PropertyGraph): +def bench_extract_subgraph_for_rmat(benchmark, rmat_PropertyGraph): from cugraph.experimental import PropertyGraph (pG, generated_df) = rmat_PropertyGraph @@ -2582,7 +2570,7 @@ def bench_extract_subgraph_for_rmat(gpubenchmark, rmat_PropertyGraph): verts = [int(generated_df["src"].iloc[i]) for i in range(0, 10000, 10)] selected_edges = pG.select_edges(f"{scn}.isin({verts}) | {dcn}.isin({verts})") - gpubenchmark( + benchmark( pG.extract_subgraph, create_using=cugraph.Graph(directed=True), selection=selected_edges, @@ -2593,7 +2581,7 @@ def bench_extract_subgraph_for_rmat(gpubenchmark, rmat_PropertyGraph): @pytest.mark.slow @pytest.mark.parametrize("n_rows", [15_000_000, 30_000_000, 60_000_000, 120_000_000]) -def bench_add_edge_data(gpubenchmark, n_rows): +def bench_add_edge_data(benchmark, n_rows): from cugraph.experimental import PropertyGraph def func(): @@ -2603,7 +2591,7 @@ def func(): df = cudf.DataFrame({"src": src, "dst": dst}) pg.add_edge_data(df, ["src", "dst"], type_name="('_N', '_E', '_N')") - gpubenchmark(func) + benchmark(func) # This test runs for *minutes* with the current implementation, and since @@ -2611,7 +2599,7 @@ def func(): # test can be ~20 minutes. @pytest.mark.slow def bench_extract_subgraph_for_rmat_detect_duplicate_edges( - gpubenchmark, rmat_PropertyGraph + benchmark, rmat_PropertyGraph ): from cugraph.experimental import PropertyGraph @@ -2635,12 +2623,12 @@ def func(): check_multi_edges=True, ) - gpubenchmark(func) + benchmark(func) @pytest.mark.slow @pytest.mark.parametrize("N", [1, 3, 10, 30]) -def bench_add_edges_cyber(gpubenchmark, N): +def bench_add_edges_cyber(benchmark, N): from cugraph.experimental import PropertyGraph # Partition the dataframe to add in chunks @@ -2655,13 +2643,13 @@ def func(): df = pG.get_edge_data() assert len(df) == len(cyber_df) - gpubenchmark(func) + benchmark(func) # @pytest.mark.slow @pytest.mark.parametrize("n_rows", [10_000, 100_000, 1_000_000, 10_000_000]) @pytest.mark.parametrize("n_feats", [32, 64, 128]) -def bench_add_vector_features(gpubenchmark, n_rows, n_feats): +def bench_add_vector_features(benchmark, n_rows, n_feats): from cugraph.experimental import PropertyGraph df = cudf.DataFrame( @@ -2681,7 +2669,7 @@ def func(): df, vertex_col_names=["src", "dst"], vector_properties=vector_properties ) - gpubenchmark(func) + benchmark(func) # @pytest.mark.slow diff --git a/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py b/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py index 350f5069f11..97a13bfdf04 100644 --- a/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py +++ b/python/cugraph/cugraph/tests/data_store/test_property_graph_mg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -28,18 +28,6 @@ from cugraph.dask.common.mg_utils import is_single_gpu from cugraph.datasets import cyber, netscience -# If the rapids-pytest-benchmark plugin is installed, the "gpubenchmark" -# fixture will be available automatically. Check that this fixture is available -# by trying to import rapids_pytest_benchmark, and if that fails, set -# "gpubenchmark" to the standard "benchmark" fixture provided by -# pytest-benchmark. 
-try: - import rapids_pytest_benchmark # noqa: F401 -except ImportError: - import pytest_benchmark - - gpubenchmark = pytest_benchmark.plugin.benchmark - def type_is_categorical(pG): return ( @@ -1517,7 +1505,7 @@ def test_renumber_by_type_only_default_type(dask_client): @pytest.mark.slow @pytest.mark.parametrize("N", [1, 3, 10, 30]) -def bench_add_edges_cyber(gpubenchmark, dask_client, N): +def bench_add_edges_cyber(benchmark, dask_client, N): from cugraph.experimental import MGPropertyGraph # Partition the dataframe to add in chunks @@ -1535,13 +1523,13 @@ def func(): df = mpG.get_edge_data().compute() assert len(df) == len(cyber_df) - gpubenchmark(func) + benchmark(func) @pytest.mark.slow @pytest.mark.parametrize("n_rows", [1_000_000]) @pytest.mark.parametrize("n_feats", [128]) -def bench_get_vector_features(gpubenchmark, dask_client, n_rows, n_feats): +def bench_get_vector_features(benchmark, dask_client, n_rows, n_feats): from cugraph.experimental import MGPropertyGraph df = cudf.DataFrame( @@ -1564,4 +1552,4 @@ def func(pG): df = pG.get_edge_data(edge_ids=cp.arange(0, 100_000)) df = df.compute() - gpubenchmark(func, pG) + benchmark(func, pG) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index ed3a796121c..2fd31fe53ba 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -218,10 +218,10 @@ def read_csv(request): @pytest.mark.sg @pytest.mark.parametrize("use_weight", [False, True]) -def test_jaccard(read_csv, gpubenchmark, use_weight): +def test_jaccard(read_csv, benchmark, use_weight): M_cu, M, graph_file = read_csv cu_src, cu_dst, cu_coeff = cugraph_call( - gpubenchmark, graph_file, input_df=M_cu, use_weight=use_weight + benchmark, graph_file, input_df=M_cu, use_weight=use_weight ) nx_src, nx_dst, nx_coeff = networkx_call(M) @@ -262,20 +262,20 @@ def test_directed_graph_check(read_csv, use_weight): @pytest.mark.sg -def test_nx_jaccard_time(read_csv, gpubenchmark): +def test_nx_jaccard_time(read_csv, benchmark): _, M, _ = read_csv - nx_src, nx_dst, nx_coeff = networkx_call(M, gpubenchmark) + nx_src, nx_dst, nx_coeff = networkx_call(M, benchmark) @pytest.mark.sg @pytest.mark.parametrize("graph_file", [netscience]) @pytest.mark.parametrize("use_weight", [False, True]) -def test_jaccard_edgevals(gpubenchmark, graph_file, use_weight): +def test_jaccard_edgevals(benchmark, graph_file, use_weight): dataset_path = netscience.get_path() M = utils.read_csv_for_nx(dataset_path) M_cu = utils.read_csv_file(dataset_path) cu_src, cu_dst, cu_coeff = cugraph_call( - gpubenchmark, netscience, input_df=M_cu, use_weight=use_weight + benchmark, netscience, input_df=M_cu, use_weight=use_weight ) if not use_weight: nx_src, nx_dst, nx_coeff = networkx_call(M) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py index f87fe06f691..21381603bf5 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+# Copyright (c) 2019-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -198,11 +198,11 @@ def extract_two_hop(read_csv): # Test @pytest.mark.sg @pytest.mark.parametrize("use_weight", [False, True]) -def test_overlap(gpubenchmark, read_csv, extract_two_hop, use_weight): +def test_overlap(benchmark, read_csv, extract_two_hop, use_weight): M, graph_file = read_csv pairs = extract_two_hop - cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs, use_weight=use_weight) + cu_coeff = cugraph_call(benchmark, graph_file, pairs, use_weight=use_weight) cpu_coeff = cpu_call(M, pairs[VERTEX_PAIR_FIRST_COL], pairs[VERTEX_PAIR_SECOND_COL]) compare_overlap(cu_coeff, cpu_coeff) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 5369398fa16..23749ef335b 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -223,10 +223,10 @@ def read_csv(request): @pytest.mark.sg @pytest.mark.parametrize("use_weight", [False, True]) -def test_sorensen(gpubenchmark, read_csv, use_weight): +def test_sorensen(benchmark, read_csv, use_weight): M_cu, M, graph_file = read_csv cu_src, cu_dst, cu_coeff = cugraph_call( - gpubenchmark, graph_file, input_df=M_cu, use_weight=use_weight + benchmark, graph_file, input_df=M_cu, use_weight=use_weight ) nx_src, nx_dst, nx_coeff = networkx_call(M) @@ -244,9 +244,9 @@ def test_sorensen(gpubenchmark, read_csv, use_weight): @pytest.mark.sg -def test_nx_sorensen_time(gpubenchmark, read_csv): +def test_nx_sorensen_time(benchmark, read_csv): _, M, _ = read_csv - nx_src, nx_dst, nx_coeff = networkx_call(M, gpubenchmark) + nx_src, nx_dst, nx_coeff = networkx_call(M, benchmark) @pytest.mark.sg @@ -277,12 +277,12 @@ def test_directed_graph_check(read_csv, use_weight): @pytest.mark.parametrize("graph_file", [netscience]) @pytest.mark.parametrize("use_weight", [False, True]) @pytest.mark.skip(reason="Skipping because this datasets is unrenumbered") -def test_sorensen_edgevals(gpubenchmark, graph_file, use_weight): +def test_sorensen_edgevals(benchmark, graph_file, use_weight): dataset_path = netscience.get_path() M = utils.read_csv_for_nx(dataset_path) M_cu = utils.read_csv_file(dataset_path) cu_src, cu_dst, cu_coeff = cugraph_call( - gpubenchmark, netscience, input_df=M_cu, use_weight=use_weight + benchmark, netscience, input_df=M_cu, use_weight=use_weight ) nx_src, nx_dst, nx_coeff = networkx_call(M) diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py index 6343b0ff9f3..abded010478 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -31,18 +31,6 @@ from cugraph.datasets import email_Eu_core, small_tree from pylibcugraph.testing.utils import gen_fixture_params_product -# If the rapids-pytest-benchmark plugin is installed, the "gpubenchmark" -# fixture will be available automatically. Check that this fixture is available -# by trying to import rapids_pytest_benchmark, and if that fails, set -# "gpubenchmark" to the standard "benchmark" fixture provided by -# pytest-benchmark. -try: - import rapids_pytest_benchmark # noqa: F401 -except ImportError: - import pytest_benchmark - - gpubenchmark = pytest_benchmark.plugin.benchmark - # ============================================================================= # Pytest Setup / Teardown - called for each test function # ============================================================================= @@ -1255,7 +1243,7 @@ def test_uniform_neighbor_sample_dcsr_dcsc_local(): @pytest.mark.mg @pytest.mark.slow @pytest.mark.parametrize("n_samples", [1_000, 5_000, 10_000]) -def bench_uniform_neighbor_sample_email_eu_core(gpubenchmark, dask_client, n_samples): +def bench_uniform_neighbor_sample_email_eu_core(benchmark, dask_client, n_samples): input_data_path = email_Eu_core.get_path() chunksize = dcg.get_chunksize(input_data_path) @@ -1283,4 +1271,4 @@ def func(): _ = cugraph.dask.uniform_neighbor_sample(dg, start_list, [10]) del _ - gpubenchmark(func) + benchmark(func) diff --git a/python/cugraph/cugraph/tests/traversal/test_bfs.py b/python/cugraph/cugraph/tests/traversal/test_bfs.py index 164963848ad..8f4b6f00731 100644 --- a/python/cugraph/cugraph/tests/traversal/test_bfs.py +++ b/python/cugraph/cugraph/tests/traversal/test_bfs.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/python/cugraph/cugraph/tests/traversal/test_bfs.py b/python/cugraph/cugraph/tests/traversal/test_bfs.py
index 164963848ad..8f4b6f00731 100644
--- a/python/cugraph/cugraph/tests/traversal/test_bfs.py
+++ b/python/cugraph/cugraph/tests/traversal/test_bfs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -378,7 +378,7 @@ def single_dataset_goldenresults_startvertex_spc(
 # =============================================================================
 @pytest.mark.sg
 @pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_INPUT_TYPES)
-def test_bfs(gpubenchmark, dataset_goldenresults_startvertex_spc, cugraph_input_type):
+def test_bfs(benchmark, dataset_goldenresults_startvertex_spc, cugraph_input_type):
     """
     Test BFS traversal on random source with distance and predecessors
     """
@@ -400,16 +400,16 @@ def test_bfs(gpubenchmark, dataset_goldenresults_startvertex_spc, cugraph_input_
     else:
         G_or_matrix = G

-    compare_bfs(gpubenchmark, G_or_matrix, golden_values, start_vertex, depth_limit)
+    compare_bfs(benchmark, G_or_matrix, golden_values, start_vertex, depth_limit)


 @pytest.mark.sg
 @pytest.mark.parametrize("cugraph_input_type", utils.MATRIX_INPUT_TYPES)
 def test_bfs_nonnative_inputs_matrix(
-    gpubenchmark, single_dataset_goldenresults_startvertex_spc, cugraph_input_type
+    benchmark, single_dataset_goldenresults_startvertex_spc, cugraph_input_type
 ):
     test_bfs(
-        gpubenchmark, single_dataset_goldenresults_startvertex_spc, cugraph_input_type
+        benchmark, single_dataset_goldenresults_startvertex_spc, cugraph_input_type
     )


diff --git a/python/cugraph/cugraph/tests/traversal/test_sssp.py b/python/cugraph/cugraph/tests/traversal/test_sssp.py
index ceb6040275d..2dc8754507c 100644
--- a/python/cugraph/cugraph/tests/traversal/test_sssp.py
+++ b/python/cugraph/cugraph/tests/traversal/test_sssp.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -220,7 +220,7 @@ def single_dataset_source_goldenresults_weighted(request):
 # =============================================================================
 @pytest.mark.sg
 @pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_DIR_INPUT_TYPES)
-def test_sssp(gpubenchmark, dataset_source_goldenresults, cugraph_input_type):
+def test_sssp(benchmark, dataset_source_goldenresults, cugraph_input_type):
     # Extract the params generated from the fixture
     (G, dataset_path, _, source, golden_paths) = dataset_source_goldenresults

@@ -230,7 +230,7 @@ def test_sssp(gpubenchmark, dataset_source_goldenresults, cugraph_input_type):
         )
     else:
         input_G_or_matrix = G
-    cu_paths, max_val = cugraph_call(gpubenchmark, input_G_or_matrix, source)
+    cu_paths, max_val = cugraph_call(benchmark, input_G_or_matrix, source)

     # Calculating mismatch
     err = 0
@@ -255,7 +255,7 @@ def test_sssp(gpubenchmark, dataset_source_goldenresults, cugraph_input_type):
 @pytest.mark.sg
 @pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_DIR_INPUT_TYPES)
 def test_sssp_invalid_start(
-    gpubenchmark, dataset_source_goldenresults, cugraph_input_type
+    benchmark, dataset_source_goldenresults, cugraph_input_type
 ):
     (G, _, _, source, _) = dataset_source_goldenresults
     el = G.view_edge_list()
@@ -264,15 +264,15 @@ def test_sssp_invalid_start(
     source = newval

     with pytest.raises(ValueError):
-        cugraph_call(gpubenchmark, G, source)
+        cugraph_call(benchmark, G, source)


 @pytest.mark.sg
 @pytest.mark.parametrize("cugraph_input_type", utils.MATRIX_INPUT_TYPES)
 def test_sssp_nonnative_inputs_matrix(
-    gpubenchmark, single_dataset_source_goldenresults, cugraph_input_type
+    benchmark, single_dataset_source_goldenresults, cugraph_input_type
 ):
-    test_sssp(gpubenchmark, single_dataset_source_goldenresults, cugraph_input_type)
+    test_sssp(benchmark, single_dataset_source_goldenresults, cugraph_input_type)


 @pytest.mark.sg
@@ -319,14 +319,14 @@ def test_sssp_nonnative_inputs_graph(single_dataset_source_goldenresults, direct
 @pytest.mark.sg
 @pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_DIR_INPUT_TYPES)
 def test_sssp_edgevals(
-    gpubenchmark, dataset_source_goldenresults_weighted, cugraph_input_type
+    benchmark, dataset_source_goldenresults_weighted, cugraph_input_type
 ):
     # Extract the params generated from the fixture
     (G, _, _, source, golden_paths) = dataset_source_goldenresults_weighted

     input_G_or_matrix = G
     cu_paths, max_val = cugraph_call(
-        gpubenchmark, input_G_or_matrix, source, edgevals=True
+        benchmark, input_G_or_matrix, source, edgevals=True
     )

     # Calculating mismatch
@@ -357,10 +357,10 @@ def test_sssp_edgevals(
     "cugraph_input_type", utils.NX_DIR_INPUT_TYPES + utils.MATRIX_INPUT_TYPES
 )
 def test_sssp_edgevals_nonnative_inputs(
-    gpubenchmark, single_dataset_source_goldenresults_weighted, cugraph_input_type
+    benchmark, single_dataset_source_goldenresults_weighted, cugraph_input_type
 ):
     test_sssp_edgevals(
-        gpubenchmark, single_dataset_source_goldenresults_weighted, cugraph_input_type
+        benchmark, single_dataset_source_goldenresults_weighted, cugraph_input_type
     )


diff --git a/python/cugraph/pytest.ini b/python/cugraph/pytest.ini
index 7bdef2d9771..3aa4cc5680e 100644
--- a/python/cugraph/pytest.ini
+++ b/python/cugraph/pytest.ini
@@ -20,8 +20,6 @@ addopts =
     --tb=native
     ## do not run the slow tests/benchmarks by default
     -m "not slow"
-    ## for use with rapids-pytest-benchmark plugin
-    #--benchmark-gpu-disable
     ## for use with pytest-cov plugin
     #--cov=cugraph
     #--cov-report term-missing:skip-covered
diff --git a/python/pylibcugraph/pylibcugraph/tests/conftest.py b/python/pylibcugraph/pylibcugraph/tests/conftest.py
index 228147a6e9f..e85a1f62029 100644
--- a/python/pylibcugraph/pylibcugraph/tests/conftest.py
+++ b/python/pylibcugraph/pylibcugraph/tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -20,18 +20,6 @@
 from pylibcugraph.testing import utils


-# Spoof the gpubenchmark fixture if it's not available so that asvdb and
-# rapids-pytest-benchmark do not need to be installed to run tests.
-if "gpubenchmark" not in globals():
-
-    def benchmark_func(func, *args, **kwargs):
-        return func(*args, **kwargs)
-
-    @pytest.fixture
-    def gpubenchmark():
-        return benchmark_func
-
-
 # =============================================================================
 # Fixture parameters
 # =============================================================================
diff --git a/python/pylibcugraph/pylibcugraph/tests/test_uniform_neighbor_sample.py b/python/pylibcugraph/pylibcugraph/tests/test_uniform_neighbor_sample.py
index 4dafeb19032..ae3c31bd4db 100644
--- a/python/pylibcugraph/pylibcugraph/tests/test_uniform_neighbor_sample.py
+++ b/python/pylibcugraph/pylibcugraph/tests/test_uniform_neighbor_sample.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -183,7 +183,7 @@ def test_neighborhood_sampling_cudf(
     )


-def test_neighborhood_sampling_large_sg_graph(gpubenchmark):
+def test_neighborhood_sampling_large_sg_graph(benchmark):
     """
     Use a large SG graph and set input args accordingly to test/benchmark
     returning a large result.
@@ -221,7 +221,7 @@ def test_neighborhood_sampling_large_sg_graph(gpubenchmark):
     device = cp.cuda.Device(0)
     free_memory_before = device.mem_info[0]

-    result = gpubenchmark(
+    result = benchmark(
         uniform_neighbor_sample,
         resource_handle,
         sg,
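Editor's note (illustration, not part of the patch): with the spoofed fixture removed from conftest.py, the `benchmark` fixture exists only when pytest-benchmark is installed, so it is effectively a test dependency here. For long-running GPU cases like `test_neighborhood_sampling_large_sg_graph`, pytest-benchmark's pedantic mode can pin the number of rounds instead of relying on the time-based default. A hedged sketch, where the workload and the round counts are purely illustrative:

```python
# Sketch of benchmark.pedantic for an expensive, GPU-style workload; the
# sleep stands in for a real sampling call and the round counts are arbitrary.
import time


def expensive_sample():
    time.sleep(0.01)  # pretend this launches GPU work
    return 42


def test_expensive_sample(benchmark):
    # Pedantic mode runs a fixed number of warmup rounds, rounds, and
    # iterations rather than looping until a time threshold is met.
    benchmark.pedantic(expensive_sample, rounds=3, iterations=1, warmup_rounds=1)
```

Running with `pytest --benchmark-disable` turns the fixture into a single pass-through call, which keeps ordinary (non-benchmarking) test runs cheap.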
diff --git a/python/utils/asv_report.py b/python/utils/asv_report.py
deleted file mode 100644
index 75144f1cea4..00000000000
--- a/python/utils/asv_report.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import platform
-
-import psutil
-
-from asvdb import BenchmarkInfo, BenchmarkResult, ASVDb
-from utils import getCommitInfo, getRepoInfo
-
-
-def cugraph_update_asv(
-    asvDir,
-    datasetName,
-    algoRunResults,
-    cudaVer="",
-    pythonVer="",
-    osType="",
-    machineName="",
-    repo="",
-):
-    """
-    algoRunResults is a list of (algoName, exeTime) tuples
-    """
-    (commitHash, commitTime) = getCommitInfo()
-    (actualRepo, branch) = getRepoInfo()
-    repo = repo or actualRepo
-
-    db = ASVDb(asvDir, repo, [branch])
-
-    uname = platform.uname()
-
-    prefixDict = dict(
-        maxGpuUtil="gpuutil",
-        maxGpuMemUsed="gpumem",
-        exeTime="time",
-    )
-    unitsDict = dict(
-        maxGpuUtil="percent",
-        maxGpuMemUsed="bytes",
-        exeTime="seconds",
-    )
-
-    bInfo = BenchmarkInfo(
-        machineName=machineName or uname.machine,
-        cudaVer=cudaVer or "unknown",
-        osType=osType or "%s %s" % (uname.system, uname.release),
-        pythonVer=pythonVer or platform.python_version(),
-        commitHash=commitHash,
-        commitTime=commitTime,
-        gpuType="unknown",
-        cpuType=uname.processor,
-        arch=uname.machine,
-        ram="%d" % psutil.virtual_memory().total,
-    )
-
-    validKeys = set(list(prefixDict.keys()) + list(unitsDict.keys()))
-
-    for (funcName, metricsDict) in algoRunResults.items():
-        for (metricName, val) in metricsDict.items():
-            # If an invalid metricName is present (likely due to a benchmark
-            # run error), skip
-            if metricName in validKeys:
-                bResult = BenchmarkResult(
-                    funcName="%s_%s" % (funcName, prefixDict[metricName]),
-                    argNameValuePairs=[("dataset", datasetName)],
-                    result=val,
-                )
-                bResult.unit = unitsDict[metricName]
-                db.addResult(bInfo, bResult)
-
-
-if __name__ == "__main__":
-    # Test ASVDb with some mock data (that just so happens to be very similar
-    # to actual data)
-    # FIXME: consider breaking this out to a proper test_whatever.py file!
-    asvDir = "asv"
-
-    datasetName = "dolphins.csv"
-    algoRunResults = [
-        ("loadDataFile", 3.2228727098554373),
-        ("createGraph", 3.00713360495865345),
-        ("pagerank", 3.00899268127977848),
-        ("bfs", 3.004273353144526482),
-        ("sssp", 3.004624705761671066),
-        ("jaccard", 3.0025573652237653732),
-        ("louvain", 3.32631026208400726),
-        ("weakly_connected_components", 3.0034315641969442368),
-        ("overlap", 3.002147899940609932),
-        ("triangles", 3.2544921860098839),
-        ("spectralBalancedCutClustering", 3.03329935669898987),
-        ("spectralModularityMaximizationClustering", 3.011258183047175407),
-        ("renumber", 3.001620553433895111),
-        ("view_adj_list", 3.000927431508898735),
-        ("degree", 3.0016251634806394577),
-        ("degrees", None),
-    ]
-    cugraph_update_asv(
-        asvDir, datasetName, algoRunResults, machineName="MN", pythonVer="3.6"
-    )
-
-    # Same arg values (the "datasetName" is still named "dolphins.csv"), but
-    # different results - this should override just the results.
-    algoRunResults = [(a, r + 1) for (a, r) in algoRunResults]
-    cugraph_update_asv(
-        asvDir, datasetName, algoRunResults, machineName="MN", pythonVer="3.6"
-    )
-
-    # New arg values (changed "datasetName" to "dolphins2.csv") - this should
-    # create a new set or arg values and results.
-    datasetName = "dolphins2.csv"
-    cugraph_update_asv(
-        asvDir, datasetName, algoRunResults, machineName="MN", pythonVer="3.6"
-    )
diff --git a/python/utils/gpu_metric_poller.py b/python/utils/gpu_metric_poller.py
index 854552fb34f..dffbd259b0e 100755
--- a/python/utils/gpu_metric_poller.py
+++ b/python/utils/gpu_metric_poller.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -31,7 +31,7 @@
 import os
 import sys
 import threading
-from pynvml import smi
+import pynvml


 class GPUMetricPoller(threading.Thread):
@@ -91,18 +91,18 @@ def __runChildLoop(self, readFileNo, writeFileNo):
         childReadPipe = os.fdopen(readFileNo)
         childWritePipe = os.fdopen(writeFileNo, "w")

-        smi.nvmlInit()
+        pynvml.nvmlInit()
         # hack - get actual device ID somehow
-        devObj = smi.nvmlDeviceGetHandleByIndex(0)
-        memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
-        utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
+        devObj = pynvml.nvmlDeviceGetHandleByIndex(0)
+        memObj = pynvml.nvmlDeviceGetMemoryInfo(devObj)
+        utilObj = pynvml.nvmlDeviceGetUtilizationRates(devObj)
         initialMemUsed = memObj.used
         initialGpuUtil = utilObj.gpu

         controlStr = self.__waitForInput(childReadPipe)
         while True:
-            memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
-            utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
+            memObj = pynvml.nvmlDeviceGetMemoryInfo(devObj)
+            utilObj = pynvml.nvmlDeviceGetUtilizationRates(devObj)

             memUsed = memObj.used - initialMemUsed
             gpuUtil = utilObj.gpu - initialGpuUtil
@@ -113,7 +113,7 @@ def __runChildLoop(self, readFileNo, writeFileNo):
                 break
             controlStr = self.__waitForInput(childReadPipe)

-        smi.nvmlShutdown()
+        pynvml.nvmlShutdown()
         childReadPipe.close()
         childWritePipe.close()
@@ -147,34 +147,3 @@ def startGpuMetricPolling():
 def stopGpuMetricPolling(gpuPollObj):
     gpuPollObj.stop()
     gpuPollObj.join()  # consider using timeout and reporting errors
-
-
-"""
-smi.nvmlInit()
-# hack - get actual device ID somehow
-devObj = smi.nvmlDeviceGetHandleByIndex(0)
-memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
-utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
-initialMemUsed = memObj.used
-initialGpuUtil = utilObj.gpu
-
-while not self.__stop:
-    time.sleep(0.01)
-
-    memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
-    utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
-
-    memUsed = memObj.used - initialMemUsed
-    gpuUtil = utilObj.gpu - initialGpuUtil
-    if memUsed > self.maxGpuMemUsed:
-        self.maxGpuMemUsed = memUsed
-    if gpuUtil > self.maxGpuUtil:
-        self.maxGpuUtil = gpuUtil
-
-    smi.nvmlShutdown()
-"""
-
-
-# if __name__ == "__main__":
-#     sto=stopGpuMetricPolling
-#     po = startGpuMetricPolling()
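Editor's note (illustration, not part of the patch): the `gpu_metric_poller.py` hunks above swap the `pynvml.smi` wrapper for pynvml's module-level NVML bindings, presumably because the `smi` submodule is not available in current pynvml/nvidia-ml-py releases. A standalone sketch of that polling pattern, assuming pynvml (nvidia-ml-py) is installed and that GPU index 0 and the 0.1 s interval are acceptable defaults:

```python
# Minimal GPU polling sketch using module-level pynvml calls; the device
# index and poll interval are assumptions chosen for illustration.
import time

import pynvml


def poll_gpu(seconds=1.0, interval=0.1):
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        max_mem_used = 0
        max_util = 0
        deadline = time.time() + seconds
        while time.time() < deadline:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)          # .total/.free/.used in bytes
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)   # .gpu/.memory in percent
            max_mem_used = max(max_mem_used, mem.used)
            max_util = max(max_util, util.gpu)
            time.sleep(interval)
        return max_mem_used, max_util
    finally:
        pynvml.nvmlShutdown()


if __name__ == "__main__":
    mem_used, util = poll_gpu()
    print(f"peak memory used: {mem_used} bytes, peak utilization: {util}%")
```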
- from asv_report import cugraph_update_asv - - # special case: do not include the full path to the datasetName, since - # the leading parts are redundant and take up UI space. - datasetName = "/".join(args.file.split("/")[-3:]) - - cugraph_update_asv( - asvDir=args.update_asv_dir, - datasetName=datasetName, - algoRunResults=Benchmark.resultsDict, - cudaVer=args.report_cuda_ver, - pythonVer=args.report_python_ver, - osType=args.report_os_type, - machineName=args.report_machine_name, - ) diff --git a/python/utils/run_benchmarks.sh b/python/utils/run_benchmarks.sh index 4c3e7a288f1..6cb90d46ec8 100755 --- a/python/utils/run_benchmarks.sh +++ b/python/utils/run_benchmarks.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -21,11 +21,6 @@ DATASET_DIR=${DATASET_DIR:=${THISDIR}/../../datasets} MACHINE_NAME=${MACHINE_NAME:="mymachine"} CONDA=${CONDA:=conda} -# To output results for use with ASV, set -# ASV_OUTPUT_OPTION="--update_asv_dir=/asv/cugraph-e2e" (update /asv/cugraph-e2e -# to the desired results dir) -ASV_OUTPUT_OPTION=${ASV_OUTPUT_OPTION:=""} - ERROR=0 for ds in ${DATASET_DIR}/csv/undirected/*; do echo "================================ ${ds}" @@ -35,7 +30,6 @@ for ds in ${DATASET_DIR}/csv/undirected/*; do echo else python ${UTILS_DIR}/run_benchmarks.py \ - ${ASV_OUTPUT_OPTION} \ --report_cuda_ver=${CUDA_VERSION} \ --report_python_ver=${PYTHON_VERSION} \ --report_os_type=${LINUX_VERSION} \ @@ -57,7 +51,6 @@ for ds in ${DATASET_DIR}/csv/undirected/*; do \ ${ds} python ${UTILS_DIR}/run_benchmarks.py \ - ${ASV_OUTPUT_OPTION} \ --report_cuda_ver=${CUDA_VERSION} \ --report_python_ver=${PYTHON_VERSION} \ --report_os_type=${LINUX_VERSION} \ @@ -78,7 +71,6 @@ done for ds in ${DATASET_DIR}/csv/directed/*; do echo "================================ ${ds}" python ${UTILS_DIR}/run_benchmarks.py \ - ${ASV_OUTPUT_OPTION} \ --report_cuda_ver=${CUDA_VERSION} \ --report_python_ver=${PYTHON_VERSION} \ --report_os_type=${LINUX_VERSION} \ @@ -95,7 +87,6 @@ for ds in ${DATASET_DIR}/csv/directed/*; do \ ${ds} python ${UTILS_DIR}/run_benchmarks.py \ - ${ASV_OUTPUT_OPTION} \ --report_cuda_ver=${CUDA_VERSION} \ --report_python_ver=${PYTHON_VERSION} \ --report_os_type=${LINUX_VERSION} \