Skip to content

Commit

Permalink
Merge branch 'branch-25.02' into streams-final
Browse files Browse the repository at this point in the history
  • Loading branch information
shrshi authored Jan 29, 2025
2 parents cf8fc92 + 39bcd16 commit 77e3ae4
Show file tree
Hide file tree
Showing 66 changed files with 767 additions and 332 deletions.
13 changes: 2 additions & 11 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ ci:
autoupdate_branch: ""
autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate"
autoupdate_schedule: quarterly
skip: ["verify-alpha-spec", "nbqa-isort"]
skip: ["verify-alpha-spec"]
submodules: false

repos:
Expand Down Expand Up @@ -41,13 +41,6 @@ repos:
"python/cudf_polars/cudf_polars",
"python/dask_cudf/dask_cudf"]
pass_filenames: false
- repo: https://github.com/nbQA-dev/nbQA
rev: 1.9.1
hooks:
- id: nbqa-isort
# Use the cudf_kafka isort orderings in notebooks so that dask
# and RAPIDS packages have their own sections.
args: ["--settings-file=python/cudf_kafka/pyproject.toml"]
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v16.0.6
hooks:
Expand Down Expand Up @@ -153,13 +146,11 @@ repos:
^CHANGELOG.md$
)
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.8.0
rev: v0.9.3
hooks:
- id: ruff
args: ["--fix"]
files: python/.*$
- id: ruff-format
files: python/.*$
- repo: https://github.com/rapidsai/pre-commit-hooks
rev: v0.4.0
hooks:
Expand Down
26 changes: 18 additions & 8 deletions ci/cudf_pandas_scripts/fetch_pandas_versions.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,34 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.

import argparse

import requests
from packaging.version import Version
from packaging.specifiers import SpecifierSet
import argparse
from packaging.version import Version


def get_pandas_versions(pandas_range):
url = "https://pypi.org/pypi/pandas/json"
response = requests.get(url)
data = response.json()
versions = [Version(v) for v in data['releases']]
versions = [Version(v) for v in data["releases"]]
specifier = SpecifierSet(pandas_range.lstrip("pandas"))
matching_versions = [v for v in versions if v in specifier]
matching_minors = sorted(set(".".join((str(v.major), str(v.minor))) for v in matching_versions), key=Version)
matching_minors = sorted(
set(".".join((str(v.major), str(v.minor))) for v in matching_versions),
key=Version,
)
return matching_minors


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Filter pandas versions by prefix.")
parser.add_argument("pandas_range", type=str, help="The version prefix to filter by.")
parser = argparse.ArgumentParser(
description="Filter pandas versions by prefix."
)
parser.add_argument(
"pandas_range", type=str, help="The version prefix to filter by."
)
args = parser.parse_args()

versions = get_pandas_versions(args.pandas_range)
print(','.join(versions))
print(",".join(versions))
30 changes: 22 additions & 8 deletions ci/cudf_pandas_scripts/pandas-tests/job-summary.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

Expand Down Expand Up @@ -68,17 +68,27 @@ def emoji_failed(x):
pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index()
main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
total_usage = main_df["_slow_function_call"] + main_df["_fast_function_call"]
main_df["CPU Usage"] = ((main_df["_slow_function_call"] / total_usage) * 100.0).round(1)
main_df["GPU Usage"] = ((main_df["_fast_function_call"] / total_usage) * 100.0).round(1)
main_df["CPU Usage"] = (
(main_df["_slow_function_call"] / total_usage) * 100.0
).round(1)
main_df["GPU Usage"] = (
(main_df["_fast_function_call"] / total_usage) * 100.0
).round(1)

total_usage = pr_df["_slow_function_call"] + pr_df["_fast_function_call"]
pr_df["CPU Usage"] = ((pr_df["_slow_function_call"] / total_usage) * 100.0).round(1)
pr_df["GPU Usage"] = ((pr_df["_fast_function_call"] / total_usage) * 100.0).round(1)
pr_df["CPU Usage"] = (
(pr_df["_slow_function_call"] / total_usage) * 100.0
).round(1)
pr_df["GPU Usage"] = (
(pr_df["_fast_function_call"] / total_usage) * 100.0
).round(1)

cpu_usage_mean = pr_df["CPU Usage"].mean().round(2)
gpu_usage_mean = pr_df["GPU Usage"].mean().round(2)

gpu_usage_rate_change = abs(pr_df["GPU Usage"].mean() - main_df["GPU Usage"].mean())
gpu_usage_rate_change = abs(
pr_df["GPU Usage"].mean() - main_df["GPU Usage"].mean()
)
pr_df["CPU Usage"] = pr_df["CPU Usage"].fillna(0)
pr_df["GPU Usage"] = pr_df["GPU Usage"].fillna(0)
main_df["CPU Usage"] = main_df["CPU Usage"].fillna(0)
Expand All @@ -92,8 +102,12 @@ def emoji_failed(x):
pr_df["CPU Usage"] = pr_df["CPU Usage"].astype(str) + "%"
pr_df["GPU Usage"] = pr_df["GPU Usage"].astype(str) + "%"

pr_df = pr_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]]
diff_df = diff_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]]
pr_df = pr_df[
["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]
]
diff_df = diff_df[
["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]
]
diff_df.columns = diff_df.columns + "_diff"
diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed)
diff_df["failed_diff"] = diff_df["failed_diff"].map(emoji_failed)
Expand Down
34 changes: 18 additions & 16 deletions ci/utils/nbtestlog2junitxml.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
# Generate a junit-xml file from parsing a nbtest log

import re
from xml.etree.ElementTree import Element, ElementTree
from os import path
import string
from enum import Enum

from os import path
from xml.etree.ElementTree import Element, ElementTree

startingPatt = re.compile(r"^STARTING: ([\w\.\-]+)$")
skippingPatt = re.compile(r"^SKIPPING: ([\w\.\-]+)\s*(\(([\w\.\-\ \,]+)\))?\s*$")
skippingPatt = re.compile(
r"^SKIPPING: ([\w\.\-]+)\s*(\(([\w\.\-\ \,]+)\))?\s*$"
)
exitCodePatt = re.compile(r"^EXIT CODE: (\d+)$")
folderPatt = re.compile(r"^FOLDER: ([\w\.\-]+)$")
timePatt = re.compile(r"^real\s+([\d\.ms]+)$")
Expand Down Expand Up @@ -37,12 +38,8 @@ def makeFailureElement(outputLines):


def setFileNameAttr(attrDict, fileName):
attrDict.update(file=fileName,
classname="",
line="",
name="",
time=""
)
attrDict.update(file=fileName, classname="", line="", name="", time="")


def setClassNameAttr(attrDict, className):
attrDict["classname"] = className
Expand Down Expand Up @@ -76,11 +73,12 @@ def parseLog(logFile, testSuiteElement):
testSuiteElement.attrib["timestamp"] = ""

attrDict = {}
#setFileNameAttr(attrDict, logFile)
# setFileNameAttr(attrDict, logFile)
setFileNameAttr(attrDict, "nbtest")

parserStateEnum = Enum("parserStateEnum",
"newTest startingLine finishLine exitCode")
parserStateEnum = Enum(
"parserStateEnum", "newTest startingLine finishLine exitCode"
)
parserState = parserStateEnum.newTest

testOutput = ""
Expand All @@ -98,7 +96,9 @@ def parseLog(logFile, testSuiteElement):
setTimeAttr(attrDict, "0m0s")
skippedElement = makeTestCaseElement(attrDict)
message = m.group(3) or ""
skippedElement.append(Element("skipped", message=message, type=""))
skippedElement.append(
Element("skipped", message=message, type="")
)
testSuiteElement.append(skippedElement)
incrNumAttr(testSuiteElement, "skipped")
incrNumAttr(testSuiteElement, "tests")
Expand Down Expand Up @@ -160,4 +160,6 @@ def parseLog(logFile, testSuiteElement):
testSuiteElement = Element("testsuite", name="nbtest", hostname="")
parseLog(sys.argv[1], testSuiteElement)
testSuitesElement.append(testSuiteElement)
ElementTree(testSuitesElement).write(sys.argv[1]+".xml", xml_declaration=True)
ElementTree(testSuitesElement).write(
sys.argv[1] + ".xml", xml_declaration=True
)
5 changes: 4 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,10 @@ ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
# ##################################################################################################
# * rolling benchmark
# ---------------------------------------------------------------------------------
ConfigureNVBench(ROLLING_NVBENCH rolling/grouped_rolling_sum.cpp rolling/rolling_sum.cpp)
ConfigureNVBench(
ROLLING_NVBENCH rolling/grouped_range_rolling_sum.cu rolling/grouped_rolling_sum.cpp
rolling/range_rolling_sum.cu rolling/rolling_sum.cpp
)

add_custom_target(
run_benchmarks
Expand Down
11 changes: 5 additions & 6 deletions cpp/benchmarks/io/parquet/parquet_reader_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,6 @@ void BM_parquet_read_data(nvbench::state& state,
void BM_parquet_read_long_strings(nvbench::state& state)
{
auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
auto const run_length = static_cast<cudf::size_type>(state.get_int64("run_length"));

auto const d_type = get_type_or_group(static_cast<int32_t>(data_type::STRING));
auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
Expand All @@ -106,14 +105,15 @@ void BM_parquet_read_long_strings(nvbench::state& state)

auto const avg_string_length = static_cast<cudf::size_type>(state.get_int64("avg_string_length"));
// corresponds to 3 sigma (full width 6 sigma: 99.7% of range)
auto const half_width = static_cast<cudf::size_type>(state.get_int64("half_width_string_length"));
auto const half_width =
avg_string_length >> 3; // 32 +/- 4, 128 +/- 16, 1024 +/- 128, 8k +/- 1k, etc.
auto const length_min = avg_string_length - half_width;
auto const length_max = avg_string_length + half_width;

data_profile profile =
data_profile_builder()
.cardinality(cardinality)
.avg_run_length(run_length)
.avg_run_length(1)
.distribution(data_type::STRING, distribution_id::NORMAL, length_min, length_max);

auto const num_rows_written = [&]() {
Expand Down Expand Up @@ -414,6 +414,5 @@ NVBENCH_BENCH(BM_parquet_read_long_strings)
.add_string_axis("io_type", {"DEVICE_BUFFER"})
.set_min_samples(4)
.add_int64_axis("cardinality", {0, 1000})
.add_int64_axis("run_length", {1, 32})
.add_int64_axis("avg_string_length", {16, 48, 96})
.add_int64_axis("half_width_string_length", {16, 32, 64}); // length = avg +/- half_width
.add_int64_power_of_two_axis("avg_string_length",
nvbench::range(4, 16, 2)); // 16, 64, ... -> 64k
135 changes: 135 additions & 0 deletions cpp/benchmarks/rolling/grouped_range_rolling_sum.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>

#include <cudf/aggregation.hpp>
#include <cudf/binaryop.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/rolling.hpp>
#include <cudf/rolling/range_window_bounds.hpp>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/sorting.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/tabulate.h>

#include <nvbench/nvbench.cuh>

#include <algorithm>
#include <cstdint>
#include <optional>

/**
 * Benchmark cudf::grouped_range_rolling_window with a SUM aggregation.
 *
 * Builds `num_rows` random int32 values, a sorted grouping-key column with
 * `cardinality` distinct keys, and an orderby column of rows spaced ~1000
 * units apart (plus normal noise, optionally with nulls). Window extents are
 * taken from the "preceding_range"/"following_range" axes, scaled by the
 * 1000-unit row spacing so the axis value approximates the number of rows in
 * each window.
 */
void bench_grouped_range_rolling_sum(nvbench::state& state)
{
  auto const num_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
  // Configurable parameter is window range.
  // Since orderby column is approximately equally spaced at unit
  // intervals, this approximately controls the number of entries in
  // the window.
  auto const preceding_range = cudf::numeric_scalar<cudf::size_type>{
    static_cast<cudf::size_type>(state.get_int64("preceding_range") * 1000), true};
  // BUG FIX: this previously read the "preceding_range" axis, making the
  // "following_range" axis a silent no-op — e.g. the large-windows variant
  // requests following_range == 0 but was benchmarked with the preceding value.
  auto const following_range = cudf::numeric_scalar<cudf::size_type>{
    static_cast<cudf::size_type>(state.get_int64("following_range") * 1000), true};
  auto const has_nulls = static_cast<bool>(state.get_int64("has_nulls"));

  // Aggregation values: uniform int32 in [0, 100], no nulls.
  auto vals = [&] {
    data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
      cudf::type_to_id<std::int32_t>(), distribution_id::UNIFORM, 0, 100);
    return create_random_column(cudf::type_to_id<std::int32_t>(), row_count{num_rows}, profile);
  }();
  // Grouping keys: `cardinality` distinct values, sorted so groups are contiguous.
  auto const keys = [&] {
    data_profile const profile =
      data_profile_builder()
        .cardinality(cardinality)
        .no_validity()
        .distribution(cudf::type_to_id<cudf::size_type>(), distribution_id::UNIFORM, 0, num_rows);
    auto keys =
      create_random_column(cudf::type_to_id<cudf::size_type>(), row_count{num_rows}, profile);
    return cudf::sort(cudf::table_view{{keys->view()}});
  }();
  auto orderby = [&] {
    auto seq =
      cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()}, num_rows);
    // Equally spaced rows separated by 1000 unit intervals
    thrust::tabulate(
      rmm::exec_policy(cudf::get_default_stream()),
      seq->mutable_view().begin<cudf::size_type>(),
      seq->mutable_view().end<cudf::size_type>(),
      [] __device__(cudf::size_type i) { return static_cast<cudf::size_type>(i) * 1000; });
    // Add some units of noise
    // NOTE(review): the distribution is registered for duration_ms but the
    // column is created with size_type — confirm the NORMAL(-2000, 2000) range
    // actually applies to the generated column.
    data_profile profile = data_profile_builder().cardinality(0).distribution(
      cudf::type_to_id<cudf::duration_ms>(), distribution_id::NORMAL, -2000, 2000);
    // Roughly 400 nulls regardless of num_rows when has_nulls is requested.
    profile.set_null_probability(has_nulls ? std::optional<double>{400.0 / num_rows}
                                           : std::nullopt);
    auto noise =
      create_random_column(cudf::type_to_id<cudf::size_type>(), row_count{num_rows}, profile);
    auto result =
      cudf::binary_operation(seq->view(), noise->view(), cudf::binary_operator::ADD, seq->type());
    // Sort orderby within (key, orderby) order so windows are well-defined;
    // nulls sort after non-null values.
    auto columns = cudf::sort_by_key(cudf::table_view{{result->view()}},
                                     cudf::table_view{{keys->get_column(0).view(), result->view()}},
                                     {cudf::order::ASCENDING, cudf::order::ASCENDING},
                                     {cudf::null_order::AFTER, cudf::null_order::AFTER})
                     ->release();
    return std::move(columns[0]);
  }();

  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();

  auto const mem_stats_logger = cudf::memory_stats_logger();
  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    auto const result =
      cudf::grouped_range_rolling_window(keys->view(),
                                         orderby->view(),
                                         cudf::order::ASCENDING,
                                         vals->view(),
                                         cudf::range_window_bounds::get(preceding_range),
                                         cudf::range_window_bounds::get(following_range),
                                         1,
                                         *req);
  });
  // Report throughput and peak memory alongside the timing.
  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
  state.add_buffer_size(
    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
}

// Main sweep: symmetric windows (preceding == following == 100 row-units)
// across row counts 2^14..2^28, with and without nulls, over four key
// cardinalities.
NVBENCH_BENCH(bench_grouped_range_rolling_sum)
  .set_name("range_grouped_rolling_sum")
  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
  .add_int64_axis("preceding_range", {100})
  .add_int64_axis("following_range", {100})
  .add_int64_axis("has_nulls", {0, 1})
  .add_int64_axis("cardinality", {10, 100, 1'000'000, 100'000'000});

// Large-window variant: preceding-only windows (following_range == 0) of 10k
// and 40k row-units at the largest row count, fixed cardinality 100.
NVBENCH_BENCH(bench_grouped_range_rolling_sum)
  .set_name("range_grouped_rolling_sum_large_windows")
  .add_int64_power_of_two_axis("num_rows", {28})
  .add_int64_axis("preceding_range", {10'000, 40'000})
  .add_int64_axis("following_range", {0})
  .add_int64_axis("has_nulls", {0, 1})
  .add_int64_axis("cardinality", {100});
Loading

0 comments on commit 77e3ae4

Please sign in to comment.