Merge branch 'branch-25.04' into cudf-polars-multi-groupby

rapidsai · Jan 29, 2025 · a7cd29f · a7cd29f
2 parents f5205bd + 43fc535
commit a7cd29f
Show file tree

Hide file tree

Showing 164 changed files with 2,039 additions and 833 deletions.
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -18,11 +18,11 @@ java/              @rapidsai/cudf-java-codeowners
 #CI code owners
 /.github/                @rapidsai/ci-codeowners
 /ci/                     @rapidsai/ci-codeowners
-/.pre-commit-config.yaml @rapidsai/ci-codeowners
 
 #packaging code owners
-/.devcontainer/    @rapidsai/packaging-codeowners
-/conda/            @rapidsai/packaging-codeowners
-/dependencies.yaml @rapidsai/packaging-codeowners
-/build.sh          @rapidsai/packaging-codeowners
-pyproject.toml     @rapidsai/packaging-codeowners
+/.pre-commit-config.yaml @rapidsai/packaging-codeowners
+/.devcontainer/          @rapidsai/packaging-codeowners
+/conda/                  @rapidsai/packaging-codeowners
+dependencies.yaml        @rapidsai/packaging-codeowners
+/build.sh                @rapidsai/packaging-codeowners
+pyproject.toml           @rapidsai/packaging-codeowners
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
@@ -90,7 +90,7 @@ jobs:
       package-name: libcudf
       package-type: cpp
   wheel-build-pylibcudf:
-    needs: [wheel-publish-libcudf]
+    needs: [wheel-build-libcudf]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04
     with:
@@ -111,7 +111,7 @@ jobs:
       package-name: pylibcudf
       package-type: python
   wheel-build-cudf:
-    needs: wheel-publish-pylibcudf
+    needs: wheel-build-pylibcudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04
     with:
@@ -132,7 +132,7 @@ jobs:
       package-name: cudf
       package-type: python
   wheel-build-dask-cudf:
-    needs: wheel-publish-cudf
+    needs: wheel-build-cudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04
     with:
@@ -155,7 +155,7 @@ jobs:
       package-name: dask_cudf
       package-type: python
   wheel-build-cudf-polars:
-    needs: wheel-publish-pylibcudf
+    needs: wheel-build-pylibcudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.04
     with:

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -41,13 +41,6 @@ repos:
                "python/cudf_polars/cudf_polars",
                "python/dask_cudf/dask_cudf"]
         pass_filenames: false
-  - repo: https://github.com/nbQA-dev/nbQA
-    rev: 1.9.1
-    hooks:
-      - id: nbqa-isort
-        # Use the cudf_kafka isort orderings in notebooks so that dask
-        # and RAPIDS packages have their own sections.
-        args: ["--settings-file=python/cudf_kafka/pyproject.toml"]
   - repo: https://github.com/pre-commit/mirrors-clang-format
     rev: v16.0.6
     hooks:
@@ -153,15 +146,13 @@ repos:
             ^CHANGELOG.md$
           )
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.8.0
+    rev: v0.9.3
     hooks:
       - id: ruff
         args: ["--fix"]
-        files: python/.*$
       - id: ruff-format
-        files: python/.*$
   - repo: https://github.com/rapidsai/pre-commit-hooks
-    rev: v0.4.0
+    rev: v0.5.0
     hooks:
       - id: verify-copyright
         exclude: |
@@ -172,11 +163,19 @@ repos:
             cpp/cmake/Modules/FindCUDAToolkit[.]cmake$
           )
       - id: verify-alpha-spec
+      - id: verify-codeowners
+        args: [--fix, --project-prefix=cudf]
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.16.0
+    rev: v1.17.0
     hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]
+  - repo: https://github.com/shellcheck-py/shellcheck-py
+    rev: v0.10.0.1
+    hooks:
+      - id: shellcheck
+        args: ["--severity=warning"]
+        files: ^ci/
 
 default_language_version:
       python: python3
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
@@ -3,8 +3,10 @@
 
 set -euo pipefail
 
-export RAPIDS_VERSION="$(rapids-version)"
-export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+RAPIDS_VERSION="$(rapids-version)"
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+export RAPIDS_VERSION
+export RAPIDS_VERSION_MAJOR_MINOR
 
 rapids-logger "Create test conda environment"
 . /opt/conda/etc/profile.d/conda.sh
@@ -33,15 +35,16 @@ rapids-mamba-retry install \
   "cudf=${RAPIDS_VERSION}" \
   "dask-cudf=${RAPIDS_VERSION}"
 
-export RAPIDS_DOCS_DIR="$(mktemp -d)"
+RAPIDS_DOCS_DIR="$(mktemp -d)"
+export RAPIDS_DOCS_DIR
 
 EXITCODE=0
 trap "EXITCODE=1" ERR
 set +e
 
 rapids-logger "Build CPP docs"
 pushd cpp/doxygen
-aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . || echo "Failed to download rmm Doxygen tag"
+aws s3 cp s3://rapidsai-docs/librmm/html/"${RAPIDS_VERSION_MAJOR_MINOR}"/rmm.tag . || echo "Failed to download rmm Doxygen tag"
 doxygen Doxyfile
 mkdir -p "${RAPIDS_DOCS_DIR}/libcudf/html"
 mv html/* "${RAPIDS_DOCS_DIR}/libcudf/html"

diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
@@ -1,11 +1,11 @@
 #!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 set -euo pipefail
 
 package_dir="python/cudf"
 
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")"
 
 # Downloads libcudf and pylibcudf wheels from this current build,
 # then ensures 'cudf' wheel builds always use the 'libcudf' and 'pylibcudf' just built in the same CI run.

diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 set -euo pipefail
 
@@ -8,5 +8,5 @@ package_dir="python/cudf_polars"
 ./ci/build_wheel.sh cudf-polars ${package_dir}
 ./ci/validate_wheel.sh ${package_dir} dist
 
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")"
 RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist
diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 set -euo pipefail
 
@@ -8,5 +8,5 @@ package_dir="python/dask_cudf"
 ./ci/build_wheel.sh dask-cudf ${package_dir}
 ./ci/validate_wheel.sh ${package_dir} dist
 
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")"
 RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist
diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
@@ -6,7 +6,7 @@ set -euo pipefail
 package_name="libcudf"
 package_dir="python/libcudf"
 
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")"
 
 rapids-logger "Generating build requirements"
 

diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh
@@ -1,11 +1,11 @@
 #!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 set -euo pipefail
 
 package_dir="python/pylibcudf"
 
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")"
 
 # Downloads libcudf wheel from this current build,
 # then ensures 'pylibcudf' wheel builds always use the 'libcudf' just built in the same CI run.

diff --git a/ci/check_style.sh b/ci/check_style.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 
 set -euo pipefail
 
@@ -20,8 +20,8 @@ RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
 
 FORMAT_FILE_URL="https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION_MAJOR_MINOR}/cmake-format-rapids-cmake.json"
 export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json
-mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE})
-wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL}
+mkdir -p "$(dirname "${RAPIDS_CMAKE_FORMAT_FILE}")"
+wget -O ${RAPIDS_CMAKE_FORMAT_FILE} "${FORMAT_FILE_URL}"
 
 # Run pre-commit checks
 pre-commit run --all-files --show-diff-on-failure
diff --git a/ci/checks/doxygen.sh b/ci/checks/doxygen.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.
 ###############################
 # cuDF doxygen warnings check #
 ###############################
@@ -14,25 +14,27 @@ fi
 function version { echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4); }'; }
 
 # doxygen supported version 1.9.1
-DOXYGEN_VERSION=`doxygen --version`
-if [ ! $(version "$DOXYGEN_VERSION") -eq $(version "1.9.1") ] ; then
+DOXYGEN_VERSION=$(doxygen --version)
+if [ ! "$(version "$DOXYGEN_VERSION")" -eq "$(version "1.9.1")" ] ; then
   echo -e "warning: Unsupported doxygen version $DOXYGEN_VERSION"
   echo -e "Expecting doxygen version 1.9.1"
   exit 0
 fi
 
 # Set variables for doxygen
 # We can't use gha-tools' rapids-version and rapids-version-major-minor here because this script can run outside of CI
-export RAPIDS_VERSION="$(sed -E -e "s/^([0-9]{2})\.([0-9]{2})\.([0-9]{2}).*$/\1.\2.\3/" VERSION)"
-export RAPIDS_VERSION_MAJOR_MINOR="$(sed -E -e "s/^([0-9]{2})\.([0-9]{2})\.([0-9]{2}).*$/\1.\2/" VERSION)"
+RAPIDS_VERSION="$(sed -E -e "s/^([0-9]{2})\.([0-9]{2})\.([0-9]{2}).*$/\1.\2.\3/" VERSION)"
+RAPIDS_VERSION_MAJOR_MINOR="$(sed -E -e "s/^([0-9]{2})\.([0-9]{2})\.([0-9]{2}).*$/\1.\2/" VERSION)"
+export RAPIDS_VERSION
+export RAPIDS_VERSION_MAJOR_MINOR
 
 # Run doxygen, ignore missing tag files error
 TAG_ERROR1="error: Tag file '.*.tag' does not exist or is not a file. Skipping it..."
 TAG_ERROR2="error: cannot open tag file .*.tag for writing"
-DOXYGEN_STDERR=`cd cpp/doxygen && { cat Doxyfile ; echo QUIET = YES; echo GENERATE_HTML = NO; }  | doxygen - 2>&1 | sed "/\($TAG_ERROR1\|$TAG_ERROR2\)/d"`
+DOXYGEN_STDERR=$(cd cpp/doxygen && { cat Doxyfile ; echo QUIET = YES; echo GENERATE_HTML = NO; }  | doxygen - 2>&1 | sed "/\($TAG_ERROR1\|$TAG_ERROR2\)/d")
 RETVAL=$?
 
-if [ "$RETVAL" != "0" ] || [ ! -z "$DOXYGEN_STDERR" ]; then
+if [ "$RETVAL" != "0" ] || [ -n "$DOXYGEN_STDERR" ]; then
   echo -e "$DOXYGEN_STDERR"
   RETVAL=1 #because return value is not generated by doxygen 1.8.20
 fi

diff --git a/ci/cpp_linters.sh b/ci/cpp_linters.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 set -euo pipefail
 
@@ -20,8 +20,6 @@ set +u
 conda activate clang_tidy
 set -u
 
-RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
-
 source rapids-configure-sccache
 
 # Run the build via CMake, which will run clang-tidy when CUDF_STATIC_LINTERS is enabled.

diff --git a/ci/cudf_pandas_scripts/fetch_pandas_versions.py b/ci/cudf_pandas_scripts/fetch_pandas_versions.py
@@ -1,24 +1,34 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
+
+import argparse
 
 import requests
-from packaging.version import Version
 from packaging.specifiers import SpecifierSet
-import argparse
+from packaging.version import Version
+
 
 def get_pandas_versions(pandas_range):
     url = "https://pypi.org/pypi/pandas/json"
     response = requests.get(url)
     data = response.json()
-    versions = [Version(v) for v in data['releases']]
+    versions = [Version(v) for v in data["releases"]]
     specifier = SpecifierSet(pandas_range.lstrip("pandas"))
     matching_versions = [v for v in versions if v in specifier]
-    matching_minors = sorted(set(".".join((str(v.major), str(v.minor))) for v in matching_versions), key=Version)
+    matching_minors = sorted(
+        set(".".join((str(v.major), str(v.minor))) for v in matching_versions),
+        key=Version,
+    )
     return matching_minors
 
+
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Filter pandas versions by prefix.")
-    parser.add_argument("pandas_range", type=str, help="The version prefix to filter by.")
+    parser = argparse.ArgumentParser(
+        description="Filter pandas versions by prefix."
+    )
+    parser.add_argument(
+        "pandas_range", type=str, help="The version prefix to filter by."
+    )
     args = parser.parse_args()
 
     versions = get_pandas_versions(args.pandas_range)
-    print(','.join(versions))
+    print(",".join(versions))
diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -13,7 +13,6 @@ rapids-logger "Github job name: ${GH_JOB_NAME}"
 rapids-logger "Rapids version: ${RAPIDS_FULL_VERSION}"
 
 PY_VER="310"
-MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json
 PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json
 
 rapids-logger "Fetching latest available results from nightly"
@@ -22,7 +21,7 @@ COMPARE_ENV=$(tail -n 1 s3_output.txt)
 rapids-logger "Latest available results from nightly: ${COMPARE_ENV}"
 
 aws s3 cp "s3://rapids-downloads/${COMPARE_ENV}" main-results.json
-aws s3 cp $PR_ARTIFACT pr-results.json
+aws s3 cp "$PR_ARTIFACT" pr-results.json
 
 # Compute the diff and prepare job summary:
 python -m pip install pandas tabulate

diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -68,17 +68,27 @@ def emoji_failed(x):
 pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index()
 main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
 total_usage = main_df["_slow_function_call"] + main_df["_fast_function_call"]
-main_df["CPU Usage"] = ((main_df["_slow_function_call"] / total_usage) * 100.0).round(1)
-main_df["GPU Usage"] = ((main_df["_fast_function_call"] / total_usage) * 100.0).round(1)
+main_df["CPU Usage"] = (
+    (main_df["_slow_function_call"] / total_usage) * 100.0
+).round(1)
+main_df["GPU Usage"] = (
+    (main_df["_fast_function_call"] / total_usage) * 100.0
+).round(1)
 
 total_usage = pr_df["_slow_function_call"] + pr_df["_fast_function_call"]
-pr_df["CPU Usage"] = ((pr_df["_slow_function_call"] / total_usage) * 100.0).round(1)
-pr_df["GPU Usage"] = ((pr_df["_fast_function_call"] / total_usage) * 100.0).round(1)
+pr_df["CPU Usage"] = (
+    (pr_df["_slow_function_call"] / total_usage) * 100.0
+).round(1)
+pr_df["GPU Usage"] = (
+    (pr_df["_fast_function_call"] / total_usage) * 100.0
+).round(1)
 
 cpu_usage_mean = pr_df["CPU Usage"].mean().round(2)
 gpu_usage_mean = pr_df["GPU Usage"].mean().round(2)
 
-gpu_usage_rate_change = abs(pr_df["GPU Usage"].mean() - main_df["GPU Usage"].mean())
+gpu_usage_rate_change = abs(
+    pr_df["GPU Usage"].mean() - main_df["GPU Usage"].mean()
+)
 pr_df["CPU Usage"] = pr_df["CPU Usage"].fillna(0)
 pr_df["GPU Usage"] = pr_df["GPU Usage"].fillna(0)
 main_df["CPU Usage"] = main_df["CPU Usage"].fillna(0)
@@ -92,8 +102,12 @@ def emoji_failed(x):
 pr_df["CPU Usage"] = pr_df["CPU Usage"].astype(str) + "%"
 pr_df["GPU Usage"] = pr_df["GPU Usage"].astype(str) + "%"
 
-pr_df = pr_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]]
-diff_df = diff_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]]
+pr_df = pr_df[
+    ["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]
+]
+diff_df = diff_df[
+    ["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]
+]
 diff_df.columns = diff_df.columns + "_diff"
 diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed)
 diff_df["failed_diff"] = diff_df["failed_diff"].map(emoji_failed)