From 0259f2f54281416b98377e8da09029994f2431dc Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 20 Jan 2025 12:35:35 -0800 Subject: [PATCH 01/35] Bump polars version to 1.20 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf-polars/meta.yaml | 2 +- dependencies.yaml | 2 +- .../cudf_polars/dsl/expressions/datetime.py | 3 ++- .../cudf_polars/dsl/expressions/string.py | 3 ++- python/cudf_polars/cudf_polars/dsl/ir.py | 2 ++ python/cudf_polars/cudf_polars/dsl/translate.py | 9 +++++---- python/cudf_polars/cudf_polars/utils/versions.py | 3 ++- python/cudf_polars/pyproject.toml | 2 +- python/cudf_polars/tests/test_join.py | 12 ++++++++---- python/cudf_polars/tests/test_union.py | 16 +--------------- 12 files changed, 27 insertions(+), 31 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index f4fd3dcfb77..a48d71b31ef 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -66,7 +66,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.18 +- polars>=1.11,<1.21 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<19.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index ec7ae3f0706..b31e0526453 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -64,7 +64,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.18 +- polars>=1.11,<1.21 - pre-commit - pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index 7a0005497df..d56dc84371f 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.11,<1.18 + - polars >=1.11,<1.21 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/dependencies.yaml b/dependencies.yaml index 25866e85a0b..8f5dae826ed 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -767,7 +767,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.11,<1.18 + - polars>=1.11,<1.21 run_cudf_polars_experimental: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index 0c3159c73d6..a145cd770f9 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 # TODO: remove need for this # ruff: noqa: D101 @@ -58,6 +58,7 @@ class Name(IntEnum): OrdinalDay = auto() Quarter = auto() ReplaceTimeZone = auto() + Replace = auto() Round = auto() Second = auto() Time = auto() diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 256840c1f3d..e51ac7977ce 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 # TODO: remove need for this # ruff: noqa: D101 @@ -57,6 +57,7 @@ class Name(IntEnum): LenBytes = auto() LenChars = auto() Lowercase = auto() + Normalize = auto() PadEnd = auto() PadStart = auto() Replace = auto() diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index fd56329a48e..fc4203ab9cc 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1257,7 +1257,9 @@ def do_evaluate( right: DataFrame, ) -> DataFrame: """Evaluate and return a dataframe.""" + how: str how, join_nulls, zlice, suffix, coalesce, _ = options + how = how.lower() if how == "cross": # Separate implementation, since cross_join returns the # result, not the gather maps diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 2138ac0c700..32c96226e70 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -84,7 +84,7 @@ def translate_ir(self, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # compatible changes (e.g. adding new nodes), major is bumped for # incompatible changes (e.g. renaming nodes). - if (version := self.visitor.version()) >= (4, 3): + if (version := self.visitor.version()) >= (5, 1): e = NotImplementedError( f"No support for polars IR {version=}" ) # pragma: no cover; no such version for now. @@ -308,7 +308,8 @@ def _( with set_node(translator.visitor, node.input_right): inp_right = translator.translate_ir(n=None) right_on = [translate_named_expr(translator, n=e) for e in node.right_on] - if (how := node.options[0]) in { + how: str | tuple = node.options[0] + if isinstance(how, str) and how.lower() in { "inner", "left", "right", @@ -319,8 +320,8 @@ def _( }: return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) else: - how, op1, op2 = how - if how != "ie_join": + how, op1, op2 = node.options[0] + if how not in {"ie_join", "IEJoin"}: raise NotImplementedError( f"Unsupported join type {how}" ) # pragma: no cover; asof joins not yet exposed diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index b08cede8f7f..2cc1c189408 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 """Version utilities so that cudf_polars supports a range of polars versions.""" @@ -16,6 +16,7 @@ POLARS_VERSION_LT_112 = POLARS_VERSION < parse("1.12") POLARS_VERSION_GT_112 = POLARS_VERSION > parse("1.12") POLARS_VERSION_LT_113 = POLARS_VERSION < parse("1.13") +POLARS_VERSION_LT_119 = POLARS_VERSION < parse("1.19") def _ensure_polars_version(): diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 9fb9bbf391e..b5ec6c7a0a9 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.11,<1.18", + "polars>=1.11,<1.21", "pylibcudf==25.2.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index f1f47bfb9f1..642bb420b5f 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -13,7 +13,11 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils.versions import POLARS_VERSION_LT_112, POLARS_VERSION_LT_113 +from cudf_polars.utils.versions import ( + POLARS_VERSION_LT_112, + POLARS_VERSION_LT_113, + POLARS_VERSION_LT_119, +) @pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"]) @@ -118,6 +122,7 @@ def test_cross_join(left, right, zlice): assert_gpu_result_equal(q) +@pytest.mark.xfail(POLARS_VERSION_LT_119, reason="Not supported until polars==1.19") @pytest.mark.parametrize( "left_on,right_on", [ @@ -125,10 +130,9 @@ def test_cross_join(left, right, zlice): (pl.lit(2, dtype=pl.Int64), pl.col("a")), ], ) -def test_join_literal_key_unsupported(left, right, left_on, right_on): +def test_join_literal_key(left, right, left_on, right_on): q = left.join(right, left_on=left_on, right_on=right_on, how="inner") - - assert_ir_translation_raises(q, NotImplementedError) + assert_gpu_result_equal(q) @pytest.mark.parametrize( diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index 865b95a7d91..de75900f8c0 100644 --- a/python/cudf_polars/tests/test_union.py +++ b/python/cudf_polars/tests/test_union.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -6,7 +6,6 @@ from cudf_polars.testing.asserts import ( assert_gpu_result_equal, - assert_ir_translation_raises, ) @@ -22,19 +21,6 @@ def test_union(): assert_gpu_result_equal(query) -def test_union_schema_mismatch_raises(): - ldf = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6, 7], - "b": [1, 1, 1, 1, 1, 1, 1], - } - ).lazy() - ldf2 = ldf.select(pl.col("a").cast(pl.Float32)) - query = pl.concat([ldf, ldf2], how="diagonal") - - assert_ir_translation_raises(query, NotImplementedError) - - def test_concat_vertical(): ldf = pl.LazyFrame( { From d851729b4f0c0ca71a4117d6dc4359c6931469be Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 21 Jan 2025 11:22:34 -0800 Subject: [PATCH 02/35] polars defaults to int32 literals --- .../cudf_polars/cudf_polars/dsl/translate.py | 31 +++++++++++++++++-- python/cudf_polars/tests/test_join.py | 10 ++++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 32c96226e70..4bf8c6b47ee 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -302,12 +302,39 @@ def _( # Join key dtypes are dependent on the schema of the left and # right inputs, so these must be translated with the relevant # input active. + def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal: + arrow_type = plc.interop.to_arrow(literal.dtype) + if arrow_type == pa.int32(): + new_arrow_type = pa.int64() + new_dtype = plc.interop.from_arrow(new_arrow_type) + new_value = pa.scalar(literal.value.as_py(), type=new_arrow_type) + return expr.Literal(new_dtype, new_value) + return literal + + def maybe_adjust_binop(e) -> None: + if not isinstance(e.value, expr.BinOp): + return + + left, right = e.value.children + + if isinstance(left, expr.Col) and isinstance(right, expr.Literal): + e.value.children = (left, adjust_literal_dtype(right)) + + elif isinstance(left, expr.Literal) and isinstance(right, expr.Col): + e.value.children = (adjust_literal_dtype(left), right) + + def translate_expr_and_maybe_fix_binop_args(translator, exprs): + translated = [translate_named_expr(translator, n=e) for e in exprs] + for t in translated: + maybe_adjust_binop(t) + return translated + with set_node(translator.visitor, node.input_left): inp_left = translator.translate_ir(n=None) - left_on = [translate_named_expr(translator, n=e) for e in node.left_on] + left_on = translate_expr_and_maybe_fix_binop_args(translator, node.left_on) with set_node(translator.visitor, node.input_right): inp_right = translator.translate_ir(n=None) - right_on = [translate_named_expr(translator, n=e) for e in node.right_on] + right_on = translate_expr_and_maybe_fix_binop_args(translator, node.right_on) how: str | tuple = node.options[0] if isinstance(how, str) and how.lower() in { "inner", diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 642bb420b5f..bf5d976716d 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -30,7 +30,7 @@ def how(request): return request.param -@pytest.fixture(params=[None, (1, 5), (1, None), (0, 2), (0, None)]) +@pytest.fixture(params=[(1, 5), (1, None), (0, 2), (0, None)]) def zlice(request): return request.param @@ -141,7 +141,13 @@ def test_join_literal_key(left, right, left_on, right_on): [pl.col("a") < pl.col("a_right")], [pl.col("a_right") <= pl.col("a") * 2], 
[pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], - [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], + pytest.param( + [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], + marks=pytest.mark.xfail( + not POLARS_VERSION_LT_119, + reason="https://github.com/pola-rs/polars/issues/20831", + ), + ), pytest.param( [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], marks=pytest.mark.xfail( From 4dac5d93034864ea7bdddd58b02819b0aa57816d Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 21 Jan 2025 11:35:28 -0800 Subject: [PATCH 03/35] xfail test --- python/cudf_polars/tests/test_union.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index de75900f8c0..d387e9ff344 100644 --- a/python/cudf_polars/tests/test_union.py +++ b/python/cudf_polars/tests/test_union.py @@ -2,11 +2,15 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import pytest + import polars as pl from cudf_polars.testing.asserts import ( assert_gpu_result_equal, + assert_ir_translation_raises, ) +from cudf_polars.utils.versions import POLARS_VERSION_LT_119 def test_union(): @@ -21,6 +25,20 @@ def test_union(): assert_gpu_result_equal(query) +@pytest.mark.xfail(not POLARS_VERSION_LT_119, reason="query now fails in polars>=1.19") +def test_union_schema_mismatch_raises(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + ldf2 = ldf.select(pl.col("a").cast(pl.Float32)) + query = pl.concat([ldf, ldf2], how="diagonal") + + assert_ir_translation_raises(query, NotImplementedError) + + def test_concat_vertical(): ldf = pl.LazyFrame( { From 0c93f05f54a3bb05614149bfa24bae7ae02a274c Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 21 Jan 2025 11:47:19 -0800 Subject: [PATCH 04/35] dont xfail test --- python/cudf_polars/cudf_polars/testing/plugin.py | 5 ++++- python/cudf_polars/cudf_polars/utils/versions.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index e453a8b89b9..c6ea844ba31 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -14,6 +14,8 @@ import polars +from cudf_polars.utils.versions import POLARS_VERSION_LT_120 + if TYPE_CHECKING: from collections.abc import Mapping @@ -196,7 +198,8 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", "tests/unit/io/test_spreadsheet.py::test_write_excel_bytes[calamine]": ( "Fails when fastexcel version >= 0.12.1. 
tracking issue: https://github.com/pola-rs/polars/issues/20698", - version.parse(fastexcel.__version__) >= version.parse("0.12.1"), + version.parse(fastexcel.__version__) >= version.parse("0.12.1") + and POLARS_VERSION_LT_120, ), } diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index 2cc1c189408..4699c08ba6b 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -17,6 +17,7 @@ POLARS_VERSION_GT_112 = POLARS_VERSION > parse("1.12") POLARS_VERSION_LT_113 = POLARS_VERSION < parse("1.13") POLARS_VERSION_LT_119 = POLARS_VERSION < parse("1.19") +POLARS_VERSION_LT_120 = POLARS_VERSION < parse("1.20") def _ensure_polars_version(): From 08db6c9fec7e546aadbd56136931d2c6fbc5fa8d Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 21 Jan 2025 14:58:02 -0800 Subject: [PATCH 05/35] address test failures --- python/cudf_polars/cudf_polars/dsl/ir.py | 5 ----- python/cudf_polars/cudf_polars/dsl/translate.py | 12 ++++++------ python/cudf_polars/cudf_polars/testing/plugin.py | 3 ++- python/cudf_polars/tests/test_join.py | 2 +- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index fc4203ab9cc..28879094590 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1142,11 +1142,6 @@ def __init__( # TODO: Implement maintain_order if options[5] != "none": raise NotImplementedError("maintain_order not implemented yet") - if any( - isinstance(e.value, expr.Literal) - for e in itertools.chain(self.left_on, self.right_on) - ): - raise NotImplementedError("Join with literal as join key.") @staticmethod @cache diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 4bf8c6b47ee..4e79779fa85 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -303,12 +303,12 @@ def _( # right inputs, so these must be translated with the relevant # input active. 
def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal: - arrow_type = plc.interop.to_arrow(literal.dtype) - if arrow_type == pa.int32(): - new_arrow_type = pa.int64() - new_dtype = plc.interop.from_arrow(new_arrow_type) - new_value = pa.scalar(literal.value.as_py(), type=new_arrow_type) - return expr.Literal(new_dtype, new_value) + if literal.dtype.id() == plc.types.TypeId.INT32: + plc_int64 = plc.types.DataType(plc.types.TypeId.INT64) + return expr.Literal( + plc_int64, + pa.scalar(literal.value.as_py(), type=plc.interop.to_arrow(plc_int64)), + ) return literal def maybe_adjust_binop(e) -> None: diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index c6ea844ba31..d87d90373f2 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -236,4 +236,5 @@ def pytest_collection_modifyitems( reason=EXPECTED_FAILURES[item.nodeid][0], ), ) - item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) + else: + item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index bf5d976716d..abb656d74ca 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -139,7 +139,7 @@ def test_join_literal_key(left, right, left_on, right_on): "conditions", [ [pl.col("a") < pl.col("a_right")], - [pl.col("a_right") <= pl.col("a") * 2], + [pl.col("a_right") <= pl.col("a") * 2, pl.col("a_right") <= 2 * pl.col("a")], [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], pytest.param( [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], From ef9964b55efff75875ccc2f804ca7ce0cda5a6cf Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 21 Jan 2025 16:02:29 -0800 Subject: [PATCH 06/35] clean up --- python/cudf_polars/cudf_polars/testing/plugin.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index d87d90373f2..2d916b45611 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -237,4 +237,6 @@ def pytest_collection_modifyitems( ), ) else: - item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) + item.add_marker( + pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid]) + ) From 26db35da6f0c78d4280a2f8d678ca85acb44403a Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 21 Jan 2025 21:19:42 -0800 Subject: [PATCH 07/35] xfail more tests --- python/cudf_polars/cudf_polars/dsl/ir.py | 21 ++++++++++----- .../cudf_polars/cudf_polars/dsl/translate.py | 26 ++++++++----------- .../cudf_polars/cudf_polars/testing/plugin.py | 24 +++++++++++++++++ 3 files changed, 49 insertions(+), 22 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 28879094590..408cced78d3 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -764,8 +764,10 @@ def do_evaluate( for c, dtype in zip(df.columns, schema.values(), strict=True) ) if predicate is not None: - (mask,) = broadcast(predicate.evaluate(df), target_length=df.num_rows) - return df.filter(mask) + (mask,) = broadcast( + predicate.evaluate(df), target_length=df.num_rows + ) # pragma: no cover + return df.filter(mask) # pragma: no cover else: 
return df @@ -1180,7 +1182,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - assert_never(how) + assert_never(how) # pragma: no cover @staticmethod def _reorder_maps( @@ -1312,10 +1314,15 @@ def do_evaluate( ) if coalesce and how != "inner": left = left.with_columns( - ( + tuple( Column( - plc.replace.replace_nulls(left_col.obj, right_col.obj), - name=left_col.name, + plc.replace.replace_nulls( + left_col.obj, + right_col.obj.astype(left_col.obj.type()) + if left_col.obj.type().id() != right_col.obj.type().id() + else right_col.obj, + ), + name=left_col.name or right_col.name, ) for left_col, right_col in zip( left.select_columns(left_on.column_names_set), @@ -1767,7 +1774,7 @@ def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR) self.children = children schema = self.children[0].schema if not all(s.schema == schema for s in self.children[1:]): - raise NotImplementedError("Schema mismatch") + raise NotImplementedError("Schema mismatch") # pragma: no cover @classmethod def do_evaluate(cls, zlice: tuple[int, int] | None, *dfs: DataFrame) -> DataFrame: diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 4e79779fa85..4a8c43638f8 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -311,23 +311,19 @@ def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal: ) return literal - def maybe_adjust_binop(e) -> None: - if not isinstance(e.value, expr.BinOp): - return - - left, right = e.value.children - - if isinstance(left, expr.Col) and isinstance(right, expr.Literal): - e.value.children = (left, adjust_literal_dtype(right)) - - elif isinstance(left, expr.Literal) and isinstance(right, expr.Col): - e.value.children = (adjust_literal_dtype(left), right) + def maybe_adjust_binop(e) -> expr.Expr: + if isinstance(e.value, expr.BinOp): + left, right = e.value.children + if isinstance(left, expr.Col) and isinstance(right, expr.Literal): + e.value.children = (left, adjust_literal_dtype(right)) + elif isinstance(left, expr.Literal) and isinstance(right, expr.Col): + e.value.children = (adjust_literal_dtype(left), right) + return e def translate_expr_and_maybe_fix_binop_args(translator, exprs): - translated = [translate_named_expr(translator, n=e) for e in exprs] - for t in translated: - maybe_adjust_binop(t) - return translated + return [ + maybe_adjust_binop(translate_named_expr(translator, n=e)) for e in exprs + ] with set_node(translator.visitor, node.input_left): inp_left = translator.translate_ir(n=None) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 2d916b45611..e57540193a5 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -183,6 +183,30 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int64']-swap=True]": "casting int128 not supported", + 
"tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int64']-swap=False]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int32']-swap=True]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int32']-swap=False]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int16']-swap=True]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int16']-swap=False]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int8']-swap=True]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int8']-swap=False]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int128']-swap=True]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int128']-swap=False]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int64']-swap=True]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int64']-swap=False]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int32']-swap=True]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int32']-swap=False]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int16']-swap=True]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int16']-swap=False]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int8']-swap=True]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int8']-swap=False]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt32', 'Int128']-swap=True]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt32', 'Int128']-swap=False]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt16', 'Int128']-swap=True]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt16', 'Int128']-swap=False]": "casting int128 not supported", + "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt8', 'Int128']-swap=True]": "casting int128 not supported", + 
"tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt8', 'Int128']-swap=False]": "casting int128 not supported", "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU", From 8057f369fb1896a73c37aa62775ada5370f5e9c1 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 22 Jan 2025 09:38:31 -0800 Subject: [PATCH 08/35] xfail more tests --- python/cudf_polars/cudf_polars/dsl/ir.py | 2 +- python/cudf_polars/cudf_polars/testing/plugin.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 408cced78d3..c05aba71439 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1318,7 +1318,7 @@ def do_evaluate( Column( plc.replace.replace_nulls( left_col.obj, - right_col.obj.astype(left_col.obj.type()) + right_col.astype(left_col.obj.type()).obj if left_col.obj.type().id() != right_col.obj.type().id() else right_col.obj, ), diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index e57540193a5..bade1d0387a 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -180,8 +180,10 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input15-expected15-input_dtype15-output_dtype15]": "Unsupported groupby-agg for a particular dtype", "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype", "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", + "tests/unit/operations/test_group_by.py::test_group_by_lit_series": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_join.py::test_join_lit_panic_11410": "no join ordering is preserved", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int64']-swap=True]": "casting int128 not supported", "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int64']-swap=False]": "casting int128 not supported", @@ -212,8 +214,12 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values5-values::int4-conversion from `str` to `i32` failed]": "Cast raises, but error user 
receives is wrong", "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv", + "tests/unit/sql/test_literals.py::test_dollar_quoted_literals": "Empty polars schema from IR", + "tests/unit/sql/test_literals.py::test_intervals": "Empty polars schema from IR", + "tests/unit/sql/test_literals.py::test_select_literals_no_table": "Empty polars schema from IR", "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception", "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", + "tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "Remove after https://github.com/pola-rs/polars/issues/20852 is resolved", "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised", From 01ab676c9acbc9ae845240e1523376d118025cea Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Fri, 24 Jan 2025 07:14:44 -0800 Subject: [PATCH 09/35] address reviews --- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf-polars/meta.yaml | 2 +- dependencies.yaml | 2 +- python/cudf_polars/cudf_polars/dsl/ir.py | 24 +++++------------ .../cudf_polars/cudf_polars/dsl/translate.py | 20 +++++++------- .../cudf_polars/cudf_polars/testing/plugin.py | 16 +++--------- .../cudf_polars/cudf_polars/utils/versions.py | 8 ++---- python/cudf_polars/pyproject.toml | 2 +- python/cudf_polars/tests/test_join.py | 26 +++---------------- python/cudf_polars/tests/test_union.py | 18 ------------- 11 files changed, 29 insertions(+), 93 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index ac8946245c5..aa12402a10b 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -67,7 +67,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.21 +- polars>=1.20,<1.21 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<19.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 5a529f549b7..132a20a8d70 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -65,7 +65,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.21 +- polars>=1.20,<1.21 - pre-commit - pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index d56dc84371f..c077d5e9417 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.11,<1.21 + - polars >=1.20,<1.21 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/dependencies.yaml b/dependencies.yaml index 9146e59d050..e41f30157eb 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -777,7 +777,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.11,<1.21 + - polars>=1.20,<1.21 run_cudf_polars_experimental: common: - output_types: [conda, requirements, pyproject] diff --git 
a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index c05aba71439..fa0975b2bb2 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -31,7 +31,6 @@ from cudf_polars.dsl.nodebase import Node from cudf_polars.dsl.to_ast import to_ast, to_parquet_filter from cudf_polars.utils import dtypes -from cudf_polars.utils.versions import POLARS_VERSION_GT_112 if TYPE_CHECKING: from collections.abc import Callable, Hashable, Iterable, MutableMapping, Sequence @@ -628,12 +627,7 @@ def slice_skip(tbl: plc.Table): ) # pragma: no cover; post init trips first if row_index is not None: name, offset = row_index - if POLARS_VERSION_GT_112: - # If we sliced away some data from the start, that - # shifts the row index. - # But prior to 1.13, polars had this wrong, so we match behaviour - # https://github.com/pola-rs/polars/issues/19607 - offset += skip_rows + offset += skip_rows dtype = schema[name] step = plc.interop.from_arrow( pa.scalar(1, type=plc.interop.to_arrow(dtype)) @@ -763,13 +757,7 @@ def do_evaluate( c.obj.type() == dtype for c, dtype in zip(df.columns, schema.values(), strict=True) ) - if predicate is not None: - (mask,) = broadcast( - predicate.evaluate(df), target_length=df.num_rows - ) # pragma: no cover - return df.filter(mask) # pragma: no cover - else: - return df + return df class Select(IR): @@ -1314,7 +1302,7 @@ def do_evaluate( ) if coalesce and how != "inner": left = left.with_columns( - tuple( + ( Column( plc.replace.replace_nulls( left_col.obj, @@ -1378,7 +1366,9 @@ def do_evaluate( """Evaluate and return a dataframe.""" columns = [c.evaluate(df) for c in exprs] if should_broadcast: - columns = broadcast(*columns, target_length=df.num_rows) + columns = broadcast( + *columns, target_length=df.num_rows if df.num_columns != 0 else None + ) else: # Polars ensures this is true, but let's make sure nothing # went wrong. 
In this case, the parent node is a @@ -1773,8 +1763,6 @@ def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR) self._non_child_args = (zlice,) self.children = children schema = self.children[0].schema - if not all(s.schema == schema for s in self.children[1:]): - raise NotImplementedError("Schema mismatch") # pragma: no cover @classmethod def do_evaluate(cls, zlice: tuple[int, int] | None, *dfs: DataFrame) -> DataFrame: diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 4a8c43638f8..b6353f1ac9c 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -331,20 +331,20 @@ def translate_expr_and_maybe_fix_binop_args(translator, exprs): with set_node(translator.visitor, node.input_right): inp_right = translator.translate_ir(n=None) right_on = translate_expr_and_maybe_fix_binop_args(translator, node.right_on) - how: str | tuple = node.options[0] - if isinstance(how, str) and how.lower() in { - "inner", - "left", - "right", - "full", - "cross", - "semi", - "anti", + + if (how := node.options[0]) in { + "Inner", + "Left", + "Right", + "Full", + "Cross", + "Semi", + "Anti", }: return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) else: how, op1, op2 = node.options[0] - if how not in {"ie_join", "IEJoin"}: + if how != "IEJoin": raise NotImplementedError( f"Unsupported join type {how}" ) # pragma: no cover; asof joins not yet exposed diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index bade1d0387a..925721438c1 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -8,14 +8,10 @@ from functools import partialmethod from typing import TYPE_CHECKING -import fastexcel import pytest -from packaging import version import polars -from cudf_polars.utils.versions import POLARS_VERSION_LT_120 - if TYPE_CHECKING: from collections.abc import Mapping @@ -214,23 +210,14 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values5-values::int4-conversion from `str` to `i32` failed]": "Cast raises, but error user receives is wrong", "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv", - "tests/unit/sql/test_literals.py::test_dollar_quoted_literals": "Empty polars schema from IR", - "tests/unit/sql/test_literals.py::test_intervals": "Empty polars schema from IR", - "tests/unit/sql/test_literals.py::test_select_literals_no_table": "Empty polars schema from IR", "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception", "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", - "tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "Remove after https://github.com/pola-rs/polars/issues/20852 is resolved", "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised", # Maybe flaky, order-dependent? 
"tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", - "tests/unit/io/test_spreadsheet.py::test_write_excel_bytes[calamine]": ( - "Fails when fastexcel version >= 0.12.1. tracking issue: https://github.com/pola-rs/polars/issues/20698", - version.parse(fastexcel.__version__) >= version.parse("0.12.1") - and POLARS_VERSION_LT_120, - ), } @@ -244,6 +231,9 @@ def pytest_configure(config: pytest.Config) -> None: # polars that the requested timezone is unknown. # Since this is random, just skip it, rather than xfailing. "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names", + # The test may segfault with the legacy streaming engine. We should + # remove this skip when all polars tests use the new streaming engine. + "tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "May segfault w/the legacy streaming engine", } diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index 4699c08ba6b..85875a8753d 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -12,16 +12,12 @@ POLARS_VERSION = parse(__version__) -POLARS_VERSION_LT_111 = POLARS_VERSION < parse("1.11") -POLARS_VERSION_LT_112 = POLARS_VERSION < parse("1.12") -POLARS_VERSION_GT_112 = POLARS_VERSION > parse("1.12") -POLARS_VERSION_LT_113 = POLARS_VERSION < parse("1.13") POLARS_VERSION_LT_119 = POLARS_VERSION < parse("1.19") POLARS_VERSION_LT_120 = POLARS_VERSION < parse("1.20") def _ensure_polars_version(): - if POLARS_VERSION_LT_111: + if POLARS_VERSION_LT_120: raise ImportError( - "cudf_polars requires py-polars v1.11 or greater." + "cudf_polars requires py-polars v1.20 or greater." ) # pragma: no cover diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index b5ec6c7a0a9..30f68ae4de4 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.11,<1.21", + "polars>=1.20,<1.21", "pylibcudf==25.2.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index abb656d74ca..efcebb45a75 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -7,17 +7,12 @@ import pytest import polars as pl -from polars.testing import assert_frame_equal from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils.versions import ( - POLARS_VERSION_LT_112, - POLARS_VERSION_LT_113, - POLARS_VERSION_LT_119, -) +from cudf_polars.utils.versions import POLARS_VERSION_LT_119 @pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"]) @@ -30,7 +25,7 @@ def how(request): return request.param -@pytest.fixture(params=[(1, 5), (1, None), (0, 2), (0, None)]) +@pytest.fixture(params=[None, (1, 5), (1, None), (0, 2), (0, None)]) def zlice(request): return request.param @@ -100,15 +95,7 @@ def test_left_join_with_slice(left, right, join_nulls, zlice): q = left.join(right, on="a", how="left", join_nulls=join_nulls, coalesce=True) ctx = nullcontext() if zlice is not None: - q_expect = q.collect().slice(*zlice) q = q.slice(*zlice) - if POLARS_VERSION_LT_112 and (zlice == (1, 5) or zlice == (0, 2)): - # https://github.com/pola-rs/polars/issues/19403 - # https://github.com/pola-rs/polars/issues/19405 - ctx = pytest.raises(AssertionError) - assert_frame_equal( - q_expect, q.collect(engine=pl.GPUEngine(raise_on_fail=True)) - ) with ctx: assert_gpu_result_equal(q) @@ -122,7 +109,6 @@ def test_cross_join(left, right, zlice): assert_gpu_result_equal(q) -@pytest.mark.xfail(POLARS_VERSION_LT_119, reason="Not supported until polars==1.19") @pytest.mark.parametrize( "left_on,right_on", [ @@ -148,13 +134,7 @@ def test_join_literal_key(left, right, left_on, right_on): reason="https://github.com/pola-rs/polars/issues/20831", ), ), - pytest.param( - [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], - marks=pytest.mark.xfail( - POLARS_VERSION_LT_113, - reason="https://github.com/pola-rs/polars/issues/19597", - ), - ), + [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], ], ) def test_join_where(left, right, conditions, zlice): diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index d387e9ff344..de75900f8c0 100644 --- a/python/cudf_polars/tests/test_union.py +++ b/python/cudf_polars/tests/test_union.py @@ -2,15 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import pytest - import polars as pl from cudf_polars.testing.asserts import ( assert_gpu_result_equal, - assert_ir_translation_raises, ) -from cudf_polars.utils.versions import POLARS_VERSION_LT_119 def test_union(): @@ -25,20 +21,6 @@ def test_union(): assert_gpu_result_equal(query) -@pytest.mark.xfail(not POLARS_VERSION_LT_119, reason="query now fails in polars>=1.19") -def test_union_schema_mismatch_raises(): - ldf = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6, 7], - "b": [1, 1, 1, 1, 1, 1, 1], - } - ).lazy() - ldf2 = ldf.select(pl.col("a").cast(pl.Float32)) - query = pl.concat([ldf, ldf2], how="diagonal") - - assert_ir_translation_raises(query, NotImplementedError) - - def test_concat_vertical(): ldf = pl.LazyFrame( { From c34e78528673230cfaa3c52e4bb681ce84ddaf8f Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Fri, 24 Jan 2025 12:11:46 -0800 Subject: [PATCH 10/35] fix join_where test --- python/cudf_polars/tests/test_join.py | 40 +++++++++++++++++++-------- 1 file changed, 28 insertions(+), 
12 deletions(-) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index efcebb45a75..e5a22fd4f7b 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -12,7 +12,6 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils.versions import POLARS_VERSION_LT_119 @pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"]) @@ -122,22 +121,39 @@ def test_join_literal_key(left, right, left_on, right_on): @pytest.mark.parametrize( - "conditions", + "conditions, expr_id", [ - [pl.col("a") < pl.col("a_right")], - [pl.col("a_right") <= pl.col("a") * 2, pl.col("a_right") <= 2 * pl.col("a")], - [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], - pytest.param( + ([pl.col("a") < pl.col("a_right")], "expr_0"), + ( + [ + pl.col("a_right") <= pl.col("a") * 2, + pl.col("a_right") <= 2 * pl.col("a"), + ], + "expr_1", + ), + ( + [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], + "expr_2", + ), + ( [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], - marks=pytest.mark.xfail( - not POLARS_VERSION_LT_119, - reason="https://github.com/pola-rs/polars/issues/20831", - ), + "expr_3", + ), + ( + [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], + "expr_4", ), - [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], ], ) -def test_join_where(left, right, conditions, zlice): +@pytest.mark.parametrize("zlice", [None, (0, 5)]) +def test_join_where(request, left, right, conditions, zlice, expr_id): + request.applymarker( + pytest.mark.xfail( + condition=(expr_id == "expr_3" and zlice is not None), + reason="Failing due to https://github.com/pola-rs/polars/issues/20831. Remove when we upgrade to polars>1.20", + ) + ) + q = left.join_where(right, *conditions) assert_gpu_result_equal(q, check_row_order=False) From febeb5941cfbb31b45287cfc6ccc94980d76f904 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Fri, 24 Jan 2025 12:31:44 -0800 Subject: [PATCH 11/35] keep how Uppercase --- python/cudf_polars/cudf_polars/dsl/ir.py | 30 +++++++++++------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index fa0975b2bb2..f30246ac1cd 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1097,7 +1097,7 @@ class Join(IR): right_on: tuple[expr.NamedExpr, ...] 
"""List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "right", "full", "semi", "anti", "cross"], + Literal["Inner", "Left", "Right", "Full", "Semi", "Anti", "Cross"], bool, tuple[int, int] | None, str, @@ -1136,35 +1136,35 @@ def __init__( @staticmethod @cache def _joiners( - how: Literal["inner", "left", "right", "full", "semi", "anti"], + how: Literal["Inner", "Left", "Right", "Full", "Semi", "Anti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: - if how == "inner": + if how == "Inner": return ( plc.join.inner_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) - elif how == "left" or how == "right": + elif how == "Left" or how == "Right": return ( plc.join.left_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "full": + elif how == "Full": return ( plc.join.full_join, plc.copying.OutOfBoundsPolicy.NULLIFY, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "semi": + elif how == "Semi": return ( plc.join.left_semi_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - elif how == "anti": + elif how == "Anti": return ( plc.join.left_anti_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, @@ -1231,7 +1231,7 @@ def do_evaluate( left_on_exprs: Sequence[expr.NamedExpr], right_on_exprs: Sequence[expr.NamedExpr], options: tuple[ - Literal["inner", "left", "right", "full", "semi", "anti", "cross"], + Literal["Inner", "Left", "Right", "Full", "Semi", "Anti", "Cross"], bool, tuple[int, int] | None, str, @@ -1242,10 +1242,8 @@ def do_evaluate( right: DataFrame, ) -> DataFrame: """Evaluate and return a dataframe.""" - how: str how, join_nulls, zlice, suffix, coalesce, _ = options - how = how.lower() - if how == "cross": + if how == "Cross": # Separate implementation, since cross_join returns the # result, not the gather maps columns = plc.join.cross_join(left.table, right.table).columns() @@ -1282,17 +1280,17 @@ def do_evaluate( table = plc.copying.gather(left.table, lg, left_policy) result = DataFrame.from_table(table, left.column_names) else: - if how == "right": + if how == "Right": # Right join is a left join with the tables swapped left, right = right, left left_on, right_on = right_on, left_on lg, rg = join_fn(left_on.table, right_on.table, null_equality) - if how == "left" or how == "right": + if how == "Left" or how == "Right": # Order of left table is preserved lg, rg = cls._reorder_maps( left.num_rows, lg, left_policy, right.num_rows, rg, right_policy ) - if coalesce and how == "inner": + if coalesce and how == "Inner": right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( plc.copying.gather(left.table, lg, left_policy), left.column_names @@ -1300,7 +1298,7 @@ def do_evaluate( right = DataFrame.from_table( plc.copying.gather(right.table, rg, right_policy), right.column_names ) - if coalesce and how != "inner": + if coalesce and how != "Inner": left = left.with_columns( ( Column( @@ -1321,7 +1319,7 @@ def do_evaluate( replace_only=True, ) right = right.discard_columns(right_on.column_names_set) - if how == "right": + if how == "Right": # Undo the swap for right join before gluing together. 
left, right = right, left right = right.rename_columns( From 922ca75dda5202744bbbe3c6fc077e2ce55ba941 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Sat, 25 Jan 2025 11:11:11 -0600 Subject: [PATCH 12/35] Add support for `pyarrow-19` (#17794) This PR upgrades the upper bound pinnings for `pyarrow` in `cudf`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17794 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- dependencies.yaml | 6 +++--- python/cudf/cudf/tests/test_parquet.py | 4 ++++ python/cudf/pyproject.toml | 4 ++-- python/pylibcudf/pyproject.toml | 6 +++--- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 60d8e96d932..dbb44890965 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -70,7 +70,7 @@ dependencies: - polars>=1.11,<1.18 - pre-commit - ptxcompiler -- pyarrow>=14.0.0,<19.0.0a0 +- pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 - pynvml>=12.0.0,<13.0.0a0 - pytest-benchmark diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index fe1a32ccb87..1b674596a4b 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -67,7 +67,7 @@ dependencies: - pandoc - polars>=1.11,<1.18 - pre-commit -- pyarrow>=14.0.0,<19.0.0a0 +- pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 - pynvjitlink>=0.0.0a0 - pynvml>=12.0.0,<13.0.0a0 diff --git a/dependencies.yaml b/dependencies.yaml index edd83e6e07d..54da3d98d09 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -483,7 +483,7 @@ dependencies: common: - output_types: [conda] packages: - - pyarrow>=14.0.0,<19.0.0a0 + - pyarrow>=14.0.0,<20.0.0a0 - output_types: [requirements, pyproject] packages: # pyarrow 17.0.0 wheels have a subtle issue around threading that @@ -491,8 +491,8 @@ dependencies: # be highly dependent on the exact build configuration, so we'll just # avoid 17.0.0 for now unless we observe similar issues in future # releases as well. 
- - pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64' - - pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64' + - pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64' + - pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64' cuda_version: specific: - output_types: conda diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 9d5f32c7ab9..9ff2a6f0ed7 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -4373,6 +4373,10 @@ def test_parquet_reader_mismatched_nullability_structs(tmpdir): ) +@pytest.mark.skipif( + pa.__version__ == "19.0.0", + reason="https://github.com/rapidsai/cudf/issues/17806", +) @pytest.mark.parametrize( "stats_fname,bloom_filter_fname", [ diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 2b03f515657..bd2a710e84a 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -31,8 +31,8 @@ dependencies = [ "packaging", "pandas>=2.0,<2.2.4dev0", "ptxcompiler", - "pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'", - "pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'", + "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", "pylibcudf==25.2.*,>=0.0.0a0", "rich", "rmm==25.2.*,>=0.0.0a0", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index e0055d5ebf8..efa3d301334 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -22,8 +22,8 @@ dependencies = [ "libcudf==25.2.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", - "pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'", - "pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'", + "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", "rmm==25.2.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 133e0c869531af94474e0bbb66cb22c5f8ba80f2 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Sat, 25 Jan 2025 11:39:46 -0600 Subject: [PATCH 13/35] Resolve race-condition in `disable_module_accelerator` (#17811) Fixes: #17775 This PR fixes a race condition that arises when `disable_module_accelerator` is used in a multi-threaded setting. 
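For illustration, the shape of the fix (shown in the module_accelerator.py hunk below) is to hold `_use_fast_lib_lock` only while saving and flipping `_use_fast_lib`, and then to restore the saved value under the lock again afterwards, instead of keeping the lock held across the whole context. A minimal self-contained sketch of that pattern — the `Accelerator` class here is a stand-in for illustration, not the real `ModuleAcceleratorBase`:

    import threading
    from contextlib import contextmanager


    class Accelerator:
        """Stand-in for the real ModuleAcceleratorBase: a flag plus its lock."""

        def __init__(self) -> None:
            self._use_fast_lib = True
            self._use_fast_lib_lock = threading.RLock()

        @contextmanager
        def disabled(self):
            # Hold the lock only while reading and flipping the flag, rather
            # than keeping it held across the whole context; holding it for
            # the full duration is what blocked other threads from entering
            # disabled() concurrently.
            with self._use_fast_lib_lock:
                saved = self._use_fast_lib
                self._use_fast_lib = False
            try:
                yield
            finally:
                # Restore under the lock as well, since another thread may
                # be reading or toggling the flag at the same time.
                with self._use_fast_lib_lock:
                    self._use_fast_lib = saved


    acc = Accelerator()
    with acc.disabled():
        assert acc._use_fast_lib is False
    assert acc._use_fast_lib is True
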
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17811 --- python/cudf/cudf/pandas/module_accelerator.py | 21 ++++---- .../data/disable_cudf_pandas_multi_thread.py | 49 +++++++++++++++++++ .../test_disable_pandas_accelerator.py | 35 +++++++++++++ python/cudf/cudf_pandas_tests/test_main.py | 4 +- .../cudf/cudf_pandas_tests/test_profiler.py | 4 +- 5 files changed, 100 insertions(+), 13 deletions(-) create mode 100644 python/cudf/cudf_pandas_tests/data/disable_cudf_pandas_multi_thread.py create mode 100644 python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index 38103a71908..9e549713f7b 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -503,17 +503,20 @@ def disabled(self): ------- Context manager for disabling things """ - try: - self._use_fast_lib_lock.acquire() - # The same thread might enter this context manager - # multiple times, so we need to remember the previous - # value + with self._use_fast_lib_lock: + # Have to hold the lock to modify this variable since + # another thread might be reading it. + # Modification has to happen with the lock held for the + # duration, so if someone else has modified things, then + # we block trying to acquire the lock (hence it is safe to + # release the lock after modifying this value) saved = self._use_fast_lib self._use_fast_lib = False + try: yield finally: - self._use_fast_lib = saved - self._use_fast_lib_lock.release() + with self._use_fast_lib_lock: + self._use_fast_lib = saved @staticmethod def getattr_real_or_wrapped( @@ -613,7 +616,7 @@ def disable_module_accelerator() -> contextlib.ExitStack: """ Temporarily disable any module acceleration. """ - with contextlib.ExitStack() as stack: + with ImportLock(), contextlib.ExitStack() as stack: for finder in sys.meta_path: if isinstance(finder, ModuleAcceleratorBase): stack.enter_context(finder.disabled()) diff --git a/python/cudf/cudf_pandas_tests/data/disable_cudf_pandas_multi_thread.py b/python/cudf/cudf_pandas_tests/data/disable_cudf_pandas_multi_thread.py new file mode 100644 index 00000000000..2cc6cc1ef5b --- /dev/null +++ b/python/cudf/cudf_pandas_tests/data/disable_cudf_pandas_multi_thread.py @@ -0,0 +1,49 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+
+import queue
+from concurrent.futures import ALL_COMPLETED, ThreadPoolExecutor, wait
+
+
+# Function to be called in each thread
+def thread_function(index):
+    # Import the library in the thread
+    import pandas as pd
+
+    x = pd.Series([1, 2, 3])
+
+    return f"{index}" + str(type(type(x)))
+
+
+def main():
+    # Number of threads to use
+    num_threads = 4
+
+    # Queue of tasks to be processed by the threads
+    task_queue = queue.Queue()
+    for i in range(num_threads):
+        task_queue.put((i,))
+
+    # List to hold the futures
+    futures = []
+
+    # Use ThreadPoolExecutor to manage the threads
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        while not task_queue.empty():
+            task = task_queue.get()
+            future = executor.submit(thread_function, *task)
+            futures.append(future)
+
+        # Wait for all threads to complete
+        _, _ = wait(futures, return_when=ALL_COMPLETED)
+
+    # Process the results
+    for i, future in enumerate(futures):
+        result = future.result()
+        print(f"Result from thread {i + 1}: {result}")
+
+
+if __name__ == "__main__":
+    from cudf.pandas.module_accelerator import disable_module_accelerator
+
+    with disable_module_accelerator():
+        main()
diff --git a/python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py b/python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py
new file mode 100644
index 00000000000..c7af6cc5ebf
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+import os
+import subprocess
+
+from cudf.testing import _utils as utils
+
+
+def test_disable_pandas_accelerator_multi_threaded():
+    data_directory = os.path.dirname(os.path.abspath(__file__))
+    # Create a copy of the current environment variables
+    env = os.environ.copy()
+
+    with utils.cudf_timeout(10):
+        sp_completed = subprocess.run(
+            [
+                "python",
+                "-m",
+                "cudf.pandas",
+                data_directory + "/data/disable_cudf_pandas_multi_thread.py",
+            ],
+            capture_output=True,
+            text=True,
+            env=env,
+        )
+    assert sp_completed.returncode == 0
+    output = sp_completed.stdout
+
+    for string in [
+        "Result from thread 1: 0",
+        "Result from thread 2: 1",
+        "Result from thread 3: 2",
+        "Result from thread 4: 3",
+    ]:
+        assert string in output
diff --git a/python/cudf/cudf_pandas_tests/test_main.py b/python/cudf/cudf_pandas_tests/test_main.py
index 326224c8fc0..9db49d55c90 100644
--- a/python/cudf/cudf_pandas_tests/test_main.py
+++ b/python/cudf/cudf_pandas_tests/test_main.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
@@ -57,7 +57,7 @@ def test_run_cudf_pandas_with_script_with_cmd_args_check_cudf():
 expect = _run_python(cudf_pandas=False, command=input_args_and_code)
 assert "cudf" in res.stdout
-    assert "cudf" not in expect.stdout
+    assert "

Date: Sun, 26 Jan 2025 10:31:00 -0800
Subject: [PATCH 14/35] Make more constexpr available on device for cuIO (#17746)

Contributes to https://github.com/rapidsai/cudf/issues/7795

This PR addresses most of the relaxed constexpr usages in cuIO.
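For reviewers unfamiliar with the motivation: when CUDA's relaxed-constexpr mode is not enabled, a plain `constexpr` function is host-only and cannot be called from device code, and host-only facilities such as `std::numeric_limits` cannot be used inside kernels. That is why this PR annotates shared helpers with `CUDF_HOST_DEVICE`/`__device__` and swaps `std::` utilities for their `cuda::std::` counterparts. A standalone sketch of the pattern (`HOST_DEVICE` is a stand-in for `CUDF_HOST_DEVICE`; illustrative only, not cudf source):

```cpp
// build: nvcc -std=c++17 constexpr_demo.cu   (no --expt-relaxed-constexpr needed)
#include <cuda/std/limits>

#include <cstdio>

// Stand-in for cudf's CUDF_HOST_DEVICE annotation macro.
#define HOST_DEVICE __host__ __device__

// Annotated so the same constexpr helper is callable from host and device.
template <typename I>
HOST_DEVICE constexpr I div_rounding_up_unsafe(I dividend, I divisor) noexcept
{
  return (dividend + divisor - 1) / divisor;
}

__global__ void kernel()
{
  // Legal in device code without relaxed constexpr: the helper is
  // __host__ __device__, and cuda::std replaces the host-only <limits>.
  constexpr int num_blocks = div_rounding_up_unsafe(1000, 256);
  constexpr int int_max    = cuda::std::numeric_limits<int>::max();
  printf("num_blocks=%d int_max=%d\n", num_blocks, int_max);
}

int main()
{
  static_assert(div_rounding_up_unsafe(1000, 256) == 4);  // also usable on host
  kernel<<<1, 1>>>();
  cudaDeviceSynchronize();
  return 0;
}
```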
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17746 --- .../cudf/detail/utilities/integer_utils.hpp | 18 ++--- cpp/include/cudf/fixed_point/temporary.hpp | 6 +- .../cudf/io/text/detail/multistate.hpp | 16 +++-- .../strings/detail/convert/fixed_point.cuh | 8 ++- .../detail/convert/fixed_point_to_string.cuh | 8 ++- .../strings/detail/convert/int_to_string.cuh | 4 +- .../cudf/strings/detail/strings_children.cuh | 24 +++++-- cpp/src/io/csv/datetime.cuh | 4 +- cpp/src/io/json/write_json.cu | 70 ++++++++++++++----- cpp/src/io/orc/orc.hpp | 4 +- cpp/src/io/orc/stats_enc.cu | 4 +- cpp/src/io/orc/stripe_enc.cu | 7 +- cpp/src/io/orc/stripe_init.cu | 5 +- cpp/src/io/orc/writer_impl.cu | 2 +- .../io/parquet/compact_protocol_reader.cpp | 4 +- cpp/src/io/parquet/decode_preprocess.cu | 4 +- cpp/src/io/parquet/delta_binary.cuh | 6 +- cpp/src/io/parquet/delta_enc.cuh | 12 ++-- cpp/src/io/parquet/page_decode.cuh | 5 +- cpp/src/io/parquet/page_enc.cu | 66 ++++++++--------- cpp/src/io/parquet/parquet.hpp | 26 +++---- cpp/src/io/parquet/parquet_gpu.hpp | 42 +++++++---- cpp/src/io/parquet/reader_impl_chunking.cu | 4 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 1 + cpp/src/io/parquet/reader_impl_preprocess.cu | 2 +- cpp/src/io/parquet/rle_stream.cuh | 4 +- cpp/src/io/text/multibyte_split.cu | 14 ++-- cpp/src/io/utilities/data_casting.cu | 4 +- cpp/src/io/utilities/output_builder.cuh | 4 +- cpp/src/io/utilities/parsing_utils.cuh | 29 ++++---- 30 files changed, 245 insertions(+), 162 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 44a86f1c84f..135f645817e 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -73,7 +73,7 @@ CUDF_HOST_DEVICE constexpr S round_up_safe(S number_to_round, S modulus) * `modulus` is positive and does not check for overflow. */ template -constexpr S round_down_safe(S number_to_round, S modulus) noexcept +CUDF_HOST_DEVICE constexpr S round_down_safe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; @@ -113,16 +113,16 @@ CUDF_HOST_DEVICE constexpr S round_up_unsafe(S number_to_round, S modulus) noexc * the result will be incorrect */ template -constexpr S div_rounding_up_unsafe(S const& dividend, T const& divisor) noexcept +CUDF_HOST_DEVICE constexpr S div_rounding_up_unsafe(S const& dividend, T const& divisor) noexcept { return (dividend + divisor - 1) / divisor; } namespace detail { template -constexpr I div_rounding_up_safe(std::integral_constant, - I dividend, - I divisor) noexcept +CUDF_HOST_DEVICE constexpr I div_rounding_up_safe(cuda::std::integral_constant, + I dividend, + I divisor) noexcept { // TODO: This could probably be implemented faster return (dividend > divisor) ? 
1 + div_rounding_up_unsafe(dividend - divisor, divisor) @@ -130,7 +130,9 @@ constexpr I div_rounding_up_safe(std::integral_constant, } template -constexpr I div_rounding_up_safe(std::integral_constant, I dividend, I divisor) noexcept +CUDF_HOST_DEVICE constexpr I div_rounding_up_safe(cuda::std::integral_constant, + I dividend, + I divisor) noexcept { auto quotient = dividend / divisor; auto remainder = dividend % divisor; @@ -156,9 +158,9 @@ constexpr I div_rounding_up_safe(std::integral_constant, I dividend, * the non-integral division `dividend/divisor` */ template -constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept +CUDF_HOST_DEVICE constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept { - using i_is_a_signed_type = std::integral_constant>; + using i_is_a_signed_type = cuda::std::integral_constant>; return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor); } diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp index 2bafe235058..643d1d07cb7 100644 --- a/cpp/include/cudf/fixed_point/temporary.hpp +++ b/cpp/include/cudf/fixed_point/temporary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ auto to_string(T value) -> std::string } template -constexpr auto abs(T value) +CUDF_HOST_DEVICE constexpr auto abs(T value) { return value >= 0 ? value : -value; } @@ -72,7 +72,7 @@ CUDF_HOST_DEVICE inline auto max(T lhs, T rhs) } template -constexpr auto exp10(int32_t exponent) +CUDF_HOST_DEVICE constexpr auto exp10(int32_t exponent) { BaseType value = 1; while (exponent > 0) diff --git a/cpp/include/cudf/io/text/detail/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp index 32187b43d34..24b8738d5dd 100644 --- a/cpp/include/cudf/io/text/detail/multistate.hpp +++ b/cpp/include/cudf/io/text/detail/multistate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,8 @@ #include +#include + #include namespace CUDF_EXPORT cudf { @@ -45,7 +47,7 @@ struct multistate { * * @note: The behavior of this function is undefined if size() => max_segment_count */ - constexpr void enqueue(uint8_t head, uint8_t tail) + CUDF_HOST_DEVICE constexpr void enqueue(uint8_t head, uint8_t tail) { _heads |= (head & 0xFu) << (_size * 4); _tails |= (tail & 0xFu) << (_size * 4); @@ -55,17 +57,17 @@ struct multistate { /** * @brief get's the number of segments this multistate represents */ - [[nodiscard]] constexpr uint8_t size() const { return _size; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t size() const { return _size; } /** * @brief get's the highest (____, tail] value this multistate represents */ - [[nodiscard]] constexpr uint8_t max_tail() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t max_tail() const { uint8_t maximum = 0; for (uint8_t i = 0; i < _size; i++) { - maximum = std::max(maximum, get_tail(i)); + maximum = cuda::std::max(maximum, get_tail(i)); } return maximum; @@ -74,7 +76,7 @@ struct multistate { /** * @brief get's the Nth (head, ____] value state this multistate represents */ - [[nodiscard]] constexpr uint8_t get_head(uint8_t idx) const + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t get_head(uint8_t idx) const { return (_heads >> (idx * 4)) & 0xFu; } @@ -82,7 +84,7 @@ struct multistate { /** * @brief get's the Nth (____, tail] value state this multistate represents */ - [[nodiscard]] constexpr uint8_t get_tail(uint8_t idx) const + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t get_tail(uint8_t idx) const { return (_tails >> (idx * 4)) & 0xFu; } diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh index 8440805960e..5ae4af411b6 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include +#include #include #include #include @@ -46,7 +47,7 @@ __device__ inline thrust::pair parse_integer( // highest value where another decimal digit cannot be appended without an overflow; // this preserves the most digits when scaling the final result for this type constexpr UnsignedDecimalType decimal_max = - (std::numeric_limits::max() - 9L) / 10L; + (cuda::std::numeric_limits::max() - 9L) / 10L; __uint128_t value = 0; // for checking overflow int32_t exp_offset = 0; @@ -90,7 +91,8 @@ __device__ inline thrust::pair parse_integer( template __device__ cuda::std::optional parse_exponent(char const* iter, char const* iter_end) { - constexpr uint32_t exponent_max = static_cast(std::numeric_limits::max()); + constexpr uint32_t exponent_max = + static_cast(cuda::std::numeric_limits::max()); // get optional exponent sign int32_t const exp_sign = [&iter] { diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh index 0ee26ec9ee2..af4a4ce7cd2 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #include +#include + namespace cudf::strings::detail { /** @@ -33,7 +35,7 @@ __device__ inline int32_t fixed_point_string_size(__int128_t const& value, int32 auto const abs_value = numeric::detail::abs(value); auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); auto const fraction = count_digits(abs_value % exp_ten); - auto const num_zeros = std::max(0, (-scale - fraction)); + auto const num_zeros = cuda::std::max(0, (-scale - fraction)); return static_cast(value < 0) + // sign if negative count_digits(abs_value / exp_ten) + // integer 1 + // decimal point @@ -66,7 +68,7 @@ __device__ inline void fixed_point_to_string(__int128_t const& value, int32_t sc if (value < 0) *out_ptr++ = '-'; // add sign auto const abs_value = numeric::detail::abs(value); auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); - auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); + auto const num_zeros = cuda::std::max(0, (-scale - count_digits(abs_value % exp_ten))); out_ptr += integer_to_string(abs_value / exp_ten, out_ptr); // add the integer part *out_ptr++ = '.'; // add decimal point diff --git a/cpp/include/cudf/strings/detail/convert/int_to_string.cuh b/cpp/include/cudf/strings/detail/convert/int_to_string.cuh index f6e6a10a864..39b9cd6978c 100644 --- a/cpp/include/cudf/strings/detail/convert/int_to_string.cuh +++ b/cpp/include/cudf/strings/detail/convert/int_to_string.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,7 +67,7 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer) * @return size_type number of digits in input value */ template -constexpr size_type count_digits(IntegerType value) +__device__ constexpr size_type count_digits(IntegerType value) { if (value == 0) return 1; bool const is_negative = cuda::std::is_signed() ? (value < 0) : false; diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index de2f1770e28..cf19baf4826 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,6 +41,21 @@ namespace cudf { namespace strings { namespace detail { +template +struct string_offsets_fn { + Iter _begin; + size_type _strings_count; + constexpr string_offsets_fn(Iter begin, size_type strings_count) + : _begin{begin}, _strings_count{strings_count} + { + } + + __device__ constexpr size_type operator()(size_type idx) const noexcept + { + return idx < _strings_count ? static_cast(_begin[idx]) : size_type{0}; + }; +}; + /** * @brief Gather characters to create a strings column using the given string-index pair iterator * @@ -133,11 +148,8 @@ std::pair, int64_t> make_offsets_child_column( // using exclusive-scan technically requires strings_count+1 input values even though // the final input value is never used. 
// The input iterator is wrapped here to allow the 'last value' to be safely read. - auto map_fn = cuda::proclaim_return_type( - [begin, strings_count] __device__(size_type idx) -> size_type { - return idx < strings_count ? static_cast(begin[idx]) : size_type{0}; - }); - auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn); + auto input_itr = + cudf::detail::make_counting_transform_iterator(0, string_offsets_fn{begin, strings_count}); // Use the sizes-to-offsets iterator to compute the total number of elements auto const total_bytes = cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream); diff --git a/cpp/src/io/csv/datetime.cuh b/cpp/src/io/csv/datetime.cuh index bfdba238a1e..0463eca65e9 100644 --- a/cpp/src/io/csv/datetime.cuh +++ b/cpp/src/io/csv/datetime.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -197,7 +197,7 @@ __inline__ __device__ cuda::std::chrono::hh_mm_ss extract_time_of_d /** * @brief Checks whether `c` is decimal digit */ -constexpr bool is_digit(char c) { return c >= '0' and c <= '9'; } +__device__ constexpr bool is_digit(char c) { return c >= '0' and c <= '9'; } /** * @brief Parses a datetime string and computes the corresponding timestamp. diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 1a0c59e365a..1587c4da9c8 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -376,6 +376,48 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, {}); } +struct scatter_fn { + column_device_view _col; + size_type* _d_strview_offsets; + string_view* _d_strviews; + size_type const* _labels; + size_type const* _list_offsets; + column_device_view _d_strings_children; + string_view _element_seperator; + string_view _element_narep; + + scatter_fn(column_device_view col, + size_type* d_strview_offsets, + string_view* d_strviews, + size_type const* labels, + size_type const* list_offsets, + column_device_view d_strings_children, + string_view const element_separator, + string_view const element_narep) noexcept + : _col{col}, + _d_strview_offsets{d_strview_offsets}, + _d_strviews{d_strviews}, + _labels{labels}, + _list_offsets{list_offsets}, + _d_strings_children{d_strings_children}, + _element_seperator{element_separator}, + _element_narep{element_narep} + { + } + + __device__ void operator()(size_type idx) const + { + auto const label = _labels[idx]; + auto const sublist_index = idx - _list_offsets[label]; + auto const strview_index = _d_strview_offsets[label] + sublist_index * 2 + 1; + // value or na_rep + auto const strview = _d_strings_children.element(idx); + _d_strviews[strview_index] = _d_strings_children.is_null(idx) ? _element_narep : strview; + // separator + if (sublist_index != 0) { _d_strviews[strview_index - 1] = _element_seperator; } + } +}; + /** * @brief Concatenates a list of strings columns into a single strings column. 
* @@ -461,24 +503,14 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri thrust::for_each(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_strings), - [col = *col_device_view, - d_strview_offsets = d_strview_offsets.begin(), - d_strviews = d_strviews.begin(), - labels = labels->view().begin(), - list_offsets = offsets.begin(), - d_strings_children = *d_strings_children, - element_separator, - element_narep] __device__(auto idx) { - auto const label = labels[idx]; - auto const sublist_index = idx - list_offsets[label]; - auto const strview_index = d_strview_offsets[label] + sublist_index * 2 + 1; - // value or na_rep - auto const strview = d_strings_children.element(idx); - d_strviews[strview_index] = - d_strings_children.is_null(idx) ? element_narep : strview; - // separator - if (sublist_index != 0) { d_strviews[strview_index - 1] = element_separator; } - }); + scatter_fn{*col_device_view, + d_strview_offsets.data(), + d_strviews.data(), + labels->view().data(), + offsets.data(), + *d_strings_children, + element_separator, + element_narep}); auto joined_col = make_strings_column(d_strviews, string_view{nullptr, 0}, stream, mr); diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 5ab36fdae8e..8dccf65ef10 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -707,7 +707,7 @@ struct orc_column_device_view : public column_device_view { struct rowgroup_rows { size_type begin; size_type end; - [[nodiscard]] constexpr auto size() const noexcept { return end - begin; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto size() const noexcept { return end - begin; } }; } // namespace orc diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index e01b93262d7..5f4c1e0696d 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -22,6 +22,8 @@ #include +#include + namespace cudf::io::orc::gpu { using strings::detail::fixed_point_string_size; @@ -212,7 +214,7 @@ __device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, void const* r } // Splits a nanosecond timestamp into milliseconds and nanoseconds -__device__ std::pair split_nanosecond_timestamp(int64_t nano_count) +__device__ cuda::std::pair split_nanosecond_timestamp(int64_t nano_count) { auto const ns = cuda::std::chrono::nanoseconds(nano_count); auto const ms_floor = cuda::std::chrono::floor(ns); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index bcdd059bf67..857daeb5856 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -413,8 +414,8 @@ static __device__ uint32_t IntegerRLE( // Find minimum and maximum values if (literal_run > 0) { // Find min & max - T vmin = (t < literal_run) ? v0 : std::numeric_limits::max(); - T vmax = (t < literal_run) ? v0 : std::numeric_limits::min(); + T vmin = (t < literal_run) ? v0 : cuda::std::numeric_limits::max(); + T vmax = (t < literal_run) ? 
v0 : cuda::std::numeric_limits::min(); uint32_t literal_mode, literal_w; vmin = block_reduce(temp_storage).Reduce(vmin, cub::Min()); __syncthreads(); @@ -448,7 +449,7 @@ static __device__ uint32_t IntegerRLE( } else { uint32_t range, w; // Mode 2 base value cannot be bigger than max int64_t, i.e. the first bit has to be 0 - if (vmin <= std::numeric_limits::max() and mode1_w > mode2_w and + if (vmin <= cuda::std::numeric_limits::max() and mode1_w > mode2_w and (literal_run - 1) * (mode1_w - mode2_w) > 4) { s->u.intrle.literal_mode = 2; w = mode2_w; diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 0c739f59b0a..5e23bc5adcc 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -243,9 +244,9 @@ enum row_entry_state_e { */ static auto __device__ index_order_from_index_types(uint32_t index_types_bitmap) { - constexpr std::array full_order = {CI_PRESENT, CI_DATA, CI_DATA2}; + constexpr cuda::std::array full_order = {CI_PRESENT, CI_DATA, CI_DATA2}; - std::array partial_order; + cuda::std::array partial_order; thrust::copy_if(thrust::seq, full_order.cbegin(), full_order.cend(), diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index a0cd126cff0..5c3377a1aeb 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -71,7 +71,7 @@ namespace cudf::io::orc::detail { template -[[nodiscard]] constexpr int varint_size(T val) +[[nodiscard]] CUDF_HOST_DEVICE constexpr int varint_size(T val) { auto len = 1u; while (val > 0x7f) { diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index b8e72aaac88..023402cbcf6 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -359,10 +359,10 @@ class parquet_field_struct : public parquet_field { template class parquet_field_union_struct : public parquet_field { E& enum_val; - std::optional& val; // union structs are always wrapped in std::optional + cuda::std::optional& val; // union structs are always wrapped in std::optional public: - parquet_field_union_struct(int f, E& ev, std::optional& v) + parquet_field_union_struct(int f, E& ev, cuda::std::optional& v) : parquet_field(f), enum_val(ev), val(v) { } diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 5b9831668e6..2f402e3c4b8 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,7 +57,7 @@ __device__ size_type gpuDeltaLengthPageStringSize(page_state_s* s, int t) delta_binary_decoder string_lengths; auto const* string_start = string_lengths.find_end_of_block(s->data_start, s->data_end); // distance is size of string data - return static_cast(std::distance(string_start, s->data_end)); + return static_cast(thrust::distance(string_start, s->data_end)); } return 0; } diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 1fa05b3a6c2..339a6233c4d 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -105,7 +105,7 @@ struct delta_binary_decoder { // returns the value stored in the `value` array at index // `rolling_index(idx)`. If `idx` is `0`, then return `first_value`. - constexpr zigzag128_t value_at(size_type idx) + __device__ constexpr zigzag128_t value_at(size_type idx) { return idx == 0 ? first_value : value[rolling_index(idx)]; } @@ -113,7 +113,7 @@ struct delta_binary_decoder { // returns the number of values encoded in the block data. when all_values is true, // account for the first value in the header. otherwise just count the values encoded // in the mini-block data. - constexpr uint32_t num_encoded_values(bool all_values) + __device__ constexpr uint32_t num_encoded_values(bool all_values) { return value_count == 0 ? 0 : all_values ? value_count : value_count - 1; } diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh index 49f4ccedbf0..56b7c8065ee 100644 --- a/cpp/src/io/parquet/delta_enc.cuh +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,8 @@ #include #include +#include +#include namespace cudf::io::parquet::detail { @@ -57,7 +59,7 @@ constexpr int buffer_size = 2 * block_size; static_assert(block_size % 128 == 0); static_assert(values_per_mini_block % 32 == 0); -constexpr int rolling_idx(int index) { return rolling_index(index); } +__device__ constexpr int rolling_idx(int index) { return rolling_index(index); } // Version of bit packer that can handle up to 64 bits values. // T is the type to use for processing. if nbits <= 32 use uint32_t, otherwise unsigned long long @@ -67,8 +69,8 @@ template inline __device__ void bitpack_mini_block( uint8_t* dst, uleb128_t val, uint32_t count, uint8_t nbits, void* temp_space) { - using wide_type = - std::conditional_t, __uint128_t, uint64_t>; + using wide_type = cuda::std:: + conditional_t, __uint128_t, uint64_t>; using cudf::detail::warp_size; scratch_type constexpr mask = sizeof(scratch_type) * 8 - 1; auto constexpr div = sizeof(scratch_type) * 8; @@ -235,7 +237,7 @@ class delta_binary_packer { size_type const idx = _current_idx + t; T const delta = idx < _num_values ? subtract(_buffer[delta::rolling_idx(idx)], _buffer[delta::rolling_idx(idx - 1)]) - : std::numeric_limits::max(); + : cuda::std::numeric_limits::max(); // Find min delta for the block. 
auto const min_delta = block_reduce(*_block_tmp).Reduce(delta, cub::Min()); diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index a5023e23cc5..b101733d35e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -28,7 +28,7 @@ namespace cudf::io::parquet::detail { struct page_state_s { - constexpr page_state_s() noexcept {} + CUDF_HOST_DEVICE constexpr page_state_s() noexcept {} uint8_t const* data_start{}; uint8_t const* data_end{}; uint8_t const* lvl_end{}; @@ -121,7 +121,8 @@ struct null_count_back_copier { /** * @brief Test if the given page is in a string column */ -constexpr bool is_string_col(PageInfo const& page, device_span chunks) +__device__ constexpr bool is_string_col(PageInfo const& page, + device_span chunks) { if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return false; } auto const& col = chunks[page.chunk_idx]; diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 7dc1255af6f..56d638c68eb 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -32,6 +32,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -59,7 +62,7 @@ constexpr int encode_block_size = 128; constexpr int rle_buffer_size = 2 * encode_block_size; constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; -constexpr int rolling_idx(int pos) { return rolling_index(pos); } +__device__ constexpr int rolling_idx(int pos) { return rolling_index(pos); } // max V1 header size // also valid for dict page header (V1 or V2) @@ -113,7 +116,7 @@ using rle_page_enc_state_s = page_enc_state_s; /** * @brief Returns the size of the type in the Parquet file. */ -constexpr uint32_t physical_type_len(Type physical_type, type_id id, int type_length) +__device__ constexpr uint32_t physical_type_len(Type physical_type, type_id id, int type_length) { if (physical_type == FIXED_LEN_BYTE_ARRAY) { return id == type_id::DECIMAL128 ? 
sizeof(__int128_t) : type_length; @@ -127,7 +130,7 @@ constexpr uint32_t physical_type_len(Type physical_type, type_id id, int type_le } } -constexpr uint32_t max_RLE_page_size(uint8_t value_bit_width, uint32_t num_values) +__device__ constexpr uint32_t max_RLE_page_size(uint8_t value_bit_width, uint32_t num_values) { if (value_bit_width == 0) return 0; @@ -145,7 +148,7 @@ constexpr uint32_t max_RLE_page_size(uint8_t value_bit_width, uint32_t num_value } // subtract b from a, but return 0 if this would underflow -constexpr size_t underflow_safe_subtract(size_t a, size_t b) +__device__ constexpr size_t underflow_safe_subtract(size_t a, size_t b) { if (b > a) { return 0; } return a - b; @@ -228,7 +231,8 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t) __syncthreads(); // page fragment size must fit in a 32-bit signed integer - if (s->frag.fragment_data_size > static_cast(std::numeric_limits::max())) { + if (s->frag.fragment_data_size > + static_cast(cuda::std::numeric_limits::max())) { // TODO need to propagate this error back to the host CUDF_UNREACHABLE("page fragment size exceeds maximum for i32"); } @@ -357,7 +361,7 @@ struct BitwiseOr { template __device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, uint64_t* buffer, void* temp_space) { - using output_type = std::conditional_t; + using output_type = cuda::std::conditional_t; __shared__ delta_binary_packer packer; auto const t = threadIdx.x; @@ -737,7 +741,7 @@ CUDF_KERNEL void __launch_bounds__(128) : frag_g.fragment_data_size; // page fragment size must fit in a 32-bit signed integer - if (fragment_data_size > std::numeric_limits::max()) { + if (fragment_data_size > cuda::std::numeric_limits::max()) { CUDF_UNREACHABLE("page fragment size exceeds maximum for i32"); } @@ -816,7 +820,7 @@ CUDF_KERNEL void __launch_bounds__(128) page_size + rle_pad + (write_v2_headers ? page_g.max_lvl_size : def_level_size + rep_level_size); // page size must fit in 32-bit signed integer - if (max_data_size > std::numeric_limits::max()) { + if (max_data_size > cuda::std::numeric_limits::max()) { CUDF_UNREACHABLE("page size exceeds maximum for i32"); } // if byte_array then save the variable bytes size @@ -1321,7 +1325,7 @@ static __device__ void PlainBoolEncode(rle_page_enc_state_s* s, * @return The difference between two epochs in `cuda::std::chrono::duration` format with a period * of hours. */ -constexpr auto julian_calendar_epoch_diff() +__device__ constexpr auto julian_calendar_epoch_diff() { using namespace cuda::std::chrono; using namespace cuda::std::chrono_literals; @@ -1346,7 +1350,7 @@ __device__ auto julian_days_with_time(int64_t v) auto const dur_time_of_day = dur_total - dur_days; auto const dur_time_of_day_nanos = duration_cast(dur_time_of_day); auto const julian_days = dur_days + ceil(julian_calendar_epoch_diff()); - return std::make_pair(dur_time_of_day_nanos, julian_days); + return cuda::std::pair{dur_time_of_day_nanos, julian_days}; } // this has been split out into its own kernel because of the amount of shared memory required @@ -1711,7 +1715,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) : 0; val_idx = val_idx_in_leaf_col; } - return std::make_tuple(is_valid, val_idx); + return cuda::std::make_tuple(is_valid, val_idx); }(); cur_val_idx += nvals; @@ -1950,7 +1954,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) // need to test for use_dictionary because it might be boolean uint32_t const val_idx = (s->ck.use_dictionary) ? 
val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; - return std::make_tuple(is_valid, val_idx); + return cuda::std::tuple{is_valid, val_idx}; }(); cur_val_idx += nvals; @@ -2200,7 +2204,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) auto const arr_size = get_element(*s->col.leaf_column, val_idx).size_bytes(); // the lengths are assumed to be INT32, check for overflow - if (arr_size > static_cast(std::numeric_limits::max())) { + if (arr_size > static_cast(cuda::std::numeric_limits::max())) { CUDF_UNREACHABLE("byte array size exceeds 2GB"); } v = static_cast(arr_size); @@ -2641,7 +2645,7 @@ class header_encoder { cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::LIST); auto const t_num = static_cast(type); current_header_ptr = cpw_put_uint8( - current_header_ptr, static_cast((std::min(len, size_t{0xfu}) << 4) | t_num)); + current_header_ptr, static_cast((cuda::std::min(len, size_t{0xfu}) << 4) | t_num)); if (len >= 0xf) { current_header_ptr = cpw_put_uint32(current_header_ptr, len); } current_field_index = 0; } @@ -2802,10 +2806,8 @@ __device__ bool increment_utf8_at(unsigned char* ptr) * * @return Pair object containing a pointer to the truncated data and its length. */ -__device__ std::pair truncate_utf8(device_span span, - bool is_min, - void* scratch, - int32_t truncate_length) +__device__ cuda::std::pair truncate_utf8( + device_span span, bool is_min, void* scratch, int32_t truncate_length) { // we know at this point that truncate_length < size_bytes, so // there is data at [len]. work backwards until we find @@ -2842,10 +2844,10 @@ __device__ std::pair truncate_utf8(device_span truncate_binary(device_span arr, - bool is_min, - void* scratch, - int32_t truncate_length) +__device__ cuda::std::pair truncate_binary(device_span arr, + bool is_min, + void* scratch, + int32_t truncate_length) { if (is_min) { return {arr.data(), truncate_length}; } memcpy(scratch, arr.data(), truncate_length); @@ -2869,10 +2871,10 @@ __device__ std::pair truncate_binary(device_span truncate_string(string_view const& str, - bool is_min, - void* scratch, - int32_t truncate_length) +__device__ cuda::std::pair truncate_string(string_view const& str, + bool is_min, + void* scratch, + int32_t truncate_length) { if (truncate_length == NO_TRUNC_STATS or str.size_bytes() <= truncate_length) { return {str.data(), str.size_bytes()}; @@ -2893,7 +2895,7 @@ __device__ std::pair truncate_string(string_view const& s /** * @brief Attempt to truncate a binary array to at most truncate_length bytes. */ -__device__ std::pair truncate_byte_array( +__device__ cuda::std::pair truncate_byte_array( statistics::byte_array_view const& arr, bool is_min, void* scratch, int32_t truncate_length) { if (truncate_length == NO_TRUNC_STATS or arr.size_bytes() <= truncate_length) { @@ -2914,11 +2916,11 @@ __device__ std::pair truncate_byte_array( * valid min or max binary value. String and byte array types will be truncated if they exceed * truncate_length. 
*/ -__device__ std::pair get_extremum(statistics_val const* stats_val, - statistics_dtype dtype, - void* scratch, - bool is_min, - int32_t truncate_length) +__device__ cuda::std::pair get_extremum(statistics_val const* stats_val, + statistics_dtype dtype, + void* scratch, + bool is_min, + int32_t truncate_length) { switch (dtype) { case dtype_bool: return {stats_val, sizeof(bool)}; diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index dc0c4b1540e..f7cbe2bd924 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -20,6 +20,8 @@ #include +#include + #include #include #include @@ -92,10 +94,10 @@ struct LogicalType { BSON }; Type type; - std::optional decimal_type; - std::optional time_type; - std::optional timestamp_type; - std::optional int_type; + cuda::std::optional decimal_type; + cuda::std::optional time_type; + cuda::std::optional timestamp_type; + cuda::std::optional int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} @@ -103,36 +105,36 @@ struct LogicalType { LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {} LogicalType(IntType&& it) : type(INTEGER), int_type(it) {} - [[nodiscard]] constexpr bool is_time_millis() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_millis() const { return type == TIME and time_type->unit.type == TimeUnit::MILLIS; } - [[nodiscard]] constexpr bool is_time_micros() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_micros() const { return type == TIME and time_type->unit.type == TimeUnit::MICROS; } - [[nodiscard]] constexpr bool is_time_nanos() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_nanos() const { return type == TIME and time_type->unit.type == TimeUnit::NANOS; } - [[nodiscard]] constexpr bool is_timestamp_millis() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_millis() const { return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MILLIS; } - [[nodiscard]] constexpr bool is_timestamp_micros() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_micros() const { return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MICROS; } - [[nodiscard]] constexpr bool is_timestamp_nanos() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_nanos() const { return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::NANOS; } - [[nodiscard]] constexpr int8_t bit_width() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr int8_t bit_width() const { return type == INTEGER ? int_type->bitWidth : -1; } @@ -144,7 +146,7 @@ struct LogicalType { return type == DECIMAL ? decimal_type->scale : -1; } - [[nodiscard]] constexpr int32_t precision() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr int32_t precision() const { return type == DECIMAL ? decimal_type->precision : -1; } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 3c8d32572f8..4425f49d82d 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -52,7 +53,7 @@ constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1; constexpr int LEVEL_DECODE_BUF_SIZE = 2048; template -constexpr int rolling_index(int index) +CUDF_HOST_DEVICE constexpr int rolling_index(int index) { // Cannot divide by 0. But `rolling_size` will be 0 for unused arrays, so this case will never // actual be executed. 
@@ -78,7 +79,7 @@ constexpr uint8_t REP_LVL_HIST_CUTOFF = 0; constexpr uint8_t DEF_LVL_HIST_CUTOFF = 0; // see setupLocalPageInfo() in page_decode.cuh for supported page encodings -constexpr bool is_supported_encoding(Encoding enc) +CUDF_HOST_DEVICE constexpr bool is_supported_encoding(Encoding enc) { switch (enc) { case Encoding::PLAIN: @@ -96,7 +97,8 @@ constexpr bool is_supported_encoding(Encoding enc) /** * @brief Atomically OR `error` into `error_code`. */ -constexpr void set_error(kernel_error::value_type error, kernel_error::pointer error_code) +__device__ constexpr void set_error(kernel_error::value_type error, + kernel_error::pointer error_code) { if (error != 0) { cuda::atomic_ref ref{*error_code}; @@ -162,14 +164,14 @@ using std::is_scoped_enum; // helpers to do bit operations on scoped enums template || is_scoped_enum::value))> -constexpr std::uint32_t BitAnd(Ts... bits) +CUDF_HOST_DEVICE constexpr std::uint32_t BitAnd(Ts... bits) { return (... & static_cast(bits)); } template || is_scoped_enum::value))> -constexpr std::uint32_t BitOr(Ts... bits) +CUDF_HOST_DEVICE constexpr std::uint32_t BitOr(Ts... bits) { return (... | static_cast(bits)); } @@ -401,7 +403,7 @@ inline auto make_page_key_iterator(device_span pages) * @brief Struct describing a particular chunk of column data */ struct ColumnChunkDesc { - constexpr ColumnChunkDesc() noexcept {}; + CUDF_HOST_DEVICE constexpr ColumnChunkDesc() noexcept {}; explicit ColumnChunkDesc(size_t compressed_size_, uint8_t* compressed_data_, size_t num_values_, @@ -498,8 +500,8 @@ struct parquet_column_device_view : stats_column_desc { int32_t type_length; //!< length of fixed_length_byte_array data uint8_t level_bits; //!< bits to encode max definition (lower nibble) & repetition (upper nibble) //!< levels - [[nodiscard]] constexpr uint8_t num_def_level_bits() const { return level_bits & 0xf; } - [[nodiscard]] constexpr uint8_t num_rep_level_bits() const { return level_bits >> 4; } + [[nodiscard]] __device__ constexpr uint8_t num_def_level_bits() const { return level_bits & 0xf; } + [[nodiscard]] __device__ constexpr uint8_t num_rep_level_bits() const { return level_bits >> 4; } uint8_t max_def_level; //!< needed for SizeStatistics calculation uint8_t max_rep_level; @@ -540,7 +542,7 @@ constexpr size_t kDictScratchSize = (1 << kDictHashBits) * sizeof(uint32_t); struct EncPage; // convert Encoding to a mask value -constexpr uint32_t encoding_to_mask(Encoding encoding) +CUDF_HOST_DEVICE constexpr uint32_t encoding_to_mask(Encoding encoding) { return 1 << static_cast(encoding); } @@ -601,9 +603,15 @@ struct EncColumnChunk { uint32_t* rep_histogram_data; //!< Size is (max(level) + 1) * (num_data_pages + 1). size_t var_bytes_size; //!< Sum of var_bytes_size from the pages (byte arrays only) - [[nodiscard]] constexpr uint32_t num_dict_pages() const { return use_dictionary ? 1 : 0; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint32_t num_dict_pages() const + { + return use_dictionary ? 
1 : 0; + } - [[nodiscard]] constexpr uint32_t num_data_pages() const { return num_pages - num_dict_pages(); } + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint32_t num_data_pages() const + { + return num_pages - num_dict_pages(); + } }; /** @@ -642,15 +650,21 @@ struct EncPage { Encoding encoding; //!< Encoding used for page data uint16_t num_fragments; //!< Number of fragments in page - [[nodiscard]] constexpr bool is_v2() const { return page_type == PageType::DATA_PAGE_V2; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_v2() const + { + return page_type == PageType::DATA_PAGE_V2; + } - [[nodiscard]] constexpr auto level_bytes() const { return def_lvl_bytes + rep_lvl_bytes; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto level_bytes() const + { + return def_lvl_bytes + rep_lvl_bytes; + } }; /** * @brief Test if the given column chunk is in a string column */ -constexpr bool is_string_col(ColumnChunkDesc const& chunk) +__device__ constexpr bool is_string_col(ColumnChunkDesc const& chunk) { // return true for non-hashed byte_array and fixed_len_byte_array that isn't representing // a decimal. diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 933be889b1a..03a37327e9b 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1079,7 +1079,7 @@ struct decomp_sum { { return {a.codec, a.num_pages + b.num_pages, - std::max(a.max_page_decompressed_size, b.max_page_decompressed_size), + cuda::std::max(a.max_page_decompressed_size, b.max_page_decompressed_size), a.total_decompressed_size + b.total_decompressed_size}; } }; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 25baa1e0ec8..7d3b6a39d5b 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -30,6 +30,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 43666f9e42d..3874346e471 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -649,7 +649,7 @@ void decode_page_headers(pass_intermediate_data& pass, stream.synchronize(); } -constexpr bool is_string_chunk(ColumnChunkDesc const& chunk) +__device__ constexpr bool is_string_chunk(ColumnChunkDesc const& chunk) { auto const is_decimal = chunk.logical_type.has_value() and chunk.logical_type->type == LogicalType::DECIMAL; diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 3c49de0c997..2de2670b7a7 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,7 +24,7 @@ namespace cudf::io::parquet::detail { template -constexpr int rle_stream_required_run_buffer_size() +__device__ constexpr int rle_stream_required_run_buffer_size() { constexpr int num_rle_stream_decode_warps = (num_threads / cudf::detail::warp_size) - 1; return (num_rle_stream_decode_warps * 2); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 028f922bec3..37b1608463b 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,7 +66,7 @@ int32_t constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; int32_t constexpr TILES_PER_CHUNK = 4096; int32_t constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; -constexpr multistate transition_init(char c, cudf::device_span delim) +__device__ constexpr multistate transition_init(char c, cudf::device_span delim) { auto result = multistate(); @@ -79,7 +79,9 @@ constexpr multistate transition_init(char c, cudf::device_span delim return result; } -constexpr multistate transition(char c, multistate state, cudf::device_span delim) +__device__ constexpr multistate transition(char c, + multistate state, + cudf::device_span delim) { auto result = multistate(); @@ -182,7 +184,7 @@ CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( auto const thread_input_offset = tile_input_offset + cudf::thread_index_type{threadIdx.x} * ITEMS_PER_THREAD; auto const thread_input_size = - std::max(chunk_input_chars.size() - thread_input_offset, 0); + cuda::std::max(chunk_input_chars.size() - thread_input_offset, 0); // STEP 1: Load inputs @@ -257,7 +259,7 @@ CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( auto const thread_input_offset = tile_input_offset + cudf::thread_index_type{threadIdx.x} * ITEMS_PER_THREAD; auto const thread_input_size = - std::max(chunk_input_chars.size() - thread_input_offset, 0); + cuda::std::max(chunk_input_chars.size() - thread_input_offset, 0); // STEP 1: Load inputs @@ -555,7 +557,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source if (row == last_row && insert_end) { return thrust::make_pair(chars + begin, len); } else { - return thrust::make_pair(chars + begin, std::max(0, len - delim_size)); + return thrust::make_pair(chars + begin, cuda::std::max(0, len - delim_size)); }; })); return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr); diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 0c49b2e5d78..2750a17d328 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -145,7 +145,7 @@ __device__ __forceinline__ int32_t parse_unicode_hex(char const* str) * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to * \p out_it */ -constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) +__device__ constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) { auto const bytes = (out_it == nullptr) ? 
strings::detail::bytes_in_char_utf8(character) : strings::detail::from_char_utf8(character, out_it); diff --git a/cpp/src/io/utilities/output_builder.cuh b/cpp/src/io/utilities/output_builder.cuh index 8183a66f4f0..46a3880df84 100644 --- a/cpp/src/io/utilities/output_builder.cuh +++ b/cpp/src/io/utilities/output_builder.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,7 +59,7 @@ class split_device_span { { } - [[nodiscard]] constexpr reference operator[](size_type i) const + [[nodiscard]] __device__ constexpr reference operator[](size_type i) const { return i < _head.size() ? _head[i] : _tail[i - _head.size()]; } diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 9833dab282e..a30ede957ec 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -30,7 +30,10 @@ #include +#include #include +#include +#include #include #include #include @@ -158,7 +161,7 @@ __device__ __forceinline__ thrust::pair get_escaped_char(char escape * @return uint8_t Numeric value of the character, or `0` */ template -constexpr uint8_t decode_digit(char c, bool* valid_flag) +__device__ constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; if constexpr (as_hex and std::is_integral_v) { @@ -210,9 +213,9 @@ CUDF_HOST_DEVICE constexpr bool is_infinity(char const* begin, char const* end) * @return The parsed and converted value */ template -__host__ __device__ cuda::std::optional parse_numeric(char const* begin, - char const* end, - parse_options_view const& opts) +CUDF_HOST_DEVICE cuda::std::optional parse_numeric(char const* begin, + char const* end, + parse_options_view const& opts) { T value{}; bool all_digits_valid = true; @@ -222,8 +225,8 @@ __host__ __device__ cuda::std::optional parse_numeric(char const* begin, int32_t sign = (*begin == '-') ? 
-1 : 1; // Handle infinity - if (std::is_floating_point_v && is_infinity(begin, end)) { - return sign * std::numeric_limits::infinity(); + if (cuda::std::is_floating_point_v && is_infinity(begin, end)) { + return sign * cuda::std::numeric_limits::infinity(); } if (*begin == '-' || *begin == '+') begin++; @@ -244,7 +247,7 @@ __host__ __device__ cuda::std::optional parse_numeric(char const* begin, ++begin; } - if (std::is_floating_point_v) { + if (cuda::std::is_floating_point_v) { // Handle fractional part of the number if necessary double divisor = 1; while (begin < end) { @@ -449,7 +452,7 @@ __inline__ __device__ It skip_character(It const& it, char ch) * * @return Trimmed range */ -__inline__ __device__ std::pair trim_whitespaces_quotes( +__inline__ __device__ cuda::std::pair trim_whitespaces_quotes( char const* begin, char const* end, char quotechar = '\0') { auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); }; @@ -471,8 +474,8 @@ __inline__ __device__ std::pair trim_whitespaces_quote * * @return Trimmed range */ -__inline__ __device__ std::pair trim_whitespaces(char const* begin, - char const* end) +__inline__ __device__ cuda::std::pair trim_whitespaces(char const* begin, + char const* end) { auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); }; @@ -495,9 +498,9 @@ __inline__ __device__ std::pair trim_whitespaces(char * * @return Trimmed range */ -__inline__ __device__ std::pair trim_quotes(char const* begin, - char const* end, - char quotechar) +__inline__ __device__ cuda::std::pair trim_quotes(char const* begin, + char const* end, + char quotechar) { if ((thrust::distance(begin, end) >= 2 && *begin == quotechar && *thrust::prev(end) == quotechar)) { From da4533d68640c92a0e1fd7f9ac943b3eb642cd55 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 27 Jan 2025 08:05:17 -0800 Subject: [PATCH 15/35] address review --- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf-polars/meta.yaml | 2 +- dependencies.yaml | 2 +- .../cudf_polars/dsl/expressions/string.py | 2 +- python/cudf_polars/cudf_polars/dsl/ir.py | 22 ++++++----- .../cudf_polars/cudf_polars/testing/plugin.py | 25 ------------ .../cudf_polars/cudf_polars/utils/versions.py | 1 - python/cudf_polars/pyproject.toml | 2 +- python/cudf_polars/tests/test_join.py | 39 +++++-------------- 10 files changed, 28 insertions(+), 71 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 2299372c0c7..cc01f5286ef 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -67,7 +67,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.20,<1.21 +- polars>=1.20,<1.22 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<20.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index c671cf15128..d52cb85abe6 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -65,7 +65,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.20,<1.21 +- polars>=1.20,<1.22 - pre-commit - pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index c077d5e9417..fb7ab9332d8 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: 
run: - python - pylibcudf ={{ version }} - - polars >=1.20,<1.21 + - polars >=1.20,<1.22 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/dependencies.yaml b/dependencies.yaml index 2d3d6793685..30d477c91be 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -777,7 +777,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.20,<1.21 + - polars>=1.20,<1.22 run_cudf_polars_experimental: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index e51ac7977ce..a1c98a2ce1b 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -41,7 +41,7 @@ class Name(IntEnum): ConcatHorizontal = auto() ConcatVertical = auto() Contains = auto() - ContainsMany = auto() + ContainsAny = auto() CountMatches = auto() EndsWith = auto() EscapeRegex = auto() diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index f30246ac1cd..74f026e57cd 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1290,25 +1290,27 @@ def do_evaluate( lg, rg = cls._reorder_maps( left.num_rows, lg, left_policy, right.num_rows, rg, right_policy ) - if coalesce and how == "Inner": - right = right.discard_columns(right_on.column_names_set) + if coalesce: + if how == "Full": + # In this case, keys must be column references, + # possibly with dtype casting. We should use them in + # preference to the columns from the original tables. + left = left.with_columns(left_on.columns, replace_only=True) + right = right.with_columns(right_on.columns, replace_only=True) + else: + right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( plc.copying.gather(left.table, lg, left_policy), left.column_names ) right = DataFrame.from_table( plc.copying.gather(right.table, rg, right_policy), right.column_names ) - if coalesce and how != "Inner": + if coalesce and how == "Full": left = left.with_columns( ( Column( - plc.replace.replace_nulls( - left_col.obj, - right_col.astype(left_col.obj.type()).obj - if left_col.obj.type().id() != right_col.obj.type().id() - else right_col.obj, - ), - name=left_col.name or right_col.name, + plc.replace.replace_nulls(left_col.obj, right_col.obj), + name=left_col.name, ) for left_col, right_col in zip( left.select_columns(left_on.column_names_set), diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 925721438c1..ccf21886e48 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -179,32 +179,7 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/test_group_by.py::test_group_by_lit_series": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", - "tests/unit/operations/test_join.py::test_join_lit_panic_11410": "no join ordering is preserved", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", - 
"tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int64']-swap=True]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int64']-swap=False]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int32']-swap=True]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int32']-swap=False]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int16']-swap=True]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int16']-swap=False]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int8']-swap=True]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'Int128', 'Int8']-swap=False]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int128']-swap=True]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int128']-swap=False]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int64']-swap=True]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int64']-swap=False]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int32']-swap=True]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int32']-swap=False]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int16']-swap=True]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int16']-swap=False]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int8']-swap=True]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt64', 'Int8']-swap=False]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt32', 'Int128']-swap=True]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt32', 'Int128']-swap=False]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt16', 'Int128']-swap=True]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt16', 'Int128']-swap=False]": "casting int128 not supported", - 
"tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt8', 'Int128']-swap=True]": "casting int128 not supported", - "tests/unit/operations/test_join.py::test_join_numeric_type_upcast_15338[dtypes-['Int128', 'UInt8', 'Int128']-swap=False]": "casting int128 not supported", "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU", diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index 85875a8753d..e9d735bdf72 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -12,7 +12,6 @@ POLARS_VERSION = parse(__version__) -POLARS_VERSION_LT_119 = POLARS_VERSION < parse("1.19") POLARS_VERSION_LT_120 = POLARS_VERSION < parse("1.20") diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 30f68ae4de4..15547f85d56 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.20,<1.21", + "polars>=1.20,<1.22", "pylibcudf==25.2.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index e5a22fd4f7b..4dbf972dc9f 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -121,39 +121,20 @@ def test_join_literal_key(left, right, left_on, right_on): @pytest.mark.parametrize( - "conditions, expr_id", + "conditions", [ - ([pl.col("a") < pl.col("a_right")], "expr_0"), - ( - [ - pl.col("a_right") <= pl.col("a") * 2, - pl.col("a_right") <= 2 * pl.col("a"), - ], - "expr_1", - ), - ( - [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], - "expr_2", - ), - ( - [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], - "expr_3", - ), - ( - [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], - "expr_4", - ), + [pl.col("a") < pl.col("a_right")], + [ + pl.col("a_right") <= pl.col("a") * 2, + pl.col("a_right") <= 2 * pl.col("a"), + ], + [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], + [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], + [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], ], ) @pytest.mark.parametrize("zlice", [None, (0, 5)]) -def test_join_where(request, left, right, conditions, zlice, expr_id): - request.applymarker( - pytest.mark.xfail( - condition=(expr_id == "expr_3" and zlice is not None), - reason="Failing due to https://github.com/pola-rs/polars/issues/20831. 
Remove when we upgrade to polars>1.20", - ) - ) - +def test_join_where(left, right, conditions, zlice): q = left.join_where(right, *conditions) assert_gpu_result_equal(q, check_row_order=False) From 3336f01cc4014e0e7d7b60be5253b17d8e6f8602 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 27 Jan 2025 10:07:31 -0600 Subject: [PATCH 16/35] increase parallelism in nightly builds (#17792) Contributes to https://github.com/rapidsai/build-planning/issues/136 For nightly builds, some `wheel-build-{project}` jobs currently wait to start until some other `wheel-publish-{dependency}` jobs complete. This is unnecessary... `wheel-build-{dependency}` jobs will upload packages to S3, which is where `wheel-build-{project}` jobs will download them from. This proposes changing that such that all nightly `wheel-build-*` jobs depend only other `wheel-build-*` jobs. This should decrease the end-to-end time it takes for all wheels to be built and published on nightly / branch builds. Also updates `pre-commit` config to the latest `rapids-dependency-file-generator` version. Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/17792 --- .github/workflows/build.yaml | 8 ++++---- .pre-commit-config.yaml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 65aebfb7f8c..f6b3fb83cdd 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -90,7 +90,7 @@ jobs: package-name: libcudf package-type: cpp wheel-build-pylibcudf: - needs: [wheel-publish-libcudf] + needs: [wheel-build-libcudf] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: @@ -111,7 +111,7 @@ jobs: package-name: pylibcudf package-type: python wheel-build-cudf: - needs: wheel-publish-pylibcudf + needs: wheel-build-pylibcudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: @@ -132,7 +132,7 @@ jobs: package-name: cudf package-type: python wheel-build-dask-cudf: - needs: wheel-publish-cudf + needs: wheel-build-cudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: @@ -155,7 +155,7 @@ jobs: package-name: dask_cudf package-type: python wheel-build-cudf-polars: - needs: wheel-publish-pylibcudf + needs: wheel-build-pylibcudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d99b74506e4..052c6cc2cb9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -173,7 +173,7 @@ repos: ) - id: verify-alpha-spec - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.16.0 + rev: v1.17.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] From 32b5f24f8c6529c065d5d3cd8b293109ea6269c3 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 27 Jan 2025 11:14:19 -0500 Subject: [PATCH 17/35] add todo comment --- python/cudf_polars/cudf_polars/dsl/translate.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index b6353f1ac9c..640fc8d81c5 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -327,6 +327,9 @@ def 
     with set_node(translator.visitor, node.input_left):
         inp_left = translator.translate_ir(n=None)
+    # TODO: There's a bug in the polars type coercion phase. Use
+    # translate_named_expr directly once it is resolved.
+    # Tracking issue: https://github.com/pola-rs/polars/issues/20935
     left_on = translate_expr_and_maybe_fix_binop_args(translator, node.left_on)
     with set_node(translator.visitor, node.input_right):
         inp_right = translator.translate_ir(n=None)

From 03e1f64a3678b8b24c0390a781ca99d8c2234c97 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Mon, 27 Jan 2025 10:38:22 -0600
Subject: [PATCH 18/35] Fix pre-commit.ci failures (#17819)

## Description

This PR fixes `pre-commit.ci` failures.

## Checklist

- [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [x] New or existing tests cover these changes.
- [x] The documentation is up to date with these changes.

Co-authored-by: Vyas Ramasubramani
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 052c6cc2cb9..965b667605c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ ci:
   autoupdate_branch: ""
   autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate"
   autoupdate_schedule: quarterly
-  skip: ["verify-alpha-spec"]
+  skip: ["verify-alpha-spec", "nbqa-isort"]
   submodules: false

 repos:

From 4f4f456e6444812b29f2663652c172ca12e2383d Mon Sep 17 00:00:00 2001
From: Matthew Murray
Date: Mon, 27 Jan 2025 09:34:00 -0800
Subject: [PATCH 19/35] xfail/skip more tests

---
 python/cudf_polars/cudf_polars/testing/plugin.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
index ccf21886e48..b145c8871be 100644
--- a/python/cudf_polars/cudf_polars/testing/plugin.py
+++ b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -122,6 +122,8 @@ def pytest_configure(config: pytest.Config) -> None:
     "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match",
     "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR",
     "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR",
+    "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_parquet-write_parquet]": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_csv-write_csv]": "Debug output on stderr doesn't match",
     "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR",
     "tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR",
     "tests/unit/io/test_write.py::test_write_async[-write_csv]": "Need to add include_file_path to IR",
@@ -209,6 +211,8 @@ def pytest_configure(config: pytest.Config) -> None:
     # The test may segfault with the legacy streaming engine. We should
     # remove this skip when all polars tests use the new streaming engine.
"tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "May segfault w/the legacy streaming engine", + # Fails in CI, but passes locally + "tests/unit/streaming/test_streaming.py::test_streaming_streamable_functions": "RuntimeError: polars_python::sql::PySQLContext is unsendable, but is being dropped on another thread", } From 025ac8ea655d3971e2b75b36f0ec84f89ebdbb2b Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 27 Jan 2025 15:17:10 -0500 Subject: [PATCH 20/35] Update python/cudf_polars/cudf_polars/testing/plugin.py --- python/cudf_polars/cudf_polars/testing/plugin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index b145c8871be..0b52cf1c61c 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -124,6 +124,7 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR", "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_parquet-write_parquet]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_csv-write_csv]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_ndjson-write_ndjson]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR", "tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR", "tests/unit/io/test_write.py::test_write_async[-write_csv]": "Need to add include_file_path to IR", From e0fe51d7ad1d77459f39eebffd90bbb6539d03d8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 27 Jan 2025 18:01:24 -0800 Subject: [PATCH 21/35] Compute and use the initial string offset when building `nested` large string cols with chunked parquet reader (#17702) Closes #17692. This PR enables computing the `str_offset` required to correctly compute the offsets columns for nested large strings columns with chunked Parquet reader when `chunk_read_limit` is small resulting in multiple output table chunks per subpass. 
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17702 --- .../io/parquet/parquet_reader_input.cpp | 5 + .../cudf/detail/sizes_to_offsets_iterator.cuh | 10 +- .../cudf/strings/detail/strings_children.cuh | 5 +- cpp/src/io/parquet/decode_fixed.cu | 18 ++-- cpp/src/io/parquet/page_delta_decode.cu | 50 +++++----- cpp/src/io/parquet/page_string_utils.cuh | 49 ++++++++-- cpp/src/io/parquet/parquet_gpu.hpp | 6 ++ cpp/src/io/parquet/reader_impl.cpp | 29 +++++- cpp/src/io/utilities/column_buffer.hpp | 5 +- cpp/src/io/utilities/column_buffer_strings.cu | 5 +- cpp/src/lists/sequences.cu | 4 +- cpp/src/text/jaccard.cu | 2 +- cpp/tests/large_strings/parquet_tests.cpp | 97 ++++++++++++++++++- 13 files changed, 232 insertions(+), 53 deletions(-) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 32bd945d57c..83e6c35216a 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -121,6 +121,10 @@ void BM_parquet_read_long_strings(nvbench::state& state) cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, profile); // THIS auto const view = tbl->view(); + // set smaller threshold to reduce file size and execution time + auto const threshold = 1; + setenv("LIBCUDF_LARGE_STRINGS_THRESHOLD", std::to_string(threshold).c_str(), 1); + cudf::io::parquet_writer_options write_opts = cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) .compression(compression); @@ -129,6 +133,7 @@ void BM_parquet_read_long_strings(nvbench::state& state) }(); parquet_read_common(num_rows_written, num_cols, source_sink, state); + unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); } template diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh index 88ec0c07dc5..358170f76db 100644 --- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh +++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -255,12 +255,14 @@ static sizes_to_offsets_iterator make_sizes_to_offsets_i * @param begin Input iterator for scan * @param end End of the input iterator * @param result Output iterator for scan result + * @param initial_offset Initial offset to add to scan * @return The last element of the scan */ template auto sizes_to_offsets(SizesIterator begin, SizesIterator end, OffsetsIterator result, + int64_t initial_offset, rmm::cuda_stream_view stream) { using SizeType = typename thrust::iterator_traits::value_type; @@ -273,7 +275,8 @@ auto sizes_to_offsets(SizesIterator begin, make_sizes_to_offsets_iterator(result, result + std::distance(begin, end), last_element.data()); // This function uses the type of the initialization parameter as the accumulator type // when computing the individual scan output elements. 
- thrust::exclusive_scan(rmm::exec_policy(stream), begin, end, output_itr, LastType{0}); + thrust::exclusive_scan( + rmm::exec_policy_nosync(stream), begin, end, output_itr, static_cast(initial_offset)); return last_element.value(stream); } @@ -319,7 +322,8 @@ std::pair, size_type> make_offsets_child_column( }); auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn); // Use the sizes-to-offsets iterator to compute the total number of elements - auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream); + auto const total_elements = + sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, 0, stream); CUDF_EXPECTS( total_elements <= static_cast(std::numeric_limits::max()), "Size of output exceeds the column size limit", diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index cf19baf4826..cd386ea886f 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -152,7 +152,7 @@ std::pair, int64_t> make_offsets_child_column( cudf::detail::make_counting_transform_iterator(0, string_offsets_fn{begin, strings_count}); // Use the sizes-to-offsets iterator to compute the total number of elements auto const total_bytes = - cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream); + cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, 0, stream); auto const threshold = cudf::strings::get_offset64_threshold(); CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || (total_bytes < threshold), @@ -163,7 +163,8 @@ std::pair, int64_t> make_offsets_child_column( offsets_column = make_numeric_column( data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); auto d_offsets64 = offsets_column->mutable_view().template data(); - cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream); + cudf::detail::sizes_to_offsets( + input_itr, input_itr + strings_count + 1, d_offsets64, 0, stream); } return std::pair(std::move(offsets_column), total_bytes); diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index f63a4fb79b9..84f751dea6b 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -942,6 +942,7 @@ constexpr bool is_split_decode() * @param chunks List of column chunks * @param min_row Row index to start reading at * @param num_rows Maximum number of rows to read + * @param initial_str_offsets Vector to store the initial offsets for large nested string cols * @param error_code Error code to set if an error is encountered */ template @@ -950,6 +951,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) device_span chunks, size_t min_row, size_t num_rows, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code) { constexpr bool has_dict_t = has_dict(); @@ -1161,11 +1163,14 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) valid_count = next_valid_count; } - // Now turn the array of lengths into offsets, but skip if this is a large string column. In the - // latter case, offsets will be computed during string column creation. if constexpr (has_strings_t) { - if (!s->col.is_large_string_col) { - convert_small_string_lengths_to_offsets(s); + // For large strings, update the initial string buffer offset to be used during large string + // column construction. 
Otherwise, convert string sizes to final offsets. + if (s->col.is_large_string_col) { + compute_initial_large_strings_offset( + s, initial_str_offsets[pages[page_idx].chunk_idx], has_lists_t); + } else { + convert_small_string_lengths_to_offsets(s, has_lists_t); } } if (t == 0 and s->error != 0) { set_error(s->error, error_code); } @@ -1185,6 +1190,7 @@ void __host__ DecodePageData(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, decode_kernel_mask kernel_mask, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -1199,11 +1205,11 @@ void __host__ DecodePageData(cudf::detail::hostdevice_span pages, if (level_type_size == 1) { gpuDecodePageDataGeneric <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } else { gpuDecodePageDataGeneric <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } }; diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 0c9d4e77f0c..4c98a08006c 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -435,6 +435,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) device_span chunks, size_t min_row, size_t num_rows, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code) { using cudf::detail::warp_size; @@ -579,17 +580,13 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) __syncthreads(); } - // Now turn the array of lengths into offsets, but skip if this is a large string column. In the - // latter case, offsets will be computed during string column creation. - if (not s->col.is_large_string_col) { - int value_count = nesting_info_base[leaf_level_index].value_count; - - // if no repetition we haven't calculated start/end bounds and instead just skipped - // values until we reach first_row. account for that here. - if (!has_repetition) { value_count -= s->first_row; } - - auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); - block_excl_sum(offptr, value_count, s->page.str_offset); + // For large strings, update the initial string buffer offset to be used during large string + // column construction. Otherwise, convert string sizes to final offsets. + if (s->col.is_large_string_col) { + compute_initial_large_strings_offset( + s, initial_str_offsets[pages[page_idx].chunk_idx], has_repetition); + } else { + convert_small_string_lengths_to_offsets(s, has_repetition); } if (t == 0 and s->error != 0) { set_error(s->error, error_code); } @@ -603,6 +600,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) device_span chunks, size_t min_row, size_t num_rows, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code) { using cudf::detail::warp_size; @@ -741,17 +739,13 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) __syncthreads(); } - // Now turn the array of lengths into offsets, but skip if this is a large string column. In the - // latter case, offsets will be computed during string column creation. 
- if (not s->col.is_large_string_col) { - int value_count = nesting_info_base[leaf_level_index].value_count; - - // if no repetition we haven't calculated start/end bounds and instead just skipped - // values until we reach first_row. account for that here. - if (!has_repetition) { value_count -= s->first_row; } - - auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); - block_excl_sum(offptr, value_count, s->page.str_offset); + // For large strings, update the initial string buffer offset to be used during large string + // column construction. Otherwise, convert string sizes to final offsets. + if (s->col.is_large_string_col) { + compute_initial_large_strings_offset( + s, initial_str_offsets[pages[page_idx].chunk_idx], has_repetition); + } else { + convert_small_string_lengths_to_offsets(s, has_repetition); } // finally, copy the string data into place @@ -797,6 +791,7 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span pages, size_t num_rows, size_t min_row, int level_type_size, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -807,10 +802,10 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span pages, if (level_type_size == 1) { gpuDecodeDeltaByteArray<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } else { gpuDecodeDeltaByteArray<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } } @@ -822,6 +817,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, size_t num_rows, size_t min_row, int level_type_size, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -832,10 +828,10 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, if (level_type_size == 1) { gpuDecodeDeltaLengthByteArray<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } else { gpuDecodeDeltaLengthByteArray<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } } diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index dc4140d0a44..ba627e73625 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -20,6 +20,8 @@ #include +#include + namespace cudf::io::parquet::detail { // stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. @@ -98,21 +100,54 @@ __device__ inline void block_excl_sum(size_type* arr, size_type length, size_typ } } -template -__device__ inline void convert_small_string_lengths_to_offsets(page_state_s* s) +/** + * @brief Converts string sizes to offsets if this is not a large string column. Otherwise, + * atomically update the initial string offset to be used during large string column construction + */ +template +__device__ void convert_small_string_lengths_to_offsets(page_state_s const* const state, + bool has_lists) { // If this is a large string column. In the // latter case, offsets will be computed during string column creation. 
- auto& ni = s->nesting_info[s->col.max_nesting_depth - 1]; + auto& ni = state->nesting_info[state->col.max_nesting_depth - 1]; int value_count = ni.value_count; // if no repetition we haven't calculated start/end bounds and instead just skipped // values until we reach first_row. account for that here. - if constexpr (!has_lists) { value_count -= s->first_row; } + if (not has_lists) { value_count -= state->first_row; } + + // Convert the array of lengths into offsets + if (value_count > 0) { + auto const offptr = reinterpret_cast(ni.data_out); + auto const initial_value = state->page.str_offset; + block_excl_sum(offptr, value_count, initial_value); + } +} - auto const offptr = reinterpret_cast(ni.data_out); - auto const initial_value = s->page.str_offset; - block_excl_sum(offptr, value_count, initial_value); +/** + * @brief Atomically update the initial string offset to be used during large string column + * construction + */ +inline __device__ void compute_initial_large_strings_offset(page_state_s const* const state, + size_t& initial_str_offset, + bool has_lists) +{ + // Values decoded by this page. + int value_count = state->nesting_info[state->col.max_nesting_depth - 1].value_count; + + // if no repetition we haven't calculated start/end bounds and instead just skipped + // values until we reach first_row. account for that here. + if (not has_lists) { value_count -= state->first_row; } + + // Atomically update the initial string offset if this is a large string column. This initial + // offset will be used to compute (64-bit) offsets during large string column construction. + if (value_count > 0 and threadIdx.x == 0) { + auto const initial_value = state->page.str_offset; + cuda::atomic_ref initial_str_offsets_ref{ + initial_str_offset}; + initial_str_offsets_ref.fetch_min(initial_value, cuda::std::memory_order_relaxed); + } } template diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 4425f49d82d..a78da513b36 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -876,6 +876,7 @@ void DecodeDeltaBinary(cudf::detail::hostdevice_span pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] initial_str_offsets Vector to store the initial offsets for large nested string cols * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -884,6 +885,7 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span pages, size_t num_rows, size_t min_row, int level_type_size, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream); @@ -898,6 +900,7 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] initial_str_offsets Vector to store the initial offsets for large nested string cols * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -906,6 +909,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, size_t num_rows, size_t min_row, int level_type_size, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream); @@ -921,6 +925,7 @@ void 
DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] kernel_mask Mask indicating the type of decoding kernel to launch. + * @param[out] initial_str_offsets Vector to store the initial offsets for large nested string cols * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -930,6 +935,7 @@ void DecodePageData(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, decode_kernel_mask kernel_mask, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index dff1f3f0c0e..9dd4e19de52 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -27,6 +27,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { @@ -210,10 +211,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num } } + // Create an empty device vector to store the initial str offset for large string columns from for + // string decoders. + auto initial_str_offsets = rmm::device_uvector{0, _stream, _mr}; + pass.chunks.host_to_device_async(_stream); chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); - if (has_strings) { chunk_nested_str_data.host_to_device_async(_stream); } + if (has_strings) { + // Host vector to initialize the initial string offsets + auto host_offsets_vector = + cudf::detail::make_host_vector(_input_columns.size(), _stream); + std::fill( + host_offsets_vector.begin(), host_offsets_vector.end(), std::numeric_limits::max()); + // Initialize the initial string offsets vector from the host vector + initial_str_offsets = + cudf::detail::make_device_uvector_async(host_offsets_vector, _stream, _mr); + chunk_nested_str_data.host_to_device_async(_stream); + } // create this before we fork streams kernel_error error_code(_stream); @@ -231,6 +246,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, decoder_mask, + initial_str_offsets, error_code.data(), streams[s_idx++]); }; @@ -287,6 +303,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num num_rows, skip_rows, level_type_size, + initial_str_offsets, error_code.data(), streams[s_idx++]); } @@ -298,6 +315,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num num_rows, skip_rows, level_type_size, + initial_str_offsets, error_code.data(), streams[s_idx++]); } @@ -402,6 +420,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num page_nesting.device_to_host_async(_stream); page_nesting_decode.device_to_host_async(_stream); + // Copy over initial string offsets from device + auto h_initial_str_offsets = cudf::detail::make_host_vector_async(initial_str_offsets, _stream); + if (auto const error = error_code.value_sync(_stream); error != 0) { CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } @@ -440,6 +461,12 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); final_offsets.emplace_back(static_cast(col_string_sizes[idx])); } + // Nested large strings column + else if (input_col.nesting_depth() > 0) { + 
CUDF_EXPECTS(h_initial_str_offsets[idx] != std::numeric_limits::max(), + "Encountered invalid initial offset for large string column"); + out_buf.set_initial_string_offset(h_initial_str_offsets[idx]); + } } } } diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index da19539f509..5a8e3081681 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -249,6 +249,8 @@ class inline_column_buffer : public column_buffer_base { void create_string_data(size_t num_bytes, bool is_large_strings_col, rmm::cuda_stream_view stream); + void set_initial_string_offset(size_t offset) { initial_string_offset = offset; } + void* string_data() { return _string_data.data(); } [[nodiscard]] void const* string_data() const { return _string_data.data(); } [[nodiscard]] size_t string_size() const { return _string_data.size(); } @@ -257,6 +259,7 @@ class inline_column_buffer : public column_buffer_base { private: rmm::device_buffer _string_data{}; bool _is_large_strings_col{}; + size_t initial_string_offset{0}; }; using column_buffer = gather_column_buffer; diff --git a/cpp/src/io/utilities/column_buffer_strings.cu b/cpp/src/io/utilities/column_buffer_strings.cu index 66d0a644c12..6befc078bb2 100644 --- a/cpp/src/io/utilities/column_buffer_strings.cu +++ b/cpp/src/io/utilities/column_buffer_strings.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,7 +37,8 @@ std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_colu data_type{type_id::INT64}, size + 1, mask_state::UNALLOCATED, stream, _mr); auto d_offsets64 = offsets_col->mutable_view().template data(); // it's safe to call with size + 1 because _data is also sized that large - cudf::detail::sizes_to_offsets(offsets_ptr, offsets_ptr + size + 1, d_offsets64, stream); + cudf::detail::sizes_to_offsets( + offsets_ptr, offsets_ptr + size + 1, d_offsets64, initial_string_offset, stream); return make_strings_column( size, std::move(offsets_col), std::move(_string_data), null_count(), std::move(_null_mask)); } else { diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu index 4b50bf626f2..a98f3021da5 100644 --- a/cpp/src/lists/sequences.cu +++ b/cpp/src/lists/sequences.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -167,7 +167,7 @@ std::unique_ptr sequences(column_view const& starts, thrust::copy_n(rmm::exec_policy(stream), sizes_input_it, sizes.size(), offsets_begin); auto const n_elements = cudf::detail::sizes_to_offsets( - offsets_begin, offsets_begin + list_offsets->size(), offsets_begin, stream); + offsets_begin, offsets_begin + list_offsets->size(), offsets_begin, 0, stream); CUDF_EXPECTS(n_elements <= std::numeric_limits::max(), "Size of output exceeds the column size limit", std::overflow_error); diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index 247440212d0..58c94b60718 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -348,7 +348,7 @@ std::pair, rmm::device_uvector> hash_subs count_substrings_kernel<<>>( *d_strings, width, offsets.data()); auto const total_hashes = - cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), stream); + cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), 0, stream); // hash substrings rmm::device_uvector hashes(total_hashes, stream); diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp index 39cd783de00..5d2db84ae2e 100644 --- a/cpp/tests/large_strings/parquet_tests.cpp +++ b/cpp/tests/large_strings/parquet_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ #include "large_strings_fixture.hpp" +#include +#include #include #include @@ -143,3 +145,96 @@ TEST_F(ParquetStringsTest, DISABLED_ChunkedReadLargeStrings) // Verify that we read exactly two table chunks EXPECT_EQ(tables.size(), 2); } + +TEST_F(ParquetStringsTest, ChunkedReadNestedLargeStrings) +{ + using int32s_col = cudf::test::fixed_width_column_wrapper; + using strings_col = cudf::test::strings_column_wrapper; + using structs_col = cudf::test::structs_column_wrapper; + + auto constexpr num_rows = 100'000; + + std::vector> input_columns; + auto const int_iter = thrust::make_counting_iterator(0); + input_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release()); + + auto const str_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int32_t i) { return std::to_string(i) + std::to_string(i) + std::to_string(i); }); + input_columns.emplace_back(strings_col{str_iter, str_iter + num_rows}.release()); + + auto offsets = std::vector{}; + offsets.reserve(num_rows * 2); + cudf::size_type num_structs = 0; + for (int i = 0; i < num_rows; ++i) { + offsets.push_back(num_structs); + auto const new_list_size = i % 4; + num_structs += new_list_size; + } + offsets.push_back(num_structs); + + auto const make_structs_col = [=] { + auto child1 = int32s_col(int_iter, int_iter + num_structs); + auto child2 = int32s_col(int_iter + num_structs, int_iter + num_structs * 2); + auto child3 = strings_col{str_iter, str_iter + num_structs}; + + return structs_col{{child1, child2, child3}}.release(); + }; + + input_columns.emplace_back( + cudf::make_lists_column(static_cast(offsets.size() - 1), + int32s_col(offsets.begin(), offsets.end()).release(), + make_structs_col(), + 0, + rmm::device_buffer{})); + + // Input table + auto const table = cudf::table{std::move(input_columns)}; + auto const expected = table.view(); + + auto const child3_view = expected.column(2).child(1).child(2); // list> + auto const column_size = + 
cudf::strings_column_view(child3_view).chars_size(cudf::get_default_stream()); + // set smaller threshold to reduce file size and execution time + auto const threshold = + column_size / 16; // Empirically set to get a mix of 32 and 64 bit string col chunks. + setenv("LIBCUDF_LARGE_STRINGS_THRESHOLD", std::to_string(threshold).c_str(), 1); + + // Host buffer to write Parquet + auto buffer = std::vector{}; + // Writer options + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, expected) + .max_page_size_bytes(512 * 1024) + .max_page_size_rows(20000) + .dictionary_policy(cudf::io::dictionary_policy::ALWAYS) + .write_v2_headers(false); + + // Write to Parquet + cudf::io::write_parquet(out_opts); + + // Reader options + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(buffer.data(), buffer.size())); + + auto constexpr chunk_read_limit = size_t{1} * 1024 * 1024; + auto constexpr pass_read_limit = 0; + // Chunked parquet reader + auto reader = cudf::io::chunked_parquet_reader(chunk_read_limit, pass_read_limit, in_opts); + + // Read chunked + auto tables = std::vector>{}; + while (reader.has_next()) { + tables.emplace_back(reader.read_chunk().tbl); + } + auto table_views = std::vector{}; + std::transform(tables.begin(), tables.end(), std::back_inserter(table_views), [](auto& tbl) { + return tbl->view(); + }); + auto result = cudf::concatenate(table_views); + + // Verify tables to be equal + CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); + + // go back to normal threshold + unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); +} From d4e94ec26a736bfcc863fb5be96c01d601789ec8 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 28 Jan 2025 07:20:08 +0000 Subject: [PATCH 22/35] Implement string join in cudf-polars (#17755) A small new string feature. 
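For illustration, a minimal query that exercises the new code path (made-up data; `engine="gpu"` assumes a working cudf-polars installation):

    import polars as pl

    ldf = pl.LazyFrame({"a": ["x", None, "z"]})
    q = ldf.select(pl.col("a").str.join("/", ignore_nulls=True))
    print(q.collect(engine="gpu"))  # a single row containing "x/z"

With `ignore_nulls=False`, a null anywhere in the column yields a single all-null result, matching the `ConcatVertical` handling added below.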
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17755 --- .../cudf_polars/dsl/expressions/string.py | 18 ++++++++++++++++-- .../tests/expressions/test_stringfunction.py | 9 ++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index a1c98a2ce1b..aa32dc66bd9 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -112,6 +112,7 @@ def __init__( def _validate_input(self): if self.name not in ( + StringFunction.Name.ConcatVertical, StringFunction.Name.Contains, StringFunction.Name.EndsWith, StringFunction.Name.Lowercase, @@ -125,7 +126,7 @@ def _validate_input(self): StringFunction.Name.StripCharsEnd, StringFunction.Name.Uppercase, ): - raise NotImplementedError(f"String function {self.name}") + raise NotImplementedError(f"String function {self.name!r}") if self.name is StringFunction.Name.Contains: literal, strict = self.options if not literal: @@ -205,7 +206,20 @@ def do_evaluate( mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - if self.name is StringFunction.Name.Contains: + if self.name is StringFunction.Name.ConcatVertical: + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) + delimiter, ignore_nulls = self.options + if column.obj.null_count() > 0 and not ignore_nulls: + return Column(plc.Column.all_null_like(column.obj, 1)) + return Column( + plc.strings.combine.join_strings( + column.obj, + plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())), + plc.interop.from_arrow(pa.scalar(None, type=pa.string())), + ) + ) + elif self.name is StringFunction.Name.Contains: child, arg = self.children column = child.evaluate(df, context=context, mapping=mapping) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 8d7d970eb07..4d41a8c590b 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -454,3 +454,10 @@ def test_string_to_numeric_invalid(numeric_type): polars_except=pl.exceptions.InvalidOperationError, cudf_except=pl.exceptions.ComputeError, ) + + +@pytest.mark.parametrize("ignore_nulls", [False, True]) +@pytest.mark.parametrize("delimiter", ["", "/"]) +def test_string_join(ldf, ignore_nulls, delimiter): + q = ldf.select(pl.col("a").str.join(delimiter, ignore_nulls=ignore_nulls)) + assert_gpu_result_equal(q) From 87c8ab3dd3eb3c3ccfcfa5a57ca9013a5ab6dd52 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 28 Jan 2025 02:20:19 -0600 Subject: [PATCH 23/35] Apply ruff everywhere (notebooks and scripts) (#17820) This PR applies `ruff` (`check` and `format`) everywhere, including notebooks and utility scripts. This allows us to drop our use of `nbqa`, since `ruff` natively supports notebooks. 
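As a representative example of the notebook cleanups described below (illustrative, not the exact cells changed), legacy global-state NumPy calls move to the Generator API:

    import numpy as np

    # before: np.random.seed(12); np.random.randint(0, 1000, size=10)
    rng = np.random.default_rng(seed=12)
    values = rng.integers(0, 1000, size=10)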
(xref: #17819, #17805) I manually updated a few notebooks that were using old NumPy syntax for generating random values. Closes #17461. I also updated the `ruff` version to 0.9.3. Authors: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17820 --- .pre-commit-config.yaml | 13 ++----- .../fetch_pandas_versions.py | 26 +++++++++----- .../pandas-tests/job-summary.py | 30 +++++++++++----- ci/utils/nbtestlog2junitxml.py | 34 +++++++++--------- cpp/scripts/sort_ninja_log.py | 23 +++++++----- docs/cudf/source/_ext/PandasCompat.py | 27 ++++++++------ docs/cudf/source/conf.py | 13 +++---- docs/cudf/source/user_guide/10min.ipynb | 3 +- .../cudf/source/user_guide/cupy-interop.ipynb | 5 +-- .../source/user_guide/guide-to-udfs.ipynb | 7 ++-- .../cudf/source/user_guide/missing-data.ipynb | 11 +++--- .../performance-comparisons.ipynb | 29 +++++++-------- docs/dask_cudf/source/conf.py | 6 ++-- pyproject.toml | 13 +++---- python/cudf/benchmarks/common/utils.py | 6 ++-- python/cudf/benchmarks/conftest.py | 8 ++--- python/cudf/cudf/core/_base_index.py | 4 +-- python/cudf/cudf/core/column/categorical.py | 6 ++-- python/cudf/cudf/core/column/column.py | 4 +-- python/cudf/cudf/core/column/datetime.py | 4 +-- python/cudf/cudf/core/column/lists.py | 2 +- python/cudf/cudf/core/column/string.py | 6 ++-- python/cudf/cudf/core/column_accessor.py | 4 +-- python/cudf/cudf/core/dataframe.py | 15 ++++---- python/cudf/cudf/core/df_protocol.py | 5 ++- python/cudf/cudf/core/frame.py | 6 ++-- python/cudf/cudf/core/groupby/groupby.py | 10 +++--- python/cudf/cudf/core/index.py | 12 +++---- python/cudf/cudf/core/indexing_utils.py | 11 +++--- python/cudf/cudf/core/mixins/mixin_factory.py | 8 ++--- python/cudf/cudf/core/reshape.py | 6 ++-- python/cudf/cudf/core/scalar.py | 5 +-- python/cudf/cudf/core/series.py | 35 +++++++++---------- python/cudf/cudf/core/subword_tokenizer.py | 5 ++- python/cudf/cudf/io/json.py | 2 +- python/cudf/cudf/io/parquet.py | 4 +-- python/cudf/cudf/options.py | 8 ++--- python/cudf/cudf/tests/test_array_ufunc.py | 5 ++- python/cudf/cudf/tests/test_csv.py | 22 +++++++----- .../cudf/tests/test_cuda_array_interface.py | 5 ++- python/cudf/cudf/tests/test_json.py | 17 +++++---- python/cudf/cudf/tests/test_parquet.py | 6 ++-- python/cudf/cudf/tests/test_scalar.py | 4 +-- python/cudf/cudf/tests/test_series.py | 4 +-- python/cudf/cudf/tests/test_unaops.py | 5 ++- python/cudf/cudf/utils/dtypes.py | 9 +++-- python/cudf/cudf/utils/hash_vocab_utils.py | 8 ++--- python/cudf/cudf/utils/ioutils.py | 8 ++--- .../cudf/cudf/utils/performance_tracking.py | 8 ++--- python/cudf/cudf/utils/queryutils.py | 5 ++- .../data/repr_slow_down_test.ipynb | 6 ++-- .../cudf_pandas_tests/test_cudf_pandas.py | 8 ++--- .../cudf_polars/testing/asserts.py | 8 ++--- .../pylibcudf/pylibcudf/tests/common/utils.py | 14 ++++---- 54 files changed, 278 insertions(+), 280 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 965b667605c..718353d48e9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ ci: autoupdate_branch: "" autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate" autoupdate_schedule: quarterly - skip: ["verify-alpha-spec", "nbqa-isort"] + skip: ["verify-alpha-spec"] submodules: false repos: @@ -41,13 +41,6 @@ repos: "python/cudf_polars/cudf_polars", "python/dask_cudf/dask_cudf"] 
pass_filenames: false - - repo: https://github.com/nbQA-dev/nbQA - rev: 1.9.1 - hooks: - - id: nbqa-isort - # Use the cudf_kafka isort orderings in notebooks so that dask - # and RAPIDS packages have their own sections. - args: ["--settings-file=python/cudf_kafka/pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-clang-format rev: v16.0.6 hooks: @@ -153,13 +146,11 @@ repos: ^CHANGELOG.md$ ) - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.8.0 + rev: v0.9.3 hooks: - id: ruff args: ["--fix"] - files: python/.*$ - id: ruff-format - files: python/.*$ - repo: https://github.com/rapidsai/pre-commit-hooks rev: v0.4.0 hooks: diff --git a/ci/cudf_pandas_scripts/fetch_pandas_versions.py b/ci/cudf_pandas_scripts/fetch_pandas_versions.py index b6913f947e8..ab72d25400a 100644 --- a/ci/cudf_pandas_scripts/fetch_pandas_versions.py +++ b/ci/cudf_pandas_scripts/fetch_pandas_versions.py @@ -1,24 +1,34 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. + +import argparse import requests -from packaging.version import Version from packaging.specifiers import SpecifierSet -import argparse +from packaging.version import Version + def get_pandas_versions(pandas_range): url = "https://pypi.org/pypi/pandas/json" response = requests.get(url) data = response.json() - versions = [Version(v) for v in data['releases']] + versions = [Version(v) for v in data["releases"]] specifier = SpecifierSet(pandas_range.lstrip("pandas")) matching_versions = [v for v in versions if v in specifier] - matching_minors = sorted(set(".".join((str(v.major), str(v.minor))) for v in matching_versions), key=Version) + matching_minors = sorted( + set(".".join((str(v.major), str(v.minor))) for v in matching_versions), + key=Version, + ) return matching_minors + if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Filter pandas versions by prefix.") - parser.add_argument("pandas_range", type=str, help="The version prefix to filter by.") + parser = argparse.ArgumentParser( + description="Filter pandas versions by prefix." + ) + parser.add_argument( + "pandas_range", type=str, help="The version prefix to filter by." + ) args = parser.parse_args() versions = get_pandas_versions(args.pandas_range) - print(','.join(versions)) + print(",".join(versions)) diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py index 485b2ac8a51..af3e28f440f 100644 --- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py +++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -68,17 +68,27 @@ def emoji_failed(x): pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index() main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index() total_usage = main_df["_slow_function_call"] + main_df["_fast_function_call"] -main_df["CPU Usage"] = ((main_df["_slow_function_call"] / total_usage) * 100.0).round(1) -main_df["GPU Usage"] = ((main_df["_fast_function_call"] / total_usage) * 100.0).round(1) +main_df["CPU Usage"] = ( + (main_df["_slow_function_call"] / total_usage) * 100.0 +).round(1) +main_df["GPU Usage"] = ( + (main_df["_fast_function_call"] / total_usage) * 100.0 +).round(1) total_usage = pr_df["_slow_function_call"] + pr_df["_fast_function_call"] -pr_df["CPU Usage"] = ((pr_df["_slow_function_call"] / total_usage) * 100.0).round(1) -pr_df["GPU Usage"] = ((pr_df["_fast_function_call"] / total_usage) * 100.0).round(1) +pr_df["CPU Usage"] = ( + (pr_df["_slow_function_call"] / total_usage) * 100.0 +).round(1) +pr_df["GPU Usage"] = ( + (pr_df["_fast_function_call"] / total_usage) * 100.0 +).round(1) cpu_usage_mean = pr_df["CPU Usage"].mean().round(2) gpu_usage_mean = pr_df["GPU Usage"].mean().round(2) -gpu_usage_rate_change = abs(pr_df["GPU Usage"].mean() - main_df["GPU Usage"].mean()) +gpu_usage_rate_change = abs( + pr_df["GPU Usage"].mean() - main_df["GPU Usage"].mean() +) pr_df["CPU Usage"] = pr_df["CPU Usage"].fillna(0) pr_df["GPU Usage"] = pr_df["GPU Usage"].fillna(0) main_df["CPU Usage"] = main_df["CPU Usage"].fillna(0) @@ -92,8 +102,12 @@ def emoji_failed(x): pr_df["CPU Usage"] = pr_df["CPU Usage"].astype(str) + "%" pr_df["GPU Usage"] = pr_df["GPU Usage"].astype(str) + "%" -pr_df = pr_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]] -diff_df = diff_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]] +pr_df = pr_df[ + ["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"] +] +diff_df = diff_df[ + ["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"] +] diff_df.columns = diff_df.columns + "_diff" diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed) diff_df["failed_diff"] = diff_df["failed_diff"].map(emoji_failed) diff --git a/ci/utils/nbtestlog2junitxml.py b/ci/utils/nbtestlog2junitxml.py index 14384af3225..91ca083337e 100644 --- a/ci/utils/nbtestlog2junitxml.py +++ b/ci/utils/nbtestlog2junitxml.py @@ -1,15 +1,16 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
# Generate a junit-xml file from parsing a nbtest log import re -from xml.etree.ElementTree import Element, ElementTree -from os import path import string from enum import Enum - +from os import path +from xml.etree.ElementTree import Element, ElementTree startingPatt = re.compile(r"^STARTING: ([\w\.\-]+)$") -skippingPatt = re.compile(r"^SKIPPING: ([\w\.\-]+)\s*(\(([\w\.\-\ \,]+)\))?\s*$") +skippingPatt = re.compile( + r"^SKIPPING: ([\w\.\-]+)\s*(\(([\w\.\-\ \,]+)\))?\s*$" +) exitCodePatt = re.compile(r"^EXIT CODE: (\d+)$") folderPatt = re.compile(r"^FOLDER: ([\w\.\-]+)$") timePatt = re.compile(r"^real\s+([\d\.ms]+)$") @@ -37,12 +38,8 @@ def makeFailureElement(outputLines): def setFileNameAttr(attrDict, fileName): - attrDict.update(file=fileName, - classname="", - line="", - name="", - time="" - ) + attrDict.update(file=fileName, classname="", line="", name="", time="") + def setClassNameAttr(attrDict, className): attrDict["classname"] = className @@ -76,11 +73,12 @@ def parseLog(logFile, testSuiteElement): testSuiteElement.attrib["timestamp"] = "" attrDict = {} - #setFileNameAttr(attrDict, logFile) + # setFileNameAttr(attrDict, logFile) setFileNameAttr(attrDict, "nbtest") - parserStateEnum = Enum("parserStateEnum", - "newTest startingLine finishLine exitCode") + parserStateEnum = Enum( + "parserStateEnum", "newTest startingLine finishLine exitCode" + ) parserState = parserStateEnum.newTest testOutput = "" @@ -98,7 +96,9 @@ def parseLog(logFile, testSuiteElement): setTimeAttr(attrDict, "0m0s") skippedElement = makeTestCaseElement(attrDict) message = m.group(3) or "" - skippedElement.append(Element("skipped", message=message, type="")) + skippedElement.append( + Element("skipped", message=message, type="") + ) testSuiteElement.append(skippedElement) incrNumAttr(testSuiteElement, "skipped") incrNumAttr(testSuiteElement, "tests") @@ -160,4 +160,6 @@ def parseLog(logFile, testSuiteElement): testSuiteElement = Element("testsuite", name="nbtest", hostname="") parseLog(sys.argv[1], testSuiteElement) testSuitesElement.append(testSuiteElement) - ElementTree(testSuitesElement).write(sys.argv[1]+".xml", xml_declaration=True) + ElementTree(testSuitesElement).write( + sys.argv[1] + ".xml", xml_declaration=True + ) diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py index e111367d191..761bb3e7fb1 100755 --- a/cpp/scripts/sort_ninja_log.py +++ b/cpp/scripts/sort_ninja_log.py @@ -4,13 +4,12 @@ import argparse import os import re -import sys -import xml.etree.ElementTree as ET from pathlib import Path -from xml.dom import minidom parser = argparse.ArgumentParser() -parser.add_argument("log_file", type=str, default=".ninja_log", help=".ninja_log file") +parser.add_argument( + "log_file", type=str, default=".ninja_log", help=".ninja_log file" +) parser.add_argument( "--fmt", type=str, @@ -146,8 +145,8 @@ def format_file_size(input_size): def replace_placeholder_patterns(input_string: str) -> str: - pattern = r'(_h_env_placehold)[_placehold]+' - return re.sub(pattern, r'\1...', input_string) + pattern = r"(_h_env_placehold)[_placehold]+" + return re.sub(pattern, r"\1...", input_string) # adjust name for display @@ -262,7 +261,9 @@ def output_html(entries, sorted_list, cmp_entries, args): # output detail table in build-time descending order print("") - print("", "", "", sep="") + print( + "", "", "", sep="" + ) if cmp_entries: print("", sep="") print("") @@ -282,7 +283,9 @@ def output_html(entries, sorted_list, cmp_entries, args): print("", sep="", end="") print("", sep="", end="") # 
output diff column - cmp_entry = cmp_entries[name] if cmp_entries and name in cmp_entries else None + cmp_entry = ( + cmp_entries[name] if cmp_entries and name in cmp_entries else None + ) if cmp_entry: diff_time = build_time - (cmp_entry[1] - cmp_entry[0]) diff_time_str = format_build_time(diff_time) @@ -347,7 +350,9 @@ def output_csv(entries, sorted_list, cmp_entries, args): entry = entries[name] build_time = entry[1] - entry[0] file_size = entry[2] - cmp_entry = cmp_entries[name] if cmp_entries and name in cmp_entries else None + cmp_entry = ( + cmp_entries[name] if cmp_entries and name in cmp_entries else None + ) print(build_time, file_size, name, sep=",", end="") if cmp_entry: diff_time = build_time - (cmp_entry[1] - cmp_entry[0]) diff --git a/docs/cudf/source/_ext/PandasCompat.py b/docs/cudf/source/_ext/PandasCompat.py index 331495c981e..ad6df263f54 100644 --- a/docs/cudf/source/_ext/PandasCompat.py +++ b/docs/cudf/source/_ext/PandasCompat.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION +# Copyright (c) 2021-2025, NVIDIA CORPORATION # This file is adapted from official sphinx tutorial for `todo` extension: # https://www.sphinx-doc.org/en/master/development/tutorials/todo.html @@ -7,7 +7,6 @@ from typing import cast from docutils import nodes -from docutils.nodes import Element from docutils.parsers.rst import Directive from docutils.parsers.rst.directives.admonitions import BaseAdmonition from sphinx import addnodes @@ -39,7 +38,6 @@ def run(self): class PandasCompatDirective(BaseAdmonition, SphinxDirective): - # this enables content in the directive has_content = True @@ -119,18 +117,24 @@ def __init__(self, app, doctree, docname): self.builder = app.builder self.config = app.config self.env = app.env - self.domain = cast(PandasCompatDomain, app.env.get_domain("pandascompat")) + self.domain = cast( + PandasCompatDomain, app.env.get_domain("pandascompat") + ) self.document = new_document("") self.process(doctree, docname) def process(self, doctree: nodes.document, docname: str) -> None: - pandascompats = [v for vals in self.domain.pandascompats.values() for v in vals] + pandascompats = [ + v for vals in self.domain.pandascompats.values() for v in vals + ] for node in doctree.findall(PandasCompatList): if not self.config.include_pandas_compat: node.parent.remove(node) continue - content: list[Element | None] = [nodes.target()] if node.get("ids") else [] + content: list[nodes.Element | None] = ( + [nodes.target()] if node.get("ids") else [] + ) for pandascompat in pandascompats: # Create a copy of the pandascompat node @@ -149,13 +153,16 @@ def create_reference(self, pandascompat, docname): para = nodes.paragraph() newnode = nodes.reference("", "") innernode = nodes.emphasis( - get_translation_sphinx("[source]"), get_translation_sphinx("[source]") + get_translation_sphinx("[source]"), + get_translation_sphinx("[source]"), ) newnode["refdocname"] = pandascompat["docname"] try: - newnode["refuri"] = self.builder.get_relative_uri( - docname, pandascompat["docname"] - ) + "#" + pandascompat["target"]["refid"] + newnode["refuri"] = ( + self.builder.get_relative_uri(docname, pandascompat["docname"]) + + "#" + + pandascompat["target"]["refid"] + ) except NoUri: # ignore if no URI can be determined, e.g. for LaTeX output pass diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 09214803c0c..ac34c10d22f 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
+# Copyright (c) 2018-2025, NVIDIA CORPORATION. # # cudf documentation build configuration file, created by # sphinx-quickstart on Wed May 3 10:59:22 2017. @@ -36,7 +36,7 @@ from pygments.token import Text as PText from sphinx.addnodes import pending_xref from sphinx.ext import intersphinx -from sphinx.ext.autodoc import ClassDocumenter, bool_option +from sphinx.ext.autodoc import ClassDocumenter from sphinx.highlighting import lexers @@ -694,15 +694,16 @@ def add_content(self, more_content) -> None: enum_object: IntEnum = self.object if self.object.__name__ != "Kind": - self.add_line(f"See also :cpp:enum:`cudf::{self.object.__name__}`.", source_name) + self.add_line( + f"See also :cpp:enum:`cudf::{self.object.__name__}`.", + source_name, + ) self.add_line("", source_name) self.add_line("Enum members", source_name) self.add_line("", source_name) for the_member_name in enum_object.__members__: # type: ignore[attr-defined] - self.add_line( - f"* ``{the_member_name}``", source_name - ) + self.add_line(f"* ``{the_member_name}``", source_name) self.add_line("", source_name) diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index 46221b6015b..17b3bab0278 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -37,12 +37,11 @@ "source": [ "import os\n", "\n", + "import cudf\n", "import cupy as cp\n", "import dask_cudf\n", "import pandas as pd\n", "\n", - "import cudf\n", - "\n", "cp.random.seed(12)\n", "\n", "#### Portions of this were borrowed and adapted from the\n", diff --git a/docs/cudf/source/user_guide/cupy-interop.ipynb b/docs/cudf/source/user_guide/cupy-interop.ipynb index c5b1210a2c7..112f0bcfca6 100644 --- a/docs/cudf/source/user_guide/cupy-interop.ipynb +++ b/docs/cudf/source/user_guide/cupy-interop.ipynb @@ -17,13 +17,10 @@ "metadata": {}, "outputs": [], "source": [ - "import timeit\n", - "\n", + "import cudf\n", "import cupy as cp\n", "from packaging import version\n", "\n", - "import cudf\n", - "\n", "if version.parse(cp.__version__) >= version.parse(\"10.0.0\"):\n", " cupy_from_dlpack = cp.from_dlpack\n", "else:\n", diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index abfe5a1b178..07a7ec997d6 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -15,13 +15,12 @@ "metadata": {}, "outputs": [], "source": [ + "import cudf\n", "import numpy as np\n", + "from cudf.datasets import randomdata\n", "from numba import config\n", "\n", - "config.CUDA_LOW_OCCUPANCY_WARNINGS = 0\n", - "\n", - "import cudf\n", - "from cudf.datasets import randomdata" + "config.CUDA_LOW_OCCUPANCY_WARNINGS = 0" ] }, { diff --git a/docs/cudf/source/user_guide/missing-data.ipynb b/docs/cudf/source/user_guide/missing-data.ipynb index f1404ce0b77..b4f1f43ee51 100644 --- a/docs/cudf/source/user_guide/missing-data.ipynb +++ b/docs/cudf/source/user_guide/missing-data.ipynb @@ -39,9 +39,10 @@ "metadata": {}, "outputs": [], "source": [ + "import cudf\n", "import numpy as np\n", "\n", - "import cudf" + "rng = np.random.default_rng()" ] }, { @@ -1709,7 +1710,9 @@ "source": [ "import cupy as cp\n", "\n", - "dff = cudf.DataFrame(cp.random.randn(10, 3), columns=list(\"ABC\"))" + "cp_rng = cp.random.default_rng()\n", + "\n", + "dff = cudf.DataFrame(cp_rng.standard_normal((10, 3)), columns=list(\"ABC\"))" ] }, { @@ -3177,7 +3180,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = 
cudf.DataFrame(cp.random.randn(10, 2))" + "df = cudf.DataFrame(cp_rng.standard_normal((10, 2)))" ] }, { @@ -3187,7 +3190,7 @@ "metadata": {}, "outputs": [], "source": [ - "df[np.random.rand(df.shape[0]) > 0.5] = 1.5" + "df[rng.random(df.shape[0]) > 0.5] = 1.5" ] }, { diff --git a/docs/cudf/source/user_guide/performance-comparisons/performance-comparisons.ipynb b/docs/cudf/source/user_guide/performance-comparisons/performance-comparisons.ipynb index d9df99bf16a..5ba26449437 100644 --- a/docs/cudf/source/user_guide/performance-comparisons/performance-comparisons.ipynb +++ b/docs/cudf/source/user_guide/performance-comparisons/performance-comparisons.ipynb @@ -36,17 +36,13 @@ } ], "source": [ - "import os\n", - "import time\n", "import timeit\n", - "from io import BytesIO\n", "\n", + "import cudf\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "\n", - "import cudf\n", - "\n", "print(f\"{cudf.__version__=}\")" ] }, @@ -58,7 +54,7 @@ }, "outputs": [], "source": [ - "np.random.seed(0)" + "rng = np.random.default_rng(seed=0)" ] }, { @@ -199,8 +195,8 @@ "source": [ "pdf = pd.DataFrame(\n", " {\n", - " \"numbers\": np.random.randint(-1000, 1000, num_rows, dtype=\"int64\"),\n", - " \"business\": np.random.choice(\n", + " \"numbers\": rng.integers(-1000, 1000, num_rows, dtype=\"int64\"),\n", + " \"business\": rng.choice(\n", " [\"McD\", \"Buckees\", \"Walmart\", \"Costco\"], size=num_rows\n", " ),\n", " }\n", @@ -429,10 +425,10 @@ "source": [ "pdf = pd.DataFrame(\n", " {\n", - " \"numbers\": np.random.randint(\n", + " \"numbers\": rng.integers(\n", " -1000, 1000, int(sub_sample / 10), dtype=\"int64\"\n", " ),\n", - " \"business\": np.random.choice(\n", + " \"business\": rng.choice(\n", " [\"McD\", \"Buckees\", \"Walmart\", \"Costco\"], size=int(sub_sample / 10)\n", " ),\n", " }\n", @@ -583,10 +579,11 @@ }, "outputs": [], "source": [ + "import gc\n", + "\n", "# Cleaning up used memory for later benchmarks\n", "del pdf\n", "del gdf\n", - "import gc\n", "\n", "_ = gc.collect()" ] @@ -617,7 +614,7 @@ "outputs": [], "source": [ "pd_series = pd.Series(\n", - " np.random.choice(\n", + " rng.choice(\n", " [\"123\", \"56.234\", \"Walmart\", \"Costco\", \"rapids ai\"], size=num_rows\n", " )\n", ")" @@ -899,7 +896,7 @@ "source": [ "pdf_age = pd.DataFrame(\n", " {\n", - " \"age\": np.random.randint(0, 100, num_rows),\n", + " \"age\": rng.integers(0, 100, num_rows),\n", " }\n", ")\n", "pdf_age" @@ -1093,7 +1090,7 @@ ], "source": [ "pd_series = pd.Series(\n", - " np.random.choice([\"ABC\", \"abc\", \"hello world\", \"AI\"], size=num_rows),\n", + " rng.choice([\"ABC\", \"abc\", \"hello world\", \"AI\"], size=num_rows),\n", " name=\"strings\",\n", ")\n", "pd_series" @@ -1405,8 +1402,8 @@ "outputs": [], "source": [ "pdf = pd.DataFrame()\n", - "pdf[\"key\"] = np.random.randint(0, 2, num_rows)\n", - "pdf[\"val\"] = np.random.randint(0, 7, num_rows)\n", + "pdf[\"key\"] = rng.integers(0, 2, num_rows)\n", + "pdf[\"val\"] = rng.integers(0, 7, num_rows)\n", "\n", "\n", "def custom_formula_udf(df):\n", diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py index 5daa8245695..99133abc8f2 100644 --- a/docs/dask_cudf/source/conf.py +++ b/docs/dask_cudf/source/conf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. # Configuration file for the Sphinx documentation builder. 
# @@ -10,10 +10,8 @@ import datetime -from packaging.version import Version - import dask_cudf - +from packaging.version import Version DASK_CUDF_VERSION = Version(dask_cudf.__version__) diff --git a/pyproject.toml b/pyproject.toml index 0c95ea60408..c906b9998e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. [tool.mypy] ignore_missing_imports = true @@ -116,17 +116,12 @@ ignore = [ ] fixable = ["ALL"] exclude = [ - # TODO: https://github.com/rapidsai/cudf/issues/17461 - "**/*.ipynb", -] - -[tool.ruff.format] -exclude = [ - # TODO: https://github.com/rapidsai/cudf/issues/17461 - "**/*.ipynb", + "cpp/scripts/gdb-pretty-printers.py", ] [tool.ruff.lint.per-file-ignores] +# We use "== None" to demonstrate null handling in this notebook +"docs/cudf/source/user_guide/missing-data.ipynb" = ["E711"] # Lots of pytest implicitly injected attributes in conftest-patch.py "python/cudf/cudf/pandas/scripts/conftest-patch.py" = ["F821"] "python/cudf/cudf/pandas/scripts/*" = ["D"] diff --git a/python/cudf/benchmarks/common/utils.py b/python/cudf/benchmarks/common/utils.py index 363316f0930..b0643899b63 100644 --- a/python/cudf/benchmarks/common/utils.py +++ b/python/cudf/benchmarks/common/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. """Common utilities for fixture creation and benchmarking.""" @@ -101,13 +101,13 @@ def bench_columns(benchmark, df): "frame_or_index", ) assert cls in supported_classes, ( - f"cls {cls} is invalid, choose from " f"{', '.join(supported_classes)}" + f"cls {cls} is invalid, choose from {', '.join(supported_classes)}" ) if not isinstance(dtype, list): dtype = [dtype] assert all(dt in column_generators for dt in dtype), ( - f"The only supported dtypes are " f"{', '.join(column_generators)}" + f"The only supported dtypes are {', '.join(column_generators)}" ) dtype_str = "_dtype_" + "_or_".join(dtype) diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index 24ff211387c..7561bdc41b4 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. """Defines pytest fixtures for all benchmarks. 
@@ -83,9 +83,9 @@ def axis(request): for dtype, column_generator in column_generators.items(): def make_dataframe(nr, nc, column_generator=column_generator): - assert nc <= len( - string.ascii_lowercase - ), "make_dataframe only supports a maximum of 26 columns" + assert nc <= len(string.ascii_lowercase), ( + "make_dataframe only supports a maximum of 26 columns" + ) return cudf.DataFrame( { f"{string.ascii_lowercase[i]}": column_generator(nr) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 657985dab5e..a5e1e88c960 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -322,11 +322,11 @@ def get_level_values(self, level): elif is_integer(level): if level != 0: raise IndexError( - f"Cannot get level: {level} " f"for index with 1 level" + f"Cannot get level: {level} for index with 1 level" ) return self else: - raise KeyError(f"Requested level with name {level} " "not found") + raise KeyError(f"Requested level with name {level} not found") @property def names(self): diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 6f241a50a4e..9be47107b14 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1193,8 +1193,7 @@ def _concat( newsize = sum(map(len, codes)) if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( - f"Result of concat cannot have " - f"size > {SIZE_TYPE_DTYPE}_MAX" + f"Result of concat cannot have size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: codes_col = column.column_empty(0, head.codes.dtype) @@ -1443,8 +1442,7 @@ def reorder_categories( # current set of categories. if not self._categories_equal(new_categories, ordered=False): raise ValueError( - "items in new_categories are not the same as in " - "old categories" + "items in new_categories are not the same as in old categories" ) return self._set_categories(new_categories, ordered=ordered) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d798dbdac35..be0758041f4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2051,7 +2051,7 @@ def as_column( ) if cudf.get_option("default_integer_bitwidth") and dtype is None: dtype = cudf.dtype( - f'i{cudf.get_option("default_integer_bitwidth")//8}' + f"i{cudf.get_option('default_integer_bitwidth') // 8}" ) if dtype is not None: return column.astype(dtype) @@ -2600,7 +2600,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: newsize = sum(map(len, objs)) if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( - f"Result of concat cannot have " f"size > {SIZE_TYPE_DTYPE}_MAX" + f"Result of concat cannot have size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: return column_empty(0, head.dtype) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 8b28c372d2f..3649e9830de 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1087,9 +1087,7 @@ def __repr__(self): pa.timestamp(self.dtype.unit, str(self.dtype.tz)) ) return ( - f"{object.__repr__(self)}\n" - f"{arr.to_string()}\n" - f"dtype: {self.dtype}" + f"{object.__repr__(self)}\n{arr.to_string()}\ndtype: {self.dtype}" ) def tz_localize(self, tz: str | None, ambiguous="NaT", nonexistent="NaT"): diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 
2b834a20726..361da7d3be3 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -714,7 +714,7 @@ def take(self, lists_indices: ColumnLike) -> ParentType: raise ValueError("lists_indices should be list type array.") if not lists_indices_col.size == self._column.size: raise ValueError( - "lists_indices and list column is of different " "size." + "lists_indices and list column is of different size." ) if ( not _is_non_decimal_numeric_dtype( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 54b42b1f6de..1c82fa28d4e 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2647,8 +2647,7 @@ def split( if expand not in (True, False): raise ValueError( - f"expand parameter accepts only : [True, False], " - f"got {expand}" + f"expand parameter accepts only : [True, False], got {expand}" ) # Pandas treats 0 as all @@ -2828,8 +2827,7 @@ def rsplit( if expand not in (True, False): raise ValueError( - f"expand parameter accepts only : [True, False], " - f"got {expand}" + f"expand parameter accepts only : [True, False], got {expand}" ) # Pandas treats 0 as all diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index aaf7d071dff..5fd2e8d891f 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -703,7 +703,7 @@ def rename_column(x): level = 0 if level != 0: raise IndexError( - f"Too many levels: Index has only 1 level, not {level+1}" + f"Too many levels: Index has only 1 level, not {level + 1}" ) if isinstance(mapper, Mapping): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 97ac1708f8e..bbd20aa05fc 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3804,8 +3804,7 @@ def agg(self, aggs, axis=None): elif isinstance(aggs, str): if not hasattr(self, aggs): raise AttributeError( - f"{aggs} is not a valid function for " - f"'DataFrame' object" + f"{aggs} is not a valid function for 'DataFrame' object" ) result = DataFrame() result[aggs] = getattr(self, aggs)() @@ -6650,9 +6649,9 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return DataFrame() with warnings.catch_warnings(): - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, ( + "Need to drop after pandas-3.0 support is added." + ) warnings.simplefilter("ignore", FutureWarning) df = cudf.concat(mode_results, axis=1) @@ -7707,9 +7706,9 @@ def pct_change( if fill_method not in (no_default, None) or limit is not no_default: # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, ( + "Need to drop after pandas-3.0 support is added." + ) warnings.warn( "The 'fill_method' and 'limit' keywords in " f"{type(self).__name__}.pct_change are deprecated and will be " diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index a798041699e..cc9f39d70ef 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
from __future__ import annotations import enum @@ -419,8 +419,7 @@ def _get_validity_buffer( elif null == _MaskKind.NAN: raise RuntimeError( - "This column uses NaN as null " - "so does not have a separate mask" + "This column uses NaN as null so does not have a separate mask" ) elif null == _MaskKind.NON_NULLABLE: raise RuntimeError( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d8373541e2a..08f8e49a98c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -778,9 +778,9 @@ def fillna( if method: # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, ( + "Need to drop after pandas-3.0 support is added." + ) warnings.warn( f"{type(self).__name__}.fillna with 'method' is " "deprecated and will raise in a future version. " diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 081cbce2098..9624d33bf62 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2915,15 +2915,13 @@ def pct_change( if freq is not None: raise NotImplementedError("freq parameter not supported yet.") elif fill_method not in {no_default, None, "ffill", "bfill"}: - raise ValueError( - "fill_method must be one of 'ffill', or" "'bfill'." - ) + raise ValueError("fill_method must be one of 'ffill', or'bfill'.") if fill_method not in (no_default, None) or limit is not no_default: # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, ( + "Need to drop after pandas-3.0 support is added." + ) warnings.warn( "The 'fill_method' keyword being not None and the 'limit' " f"keywords in {type(self).__name__}.pct_change are " diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index c13d62b39df..b439fcafb89 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -261,9 +261,9 @@ def searchsorted( ascending: bool = True, na_position: Literal["first", "last"] = "last", ): - assert (len(self) <= 1) or ( - ascending == (self.step > 0) - ), "Invalid ascending flag" + assert (len(self) <= 1) or (ascending == (self.step > 0)), ( + "Invalid ascending flag" + ) return search_range(value, self._range, side=side) def factorize( @@ -1217,9 +1217,9 @@ def _concat(cls, objs): non_empties = [index for index in objs if len(index)] if len(objs) != len(non_empties): # Do not remove until pandas-3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, ( + "Need to drop after pandas-3.0 support is added." + ) warning_msg = ( "The behavior of array concatenation with empty entries is " "deprecated. In a future version, this will no longer exclude " diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index ce6a5c960dd..c8e9b9be69b 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -152,13 +152,12 @@ def destructure_dataframe_iloc_indexer( ) except TypeError: raise TypeError( - "Column indices must be integers, slices, " - "or list-like of integers" + "Column indices must be integers, slices, or list-like of integers" ) if scalar: - assert ( - len(column_names) == 1 - ), "Scalar column indexer should not produce more than one column" + assert len(column_names) == 1, ( + "Scalar column indexer should not produce more than one column" + ) return rows, (scalar, column_names) diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py index 7bbb299d643..9cf845eadc4 100644 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. import inspect @@ -224,9 +224,9 @@ def __init_subclass__(cls): valid_operations |= getattr(base_cls, validity_attr, set()) invalid_operations = valid_operations - supported_operations - assert ( - len(invalid_operations) == 0 - ), f"Invalid requested operations: {invalid_operations}" + assert len(invalid_operations) == 0, ( + f"Invalid requested operations: {invalid_operations}" + ) base_operation = getattr(cls, base_operation_name) for operation in valid_operations: diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index eedd777aafe..36cbb196ec0 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -378,9 +378,9 @@ def concat( any_empty = any(obj.empty for obj in objs) if any_empty: # Do not remove until pandas-3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, ( + "Need to drop after pandas-3.0 support is added." + ) warnings.warn( "The behavior of array concatenation with empty entries is " "deprecated. In a future version, this will no longer exclude " diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 19b13a8e97d..df825a8eaef 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -466,10 +466,7 @@ def __neg__(self): def __repr__(self) -> str: # str() fixes a numpy bug with NaT # https://github.com/numpy/numpy/issues/17552 - return ( - f"{self.__class__.__name__}" - f"({self.value!s}, dtype={self.dtype})" - ) + return f"{self.__class__.__name__}({self.value!s}, dtype={self.dtype})" def _binop_result_dtype_or_error(self, other, op): if op in {"__eq__", "__ne__", "__lt__", "__gt__", "__le__", "__ge__"}: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 60e0cd38483..6a50d5da523 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -230,8 +230,7 @@ def __setitem__(self, key, value): or (isinstance(value, cudf.Scalar) and value.dtype.kind == "b") ): raise MixedTypeError( - f"Cannot assign {value=} to " - f"bool dtype={self._frame.dtype}" + f"Cannot assign {value=} to bool dtype={self._frame.dtype}" ) elif not ( isinstance(value, (list, dict)) @@ -254,9 +253,9 @@ def __setitem__(self, key, value): value = value.astype(to_dtype) if to_dtype != self._frame.dtype: # Do not remove until pandas-3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, ( + "Need to drop after pandas-3.0 support is added." 
+ ) warnings.warn( f"Setting an item of incompatible dtype is deprecated " "and will raise in a future error of pandas. " @@ -365,16 +364,16 @@ def _loc_to_iloc(self, arg): # TODO: switch to cudf.utils.dtypes.is_integer(arg) if isinstance(arg, cudf.Scalar) and arg.dtype.kind in "iu": # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, ( + "Need to drop after pandas-3.0 support is added." + ) warnings.warn(warn_msg, FutureWarning) return arg.value elif is_integer(arg): # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, ( + "Need to drop after pandas-3.0 support is added." + ) warnings.warn(warn_msg, FutureWarning) return arg try: @@ -1156,8 +1155,7 @@ def reset_index( ): if not drop and inplace: raise TypeError( - "Cannot reset_index inplace on a Series " - "to create a DataFrame" + "Cannot reset_index inplace on a Series to create a DataFrame" ) data, index = self._reset_index( level=level, drop=drop, allow_duplicates=allow_duplicates @@ -1361,8 +1359,7 @@ def map(self, arg, na_action=None) -> "Series": elif isinstance(arg, cudf.Series): if not arg.index.is_unique: raise ValueError( - "Reindexing only valid with" - " uniquely valued Index objects" + "Reindexing only valid with uniquely valued Index objects" ) lhs = cudf.DataFrame( {"x": self, "orig_order": as_column(range(len(self)))} @@ -3362,7 +3359,7 @@ def describe( if percentiles is not None: if not all(0 <= x <= 1 for x in percentiles): raise ValueError( - "All percentiles must be between 0 and 1, " "inclusive." + "All percentiles must be between 0 and 1, inclusive." ) # describe always includes 50th percentile @@ -3773,9 +3770,9 @@ def pct_change( ) if fill_method not in (no_default, None) or limit is not no_default: # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, ( + "Need to drop after pandas-3.0 support is added." + ) warnings.warn( "The 'fill_method' and 'limit' keywords in " f"{type(self).__name__}.pct_change are deprecated and will be " diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index 479838ef2a8..50d1a11c39b 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -187,8 +187,7 @@ def __call__( if padding != "max_length": error_msg = ( - "Only padding to the provided max_length" - "is currently supported" + "Only padding to the provided max_lengthis currently supported" ) raise NotImplementedError(error_msg) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 16c7d189dfd..8957ea04fd8 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -31,7 +31,7 @@ def _get_cudf_schema_element_from_dtype( dtype = cudf.dtype(dtype) if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( - "CategoricalDtype as dtype is not yet " "supported in JSON reader" + "CategoricalDtype as dtype is not yet supported in JSON reader" ) lib_type = dtype_to_pylibcudf_type(dtype) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index feb6e12da8c..a7c7136ad4c 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations import io @@ -951,7 +951,7 @@ def _normalize_filters(filters: list | None) -> list[list[tuple]] | None: def _validate_predicate(item): if not isinstance(item, tuple) or len(item) != 3: raise TypeError( - f"Predicate must be Tuple[str, str, Any], " f"got {predicate}." + f"Predicate must be Tuple[str, str, Any], got {predicate}." ) filters = filters if isinstance(filters[0], list) else [filters] diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 79a3a794af3..39d5d36fbcc 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. from __future__ import annotations import os @@ -173,9 +173,7 @@ def _integer_validator(val): int(val) return True except ValueError: - raise ValueError( - f"{val} is not a valid option. " f"Must be an integer." - ) + raise ValueError(f"{val} is not a valid option. Must be an integer.") def _integer_and_none_validator(val): @@ -184,7 +182,7 @@ def _integer_and_none_validator(val): return except ValueError: raise ValueError( - f"{val} is not a valid option. " f"Must be an integer or None." + f"{val} is not a valid option. Must be an integer or None." ) diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 41b9188f036..92b15c51f96 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import operator import warnings @@ -287,8 +287,7 @@ def test_binary_ufunc_series_array( and has_nulls ), reason=( - "cudf and pandas incompatible casting nans " - "to nulls in binops" + "cudf and pandas incompatible casting nans to nulls in binops" ), ) ) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index e18112d03ea..f35bad4a0de 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
import codecs import gzip @@ -2165,9 +2165,11 @@ def test_default_integer_bitwidth( cudf_mixed_dataframe.to_csv(buf) buf.seek(0) read = cudf.read_csv(buf) - assert read["Integer"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") + assert read["Integer"].dtype == np.dtype( + f"i{default_integer_bitwidth // 8}" + ) assert read["Integer2"].dtype == np.dtype( - f"i{default_integer_bitwidth//8}" + f"i{default_integer_bitwidth // 8}" ) @@ -2182,7 +2184,7 @@ def test_default_integer_bitwidth_partial( read = cudf.read_csv(buf, dtype={"Integer": "int64"}) assert read["Integer"].dtype == np.dtype("i8") assert read["Integer2"].dtype == np.dtype( - f"i{default_integer_bitwidth//8}" + f"i{default_integer_bitwidth // 8}" ) @@ -2197,9 +2199,11 @@ def test_default_integer_bitwidth_extremes( buf.seek(0) read = cudf.read_csv(buf) - assert read["int64"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") - assert read["long"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") - assert read["uint64"].dtype == np.dtype(f"u{default_integer_bitwidth//8}") + assert read["int64"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}") + assert read["long"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}") + assert read["uint64"].dtype == np.dtype( + f"u{default_integer_bitwidth // 8}" + ) def test_default_float_bitwidth(cudf_mixed_dataframe, default_float_bitwidth): @@ -2209,7 +2213,7 @@ def test_default_float_bitwidth(cudf_mixed_dataframe, default_float_bitwidth): cudf_mixed_dataframe.to_csv(buf) buf.seek(0) read = cudf.read_csv(buf) - assert read["Float"].dtype == np.dtype(f"f{default_float_bitwidth//8}") + assert read["Float"].dtype == np.dtype(f"f{default_float_bitwidth // 8}") def test_default_float_bitwidth_partial(default_float_bitwidth): @@ -2219,7 +2223,7 @@ def test_default_float_bitwidth_partial(default_float_bitwidth): StringIO("float1,float2\n1.0,2.0\n3.0,4.0"), dtype={"float2": "float64"}, ) - assert read["float1"].dtype == np.dtype(f"f{default_float_bitwidth//8}") + assert read["float1"].dtype == np.dtype(f"f{default_float_bitwidth // 8}") assert read["float2"].dtype == np.dtype("f8") diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index dcde0dab83d..18067d4cf20 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import types from contextlib import ExitStack as does_not_raise @@ -84,8 +84,7 @@ def test_cuda_array_interface_interop_out_masked(dtype, module): expectation = does_not_raise() if module == "cupy": pytest.skip( - "cupy doesn't support version 1 of " - "`__cuda_array_interface__` yet" + "cupy doesn't support version 1 of `__cuda_array_interface__` yet" ) module_constructor = cupy.asarray diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index db34329261f..974510b593b 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
import copy import gzip @@ -94,8 +94,7 @@ def json_files(request, tmp_path_factory, pdf): index, compression, orient = request.param if index is False and orient not in ("split", "table"): pytest.skip( - "'index=False' is only valid when 'orient' is 'split' or " - "'table'" + "'index=False' is only valid when 'orient' is 'split' or 'table'" ) if index is False and orient == "table": pytest.skip("'index=False' isn't valid when 'orient' is 'table'") @@ -718,7 +717,7 @@ def test_default_integer_bitwidth(default_integer_bitwidth, engine): buf.seek(0) df = cudf.read_json(buf, engine=engine, lines=True, orient="records") - assert df["a"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") + assert df["a"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}") @pytest.mark.filterwarnings("ignore:Using CPU") @@ -739,7 +738,7 @@ def test_default_integer_bitwidth_partial(default_integer_bitwidth, engine): buf, engine=engine, lines=True, orient="records", dtype={"b": "i8"} ) - assert df["a"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") + assert df["a"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}") assert df["b"].dtype == np.dtype("i8") @@ -753,8 +752,8 @@ def test_default_integer_bitwidth_extremes(default_integer_bitwidth, engine): ) df = cudf.read_json(buf, engine=engine, lines=True, orient="records") - assert df["u8"].dtype == np.dtype(f"u{default_integer_bitwidth//8}") - assert df["i8"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") + assert df["u8"].dtype == np.dtype(f"u{default_integer_bitwidth // 8}") + assert df["i8"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}") def test_default_float_bitwidth(default_float_bitwidth): @@ -765,8 +764,8 @@ def test_default_float_bitwidth(default_float_bitwidth): lines=True, orient="records", ) - assert df["a"].dtype == np.dtype(f"f{default_float_bitwidth//8}") - assert df["b"].dtype == np.dtype(f"f{default_float_bitwidth//8}") + assert df["a"].dtype == np.dtype(f"f{default_float_bitwidth // 8}") + assert df["b"].dtype == np.dtype(f"f{default_float_bitwidth // 8}") def test_json_nested_basic(): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 9ff2a6f0ed7..39a47ee4ccd 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2077,9 +2077,9 @@ def test_parquet_writer_chunked_max_file_size( for each_file in all_files: # Validate file sizes with some extra 1000 # bytes buffer to spare - assert os.path.getsize(each_file) <= ( - max_file_size_in_bytes - ), "File exceeded max_file_size" + assert os.path.getsize(each_file) <= (max_file_size_in_bytes), ( + "File exceeded max_file_size" + ) def test_parquet_writer_chunked_max_file_size_error(): diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index ba2bd040c38..1e120cfb293 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -433,13 +433,13 @@ def test_scalar_cache_rmm_hook(): def test_default_integer_bitwidth_scalar(default_integer_bitwidth): # Test that integer scalars are default to 32 bits under user options. slr = cudf.Scalar(128) - assert slr.dtype == np.dtype(f"i{default_integer_bitwidth//8}") + assert slr.dtype == np.dtype(f"i{default_integer_bitwidth // 8}") def test_default_float_bitwidth_scalar(default_float_bitwidth): # Test that float scalars are default to 32 bits under user options. 
slr = cudf.Scalar(128.0) - assert slr.dtype == np.dtype(f"f{default_float_bitwidth//8}") + assert slr.dtype == np.dtype(f"f{default_float_bitwidth // 8}") def test_scalar_numpy_casting(): diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index d1623024e45..45910c17f95 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2094,13 +2094,13 @@ def test_default_construction(): ) def test_default_integer_bitwidth_construction(default_integer_bitwidth, data): s = cudf.Series(data) - assert s.dtype == np.dtype(f"i{default_integer_bitwidth//8}") + assert s.dtype == np.dtype(f"i{default_integer_bitwidth // 8}") @pytest.mark.parametrize("data", [[1.5, 2.5, 4.5], [1000, 2000, 4000, 3.14]]) def test_default_float_bitwidth_construction(default_float_bitwidth, data): s = cudf.Series(data) - assert s.dtype == np.dtype(f"f{default_float_bitwidth//8}") + assert s.dtype == np.dtype(f"f{default_float_bitwidth // 8}") def test_series_ordered_dedup(): diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index b714beb0069..bbd01eaa311 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import itertools import operator @@ -124,8 +124,7 @@ def test_scalar_no_negative_bools(): with pytest.raises( TypeError, match=re.escape( - "Boolean scalars in cuDF do not " - "support negation, use logical not" + "Boolean scalars in cuDF do not support negation, use logical not" ), ): -x diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 9e932acb5fa..14b2fa42d16 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -207,8 +207,7 @@ def to_cudf_compatible_scalar(val, dtype=None): if not cudf.api.types._is_scalar_or_zero_d_array(val): raise ValueError( - f"Cannot convert value of type {type(val).__name__} " - "to cudf scalar" + f"Cannot convert value of type {type(val).__name__} to cudf scalar" ) if isinstance(val, Decimal): @@ -588,11 +587,11 @@ def _maybe_convert_to_default_type(dtype: DtypeObj) -> DtypeObj: """ if ib := cudf.get_option("default_integer_bitwidth"): if dtype.kind == "i": - return cudf.dtype(f"i{ib//8}") + return cudf.dtype(f"i{ib // 8}") elif dtype.kind == "u": - return cudf.dtype(f"u{ib//8}") + return cudf.dtype(f"u{ib // 8}") if (fb := cudf.get_option("default_float_bitwidth")) and dtype.kind == "f": - return cudf.dtype(f"f{fb//8}") + return cudf.dtype(f"f{fb // 8}") return dtype diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index 896a3809c67..dfcad56a339 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. # This function is from the rapidsai/clx repo at below link # https://github.com/rapidsai/clx/blob/267c6d30805c9dcbf80840f222bf31c5c4b7068a/python/clx/analytics/_perfect_hash.py import numpy as np @@ -290,8 +290,8 @@ def hash_vocab( inner_table_coeffs, offsets_into_ht, ) - assert ( - val == value - ), f"Incorrect value found. Got {val} expected {value}" + assert val == value, ( + f"Incorrect value found. 
Got {val} expected {value}" + ) print("All present tokens return correct value.") diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index a04fcb8df7a..e2e60ea1bf0 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations import datetime @@ -1912,9 +1912,9 @@ def get_reader_filepath_or_buffer( filepaths_or_buffers = input_sources if warn_on_raw_text_input: # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." + assert PANDAS_LT_300, ( + "Need to drop after pandas-3.0 support is added." + ) warnings.warn( f"Passing literal {warn_meta[0]} to {warn_meta[1]} is " "deprecated and will be removed in a future version. " diff --git a/python/cudf/cudf/utils/performance_tracking.py b/python/cudf/cudf/utils/performance_tracking.py index 30c891d0d5a..a8bba7d3e3f 100644 --- a/python/cudf/cudf/utils/performance_tracking.py +++ b/python/cudf/cudf/utils/performance_tracking.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -58,9 +58,9 @@ def wrapper(*args, **kwargs): ) -def get_memory_records() -> ( - dict[str, rmm.statistics.ProfilerRecords.MemoryRecord] -): +def get_memory_records() -> dict[ + str, rmm.statistics.ProfilerRecords.MemoryRecord +]: """Get the memory records from the memory profiling Returns diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 4e3d32c8ed0..c20b0e62d35 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations import ast @@ -220,8 +220,7 @@ def query_execute(df, expr, callenv): # wait to check the types until we know which cols are used if any(col.dtype not in SUPPORTED_QUERY_TYPES for col in colarrays): raise TypeError( - "query only supports numeric, datetime, timedelta, " - "or bool dtypes." + "query only supports numeric, datetime, timedelta, or bool dtypes." ) colarrays = [col.data_array_view(mode="read") for col in colarrays] diff --git a/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb b/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb index 94904fd83d4..1eeb9806449 100644 --- a/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb +++ b/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb @@ -24,8 +24,10 @@ "num_columns = 12\n", "\n", "# Create a DataFrame with random data\n", - "df = pd.DataFrame(rng.integers(0, 100, size=(num_rows, num_columns)),\n", - " columns=[f'Column_{i}' for i in range(1, num_columns + 1)])" + "df = pd.DataFrame(\n", + " rng.integers(0, 100, size=(num_rows, num_columns)),\n", + " columns=[f\"Column_{i}\" for i in range(1, num_columns + 1)],\n", + ")" ] }, { diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index d494e157a18..1528ed2973b 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. 
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
@@ -1707,9 +1707,9 @@ def test_notebook_slow_repr():
         "tbody",
     }:
-        assert (
-            string in html_result
-        ), f"Expected string {string} not found in the output"
+        assert string in html_result, (
+            f"Expected string {string} not found in the output"
+        )
 
 
 def test_numpy_ndarray_isinstancecheck(array):
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
index d986f150b2e..3c8a97520ad 100644
--- a/python/cudf_polars/cudf_polars/testing/asserts.py
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """Device-aware assertions."""
@@ -135,9 +135,9 @@ def assert_ir_translation_raises(q: pl.LazyFrame, *exceptions: type[Exception])
         translator.translate_ir()
     if errors := translator.errors:
         for err in errors:
-            assert any(
-                isinstance(err, err_type) for err_type in exceptions
-            ), f"Translation DID NOT RAISE {exceptions}"
+            assert any(isinstance(err, err_type) for err_type in exceptions), (
+                f"Translation DID NOT RAISE {exceptions}"
+            )
         return
     else:
         raise AssertionError(f"Translation DID NOT RAISE {exceptions}")
diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py
index 58c94713d09..787022d2e83 100644
--- a/python/pylibcudf/pylibcudf/tests/common/utils.py
+++ b/python/pylibcudf/pylibcudf/tests/common/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 from __future__ import annotations
 
 import io
@@ -195,9 +195,9 @@ def assert_table_and_meta_eq(
     plc_table = plc_table_w_meta.tbl
 
     plc_shape = (plc_table.num_rows(), plc_table.num_columns())
-    assert (
-        plc_shape == pa_table.shape
-    ), f"{plc_shape} is not equal to {pa_table.shape}"
+    assert plc_shape == pa_table.shape, (
+        f"{plc_shape} is not equal to {pa_table.shape}"
+    )
 
     if not check_types_if_empty and plc_table.num_rows() == 0:
         return
@@ -207,9 +207,9 @@
 
     # Check column name equality
     if check_names:
-        assert (
-            plc_table_w_meta.column_names() == pa_table.column_names
-        ), f"{plc_table_w_meta.column_names()} != {pa_table.column_names}"
+        assert plc_table_w_meta.column_names() == pa_table.column_names, (
+            f"{plc_table_w_meta.column_names()} != {pa_table.column_names}"
+        )
 
 
 def cudf_raises(expected_exception: BaseException, *args, **kwargs):

From 52cb4e8d71f3c5becc131c89d92ccd17bd8f4952 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Tue, 28 Jan 2025 08:50:47 +0000
Subject: [PATCH 24/35] Introduce some more rolling window benchmarks (#17787)

Before embarking on more rolling window performance optimizations and
code changes, let's introduce some new benchmarks:

- demonstrating the bad algorithmic behavior of large-window rolling
  aggregations;
- exercising the range-based rolling interface (a rough pandas analogue
  of row-based versus range-based windows is sketched below).
Authors:
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/17787
---
 cpp/benchmarks/CMakeLists.txt                 |   5 +-
 .../rolling/grouped_range_rolling_sum.cu      | 135 ++++++++++++++++++
 .../rolling/grouped_rolling_sum.cpp           |   9 ++
 cpp/benchmarks/rolling/range_rolling_sum.cu   | 122 ++++++++++++++++
 cpp/benchmarks/rolling/rolling_sum.cpp        |  14 +-
 5 files changed, 281 insertions(+), 4 deletions(-)
 create mode 100644 cpp/benchmarks/rolling/grouped_range_rolling_sum.cu
 create mode 100644 cpp/benchmarks/rolling/range_rolling_sum.cu

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 0ff712c1c77..03f11cc957b 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -428,7 +428,10 @@ ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
 # ##################################################################################################
 # * rolling benchmark
 # ---------------------------------------------------------------------------------
-ConfigureNVBench(ROLLING_NVBENCH rolling/grouped_rolling_sum.cpp rolling/rolling_sum.cpp)
+ConfigureNVBench(
+  ROLLING_NVBENCH rolling/grouped_range_rolling_sum.cu rolling/grouped_rolling_sum.cpp
+  rolling/range_rolling_sum.cu rolling/rolling_sum.cpp
+)
 
 add_custom_target(
   run_benchmarks
diff --git a/cpp/benchmarks/rolling/grouped_range_rolling_sum.cu b/cpp/benchmarks/rolling/grouped_range_rolling_sum.cu
new file mode 100644
index 00000000000..9f89861c528
--- /dev/null
+++ b/cpp/benchmarks/rolling/grouped_range_rolling_sum.cu
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+void bench_grouped_range_rolling_sum(nvbench::state& state)
+{
+  auto const num_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  // Configurable parameter is window range.
+  // Since orderby column is approximately equally spaced at unit
+  // intervals, this approximately controls the number of entries in
+  // the window.
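+  // (Illustrative arithmetic, an editorial assumption rather than an
+  // upstream comment: the orderby values built below sit ~1000 units
+  // apart and the ranges are scaled by 1000, so preceding_range=100
+  // spans 100'000 units, i.e. roughly 100 rows on each side.)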
+ auto const preceding_range = cudf::numeric_scalar{
+ static_cast(state.get_int64("preceding_range") * 1000), true};
+ auto const following_range = cudf::numeric_scalar{
+ static_cast(state.get_int64("following_range") * 1000), true};
+ auto const has_nulls = static_cast(state.get_int64("has_nulls"));
+
+ auto vals = [&] {
+ data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+ cudf::type_to_id(), distribution_id::UNIFORM, 0, 100);
+ return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile);
+ }();
+ auto const keys = [&] {
+ data_profile const profile =
+ data_profile_builder()
+ .cardinality(cardinality)
+ .no_validity()
+ .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows);
+ auto keys =
+ create_random_column(cudf::type_to_id(), row_count{num_rows}, profile);
+ return cudf::sort(cudf::table_view{{keys->view()}});
+ }();
+ auto orderby = [&] {
+ auto seq =
+ cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, num_rows);
+ // Equally spaced rows separated by 1000 unit intervals
+ thrust::tabulate(
+ rmm::exec_policy(cudf::get_default_stream()),
+ seq->mutable_view().begin(),
+ seq->mutable_view().end(),
+ [] __device__(cudf::size_type i) { return static_cast(i) * 1000; });
+ // Add some units of noise
+ data_profile profile = data_profile_builder().cardinality(0).distribution(
+ cudf::type_to_id(), distribution_id::NORMAL, -2000, 2000);
+ profile.set_null_probability(has_nulls ? std::optional{400.0 / num_rows}
+ : std::nullopt);
+ auto noise =
+ create_random_column(cudf::type_to_id(), row_count{num_rows}, profile);
+ auto result =
+ cudf::binary_operation(seq->view(), noise->view(), cudf::binary_operator::ADD, seq->type());
+ auto columns = cudf::sort_by_key(cudf::table_view{{result->view()}},
+ cudf::table_view{{keys->get_column(0).view(), result->view()}},
+ {cudf::order::ASCENDING, cudf::order::ASCENDING},
+ {cudf::null_order::AFTER, cudf::null_order::AFTER})
+ ->release();
+ return std::move(columns[0]);
+ }();
+
+ auto req = cudf::make_sum_aggregation();
+
+ auto const mem_stats_logger = cudf::memory_stats_logger();
+ state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+ state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+ auto const result =
+ cudf::grouped_range_rolling_window(keys->view(),
+ orderby->view(),
+ cudf::order::ASCENDING,
+ vals->view(),
+ cudf::range_window_bounds::get(preceding_range),
+ cudf::range_window_bounds::get(following_range),
+ 1,
+ *req);
+ });
+ auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+ state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+ state.add_buffer_size(
+ mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH(bench_grouped_range_rolling_sum)
+ .set_name("range_grouped_rolling_sum")
+ .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
+ .add_int64_axis("preceding_range", {100})
+ .add_int64_axis("following_range", {100})
+ .add_int64_axis("has_nulls", {0, 1})
+ .add_int64_axis("cardinality", {10, 100, 1'000'000, 100'000'000});
+
+NVBENCH_BENCH(bench_grouped_range_rolling_sum)
+ .set_name("range_grouped_rolling_sum_large_windows")
+ .add_int64_power_of_two_axis("num_rows", {28})
+ .add_int64_axis("preceding_range", {10'000, 40'000})
+ .add_int64_axis("following_range", {0})
+ .add_int64_axis("has_nulls", {0, 1})
+ .add_int64_axis("cardinality", {100}); diff --git 
a/cpp/benchmarks/rolling/grouped_rolling_sum.cpp b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp index 04afe5ac661..a92443c88ec 100644 --- a/cpp/benchmarks/rolling/grouped_rolling_sum.cpp +++ b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp @@ -68,3 +68,12 @@ NVBENCH_BENCH_TYPES(bench_row_grouped_rolling_sum, .add_int64_axis("following_size", {2}) .add_int64_axis("min_periods", {1}) .add_int64_axis("cardinality", {10, 100, 1'000'000, 100'000'000}); + +NVBENCH_BENCH_TYPES(bench_row_grouped_rolling_sum, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("row_grouped_rolling_sum_large_windows") + .add_int64_power_of_two_axis("num_rows", {28}) + .add_int64_axis("preceding_size", {10'000, 40'000}) + .add_int64_axis("following_size", {0}) + .add_int64_axis("min_periods", {1}) + .add_int64_axis("cardinality", {10, 100}); diff --git a/cpp/benchmarks/rolling/range_rolling_sum.cu b/cpp/benchmarks/rolling/range_rolling_sum.cu new file mode 100644 index 00000000000..7299f1f616b --- /dev/null +++ b/cpp/benchmarks/rolling/range_rolling_sum.cu @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include + +void bench_range_rolling_sum(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + // Configurable parameter is window range in seconds. + // Since orderby column is approximately equally spaced at 1s + // intervals, this approximately controls the number of entries in + // the window. + auto const preceding_range = cudf::duration_scalar{ + cudf::duration_ms{state.get_int64("preceding_range") * 1000}, true}; + auto const following_range = cudf::duration_scalar{ + cudf::duration_ms{state.get_int64("following_range") * 1000}, true}; + auto const has_nulls = static_cast(state.get_int64("has_nulls")); + + auto vals = [&] { + data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + }(); + + auto orderby = [&] { + auto seq = cudf::make_timestamp_column(cudf::data_type{cudf::type_to_id()}, + num_rows); + // Equally spaced rows separated by 1s + thrust::tabulate( + rmm::exec_policy(cudf::get_default_stream()), + seq->mutable_view().begin(), + seq->mutable_view().end(), + [] __device__(cudf::size_type i) { + return cudf::timestamp_ms{cudf::duration_ms{static_cast(i) * 1000}}; + }); + // Add some milliseconds of noise + data_profile profile = data_profile_builder().cardinality(0).distribution( + cudf::type_to_id(), distribution_id::NORMAL, -2000, 2000); + profile.set_null_probability(has_nulls ? 
std::optional{400.0 / num_rows} + : std::nullopt); + auto noise = + create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + auto result = + cudf::binary_operation(seq->view(), noise->view(), cudf::binary_operator::ADD, seq->type()); + auto columns = + cudf::sort( + cudf::table_view{{result->view()}}, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}) + ->release(); + return std::move(columns[0]); + }(); + + auto req = cudf::make_sum_aggregation(); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto const result = + cudf::grouped_range_rolling_window(cudf::table_view{}, + orderby->view(), + cudf::order::ASCENDING, + vals->view(), + cudf::range_window_bounds::get(preceding_range), + cudf::range_window_bounds::get(following_range), + 1, + *req); + }); + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +NVBENCH_BENCH(bench_range_rolling_sum) + .set_name("range_rolling_sum") + .add_int64_power_of_two_axis("num_rows", {14, 22, 28}) + .add_int64_axis("preceding_range", {100}) + .add_int64_axis("following_range", {100}) + .add_int64_axis("has_nulls", {0, 1}); + +NVBENCH_BENCH(bench_range_rolling_sum) + .set_name("range_rolling_sum_large_windows") + .add_int64_power_of_two_axis("num_rows", {28}) + .add_int64_axis("preceding_range", {10'000, 40'000}) + .add_int64_axis("following_range", {0}) + .add_int64_axis("has_nulls", {0, 1}); diff --git a/cpp/benchmarks/rolling/rolling_sum.cpp b/cpp/benchmarks/rolling/rolling_sum.cpp index af9ecd6a26f..a1d084a28ef 100644 --- a/cpp/benchmarks/rolling/rolling_sum.cpp +++ b/cpp/benchmarks/rolling/rolling_sum.cpp @@ -66,13 +66,13 @@ void bench_row_variable_rolling_sum(nvbench::state& state, nvbench::type_list(state.get_int64("preceding_size")); auto const following_size = static_cast(state.get_int64("following_size")); - auto vals = [&]() { + auto vals = [&] { data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); }(); - auto preceding = [&]() { + auto preceding = [&] { auto data = std::vector(num_rows); auto it = thrust::make_counting_iterator(0); std::transform(it, it + num_rows, data.begin(), [num_rows, preceding_size](auto i) { @@ -88,7 +88,7 @@ void bench_row_variable_rolling_sum(nvbench::state& state, nvbench::type_list(num_rows); auto it = thrust::make_counting_iterator(0); std::transform(it, it + num_rows, data.begin(), [num_rows, following_size](auto i) { @@ -132,3 +132,11 @@ NVBENCH_BENCH_TYPES(bench_row_variable_rolling_sum, .add_int64_power_of_two_axis("num_rows", {14, 22, 28}) .add_int64_axis("preceding_size", {10, 100}) .add_int64_axis("following_size", {2}); + +NVBENCH_BENCH_TYPES(bench_row_fixed_rolling_sum, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("row_fixed_rolling_sum_large_windows") + .add_int64_power_of_two_axis("num_rows", {28}) + .add_int64_axis("preceding_size", {10'000, 40'000}) + .add_int64_axis("following_size", {0}) + .add_int64_axis("min_periods", {1}); From fa20521c6f067f587944954d4587d254cab6ffe9 Mon Sep 17 00:00:00 
2001 From: GALI PREM SAGAR Date: Tue, 28 Jan 2025 03:45:50 -0600 Subject: [PATCH 25/35] Increase timeout for recently added test (#17829) There is a timeout failure in nightly tests: https://github.com/rapidsai/cudf/actions/runs/12983287834/job/36204344253 It looks like CI runs can get very slow at times, hence bumping up the timeout. This test basically guards against a hang, so a 20s timeout should be fine too. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17829 --- .../cudf_pandas_tests/test_disable_pandas_accelerator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py b/python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py index c7af6cc5ebf..738f3c05555 100644 --- a/python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py +++ b/python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py @@ -11,7 +11,7 @@ def test_disable_pandas_accelerator_multi_threaded(): # Create a copy of the current environment variables env = os.environ.copy() - with utils.cudf_timeout(10): + with utils.cudf_timeout(20): sp_completed = subprocess.run( [ "python", From f0a3dfe6a7757068182c32c176a81bc7167ebd87 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 28 Jan 2025 06:54:34 -0800 Subject: [PATCH 26/35] Fix rolling(min_periods=) with int and null data with mode.pandas_compat (#17822) closes https://github.com/rapidsai/cudf/issues/17786 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17822 --- python/cudf/cudf/core/window/rolling.py | 7 +++++-- python/cudf/cudf/tests/test_rolling.py | 15 ++++++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index dc43f297416..187d1b58dca 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -23,6 +23,7 @@ if TYPE_CHECKING: from cudf.core.column.column import ColumnBase + from cudf.core.indexed_frame import IndexedFrame class _RollingBase: @@ -205,7 +206,7 @@ class Rolling(GetAttrGetItemMixin, _RollingBase, Reducible): def __init__( self, - obj, + obj: IndexedFrame, window, min_periods=None, center: bool = False, @@ -216,7 +217,9 @@ def __init__( step: int | None = None, method: str = "single", ): - self.obj = obj + if cudf.get_option("mode.pandas_compatible"): + obj = obj.nans_to_nulls() + self.obj = obj # type: ignore[assignment] self.window = window self.min_periods = min_periods self.center = center diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 135870f7359..31f799fe6d7 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
import math @@ -517,3 +517,16 @@ def test_rolling_series(): actual = df.groupby("b")["a"].rolling(5).mean() assert_eq(expected, actual) + + +@pytest.mark.parametrize("klass", ["DataFrame", "Series"]) +def test_pandas_compat_int_nan_min_periods(klass): + data = [None, 1, 2, None, 4, 6, 11] + with cudf.option_context("mode.pandas_compatible", True): + result = getattr(cudf, klass)(data).rolling(2, min_periods=1).sum() + expected = getattr(pd, klass)(data).rolling(2, min_periods=1).sum() + assert_eq(result, expected) + + result = getattr(cudf, klass)(data).rolling(2, min_periods=1).sum() + expected = getattr(cudf, klass)([None, 1, 3, 2, 4, 10, 17]) + assert_eq(result, expected) From be1f76c853b8eea6700dc150c16cb9c72df51de1 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Tue, 28 Jan 2025 11:25:36 -0500 Subject: [PATCH 27/35] Improve parquet reader very-long string performance (#17773) The previous [strings PR](https://github.com/rapidsai/cudf/pull/17286) significantly reduced the parquet reader string performance for very-long strings, for lengths ~1024 and longer. This PR fixes the performance issue by instituting a max memcpy length of 8 bytes at once (this length yielded best perf). Also, up to all of the threads in the block can work on the same string, rather than limiting it to just all of the threads in a warp. **PERFORMANCE:** Short strings: Unchanged Length 1024: 25% faster Longer lengths (up to 64k): Up to 90% faster, same as before strings PR Authors: - Paul Mattione (https://github.com/pmattione-nvidia) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17773 --- .../io/parquet/parquet_reader_input.cpp | 11 ++- cpp/src/io/parquet/page_string_utils.cuh | 75 +++++++++++-------- 2 files changed, 50 insertions(+), 36 deletions(-) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 83e6c35216a..af701182b0f 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -97,7 +97,6 @@ void BM_parquet_read_data(nvbench::state& state, void BM_parquet_read_long_strings(nvbench::state& state) { auto const cardinality = static_cast(state.get_int64("cardinality")); - auto const run_length = static_cast(state.get_int64("run_length")); auto const d_type = get_type_or_group(static_cast(data_type::STRING)); auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); @@ -106,14 +105,15 @@ void BM_parquet_read_long_strings(nvbench::state& state) auto const avg_string_length = static_cast(state.get_int64("avg_string_length")); // corresponds to 3 sigma (full width 6 sigma: 99.7% of range) - auto const half_width = static_cast(state.get_int64("half_width_string_length")); + auto const half_width = + avg_string_length >> 3; // 32 +/- 4, 128 +/- 16, 1024 +/- 128, 8k +/- 1k, etc. 
auto const length_min = avg_string_length - half_width; auto const length_max = avg_string_length + half_width; data_profile profile = data_profile_builder() .cardinality(cardinality) - .avg_run_length(run_length) + .avg_run_length(1) .distribution(data_type::STRING, distribution_id::NORMAL, length_min, length_max); auto const num_rows_written = [&]() { @@ -414,6 +414,5 @@ NVBENCH_BENCH(BM_parquet_read_long_strings) .add_string_axis("io_type", {"DEVICE_BUFFER"}) .set_min_samples(4) .add_int64_axis("cardinality", {0, 1000}) - .add_int64_axis("run_length", {1, 32}) - .add_int64_axis("avg_string_length", {16, 48, 96}) - .add_int64_axis("half_width_string_length", {16, 32, 64}); // length = avg +/- half_width + .add_int64_power_of_two_axis("avg_string_length", + nvbench::range(4, 16, 2)); // 16, 64, ... -> 64k diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index ba627e73625..9a3763a826c 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -150,39 +150,37 @@ inline __device__ void compute_initial_large_strings_offset(page_state_s const* } } +template +inline constexpr int log2_int() +{ + static_assert((value >= 1) && ((value & (value - 1)) == 0), "Only works for powers of 2!"); + if constexpr (value == 1) + return 0; + else + return 1 + log2_int(); +} + template -__device__ inline int calc_threads_per_string_log2(int avg) +__device__ inline int calc_threads_per_string_log2(int avg_string_length) // returns log2(M) { // From testing, performance is best when copying an average of B = 4 bytes at once. // So #-threads-per-string M = avg_string_length / 4 // Help the compiler make the code fast by keeping everything a power of 2 // For avg length < 4/8/16/..., length power-of-2 = 2/3/4/.../. Divide by 4: 0/1/... - // This is the target (log2) for M, but we need to clamp its range - // Clamp M (#-threads-per-string): - // For T threads: clamp #-strings-at-once N btw T/32 (1/warp) & 32 (cache miss if larger) - // So, clamp #-threads-per-string M = T / N between 32 (all in warp) & T/32 (cache miss) - // Writing an equation M(T) is slower than just handling each T case separately - auto caster = [](int value) { return static_cast(value != 0); }; // branchless - - if constexpr (block_size > 512) { - return 5; // max of 32 strings at a time, no matter what - } else if constexpr (block_size > 256) { - return (avg < 64) ? 4 : 5; - } else if constexpr (block_size > 128) { - //(avg < 32) ? 3 : ((avg < 64) ? 4 : 5); - return 3 + caster(avg >> 5) + caster(avg >> 6); - } else if constexpr (block_size > 64) { - //(avg < 16) ? 2 : ((avg < 32) ? 3 : ((avg < 64) ? 4 : 5)); - return 2 + caster(avg >> 4) + caster(avg >> 5) + caster(avg >> 6); - } else if constexpr (block_size > 32) { - //(avg < 8) ? 1 : ((avg < 16) ? 2 : ((avg < 32) ? 3 : ((avg < 64) ? 4 : 5))); - return 1 + caster(avg >> 3) + caster(avg >> 4) + caster(avg >> 5) + caster(avg >> 6); - } else { // One warp - //(avg<4) ? 0 : ((avg<8) ? 1 : ((avg<16) ? 2 : ((avg<32) ? 3 : ((avg<64) ? 4 : 5)))); - return caster(avg >> 2) + caster(avg >> 3) + caster(avg >> 4) + caster(avg >> 5) + - caster(avg >> 6); - } + // avg - 1: Don't want extra thread at powers of 2 (e.g. 
32 (0b100000 -> 0b11111 -> 5) + int const avg_log2 = 32 - __clz(avg_string_length - 1); + int const threads_log2 = avg_log2 - 2; // Target 4 bytes / thread at once (log2(4) = 2) + + // This is the target (log2) for M, but we need to clamp its range + // First clamp #-strings-at-once (N) btw 1 (all threads (T)) & 32 (cache miss if larger) + // So, clamp #-threads-per-string M = T / N between: T (all) & T/32 (cache miss) + // So, clamp log2(#-threads-per-string) between log2(T) & log2(T) - 5 (min 1) + static constexpr int block_size_log2 = log2_int(); // 7 for block_size = 128 + static constexpr int min_threads_log2 = cuda::std::max(block_size_log2 - 5, 1); + + // Clamp log2(M) (between 2 and 7 for block_size = 128) + return cuda::std::max(min_threads_log2, cuda::std::min(block_size_log2, threads_log2)); } /** @@ -322,10 +320,27 @@ __device__ size_t gpuDecodeString( auto output_string = outputs[str_idx]; int const length = lengths[str_idx]; - // One-shot N chars per thread - int const chars_at_once = (length + threads_per_string - 1) >> threads_per_string_log2; - int const start_index = string_lane * chars_at_once; - int const substring_length = min(chars_at_once, length - start_index); + // Max 8 chars at once per thread, else perf degrades dramatically + // Loop, copying 8 chars at a time, until <= 8 chars per thread left + static constexpr int max_chars_at_once = 8; + int chars_remaining_per_thread = + (length + threads_per_string - 1) >> threads_per_string_log2; + int group_offset = 0; + if (chars_remaining_per_thread > max_chars_at_once) { + int const max_chars_copied_string = max_chars_at_once * threads_per_string; + int start_index = string_lane * max_chars_at_once; + do { + memcpy(&(output_string[start_index]), &(input_string[start_index]), max_chars_at_once); + + chars_remaining_per_thread -= max_chars_at_once; + start_index += max_chars_copied_string; + group_offset += max_chars_copied_string; + } while (chars_remaining_per_thread > max_chars_at_once); + } + + // Final copy of remaining chars + int const start_index = group_offset + string_lane * chars_remaining_per_thread; + int const substring_length = min(chars_remaining_per_thread, length - start_index); if (substring_length > 0) { memcpy(&(output_string[start_index]), &(input_string[start_index]), substring_length); } From 53174fdcb2280539ab7c91b6ceccebf5e4aee114 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 28 Jan 2025 10:05:39 -0800 Subject: [PATCH 28/35] Enforce schema for partial tables in multi-source multi-batch JSON reader (#17708) Closes #17689 This PR resolves a bug in the multi-batch JSON reader, wherein the reader was throwing an error when the column schema for any two partial tables from different batches did not match. We now enforce the column ordering in the first partial table, i.e. the table returned by the first batch, in all succeeding batches. The test added passes three strings as three separate batches to the reader by setting the batch size to that of the first string. 
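As a rough Python-level illustration of the new behavior (data and batch size are hypothetical; `LIBCUDF_JSON_BATCH_SIZE` is the same override the C++ test below uses to force tiny batches):

```python
# Illustrative sketch, assuming the env var is read before the call.
import io
import os

os.environ["LIBCUDF_JSON_BATCH_SIZE"] = "50"  # force multiple small batches

import cudf

# Later records reorder or omit columns relative to the first batch...
data = '{"a": 1, "b": 2}\n{"b": 3, "a": 4}\n{"a": 5}\n'

# ...but instead of raising, the reader now imposes the column order of
# the first batch ("a", "b") on every succeeding batch.
df = cudf.read_json(io.StringIO(data), lines=True)
print(df.columns.tolist())  # expected: ['a', 'b']
```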
Authors: - Shruti Shivakumar (https://github.com/shrshi) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) - Paul Mattione (https://github.com/pmattione-nvidia) URL: https://github.com/rapidsai/cudf/pull/17708 --- cpp/src/io/json/read_json.cu | 70 ++++++++++++++++++++++++- cpp/tests/large_strings/json_tests.cu | 74 +++++++++++++++++++++++++-- 2 files changed, 137 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 9894ad75fc8..419e7bb120f 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -36,6 +36,7 @@ #include #include +#include #include #include @@ -372,7 +373,9 @@ table_with_metadata read_json_impl(host_span> source std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); std::size_t const batch_size_upper_bound = get_batch_size_upper_bound(); std::size_t const batch_size = - batch_size_upper_bound - (max_subchunks_prealloced * size_per_subchunk); + batch_size_upper_bound < (max_subchunks_prealloced * size_per_subchunk) + ? batch_size_upper_bound + : batch_size_upper_bound - (max_subchunks_prealloced * size_per_subchunk); /* * Identify the position (zero-indexed) of starting source file from which to begin @@ -421,10 +424,73 @@ table_with_metadata read_json_impl(host_span> source std::vector partial_tables; json_reader_options batched_reader_opts{reader_opts}; + + // recursive lambda to construct schema_element. Here, we assume that the table from the + // first batch contains all the columns in the concatenated table, and that the partial tables + // from all following batches contain the same set of columns + std::function cols, + cudf::host_span names, + schema_element & schema)> + construct_schema; + schema_element schema{data_type{cudf::type_id::STRUCT}}; + construct_schema = [&construct_schema](cudf::host_span children, + cudf::host_span children_props, + schema_element& schema) -> schema_element { + CUDF_EXPECTS( + children.size() == children_props.size(), + "Mismatch in the number of children columns and children column properties received"); + + if (schema.type == data_type{cudf::type_id::LIST}) { + schema.column_order = {"element"}; + CUDF_EXPECTS(children.size() == 2, "List should have two children"); + auto element_idx = children_props[0].name == "element" ? 
0 : 1; + schema_element child_schema{children[element_idx].type()}; + std::vector grandchildren_cols; + std::transform(children[element_idx].child_begin(), + children[element_idx].child_end(), + std::back_inserter(grandchildren_cols), + [](auto& gc) { return gc; }); + schema.child_types["element"] = + construct_schema(grandchildren_cols, children_props[element_idx].children, child_schema); + } else { + std::vector col_order; + std::transform(children_props.begin(), + children_props.end(), + std::back_inserter(col_order), + [](auto& c_prop) { return c_prop.name; }); + schema.column_order = std::move(col_order); + for (auto i = 0ul; i < children.size(); i++) { + schema_element child_schema{children[i].type()}; + std::vector grandchildren_cols; + std::transform(children[i].child_begin(), + children[i].child_end(), + std::back_inserter(grandchildren_cols), + [](auto& gc) { return gc; }); + schema.child_types[children_props[i].name] = + construct_schema(grandchildren_cols, children_props[i].children, child_schema); + } + } + + return schema; + }; + batched_reader_opts.set_byte_range_offset(batch_offsets[0]); + batched_reader_opts.set_byte_range_size(batch_offsets[1] - batch_offsets[0]); + partial_tables.emplace_back( + read_batch(sources, batched_reader_opts, stream, cudf::get_current_device_resource_ref())); + + auto& tbl = partial_tables.back().tbl; + std::vector children; + for (size_type j = 0; j < tbl->num_columns(); j++) { + children.emplace_back(tbl->get_column(j)); + } + batched_reader_opts.set_dtypes( + construct_schema(children, partial_tables.back().metadata.schema_info, schema)); + batched_reader_opts.enable_prune_columns(true); + // Dispatch individual batches to read_batch and push the resulting table into // partial_tables array. Note that the reader options need to be updated for each // batch to adjust byte range offset and byte range size. - for (std::size_t i = 0; i < batch_offsets.size() - 1; i++) { + for (std::size_t i = 1; i < batch_offsets.size() - 1; i++) { batched_reader_opts.set_byte_range_offset(batch_offsets[i]); batched_reader_opts.set_byte_range_size(batch_offsets[i + 1] - batch_offsets[i]); partial_tables.emplace_back( diff --git a/cpp/tests/large_strings/json_tests.cu b/cpp/tests/large_strings/json_tests.cu index 0703fa72f67..205fb12c4dd 100644 --- a/cpp/tests/large_strings/json_tests.cu +++ b/cpp/tests/large_strings/json_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,7 +27,15 @@ #include struct JsonLargeReaderTest : public cudf::test::StringsLargeTest, - public testing::WithParamInterface {}; + public testing::WithParamInterface { + public: + void set_batch_size(size_t batch_size_upper_bound) + { + setenv("LIBCUDF_JSON_BATCH_SIZE", std::to_string(batch_size_upper_bound).c_str(), 1); + } + + ~JsonLargeReaderTest() { unsetenv("LIBCUDF_JSON_BATCH_SIZE"); } +}; // Parametrize qualifying JSON tests for multiple compression types INSTANTIATE_TEST_SUITE_P(JsonLargeReaderTest, @@ -47,7 +55,7 @@ TEST_P(JsonLargeReaderTest, MultiBatch) std::size_t const batch_size_upper_bound = std::numeric_limits::max() / 16; // set smaller batch_size to reduce file size and execution time - setenv("LIBCUDF_JSON_BATCH_SIZE", std::to_string(batch_size_upper_bound).c_str(), 1); + this->set_batch_size(batch_size_upper_bound); constexpr std::size_t expected_file_size = 1.5 * static_cast(batch_size_upper_bound); std::size_t const log_repetitions = @@ -127,7 +135,63 @@ TEST_P(JsonLargeReaderTest, MultiBatch) // cannot use EQUAL due to concatenate removing null mask CUDF_TEST_EXPECT_TABLES_EQUIVALENT(current_reader_table.tbl->view(), result->view()); } +} + +TEST_P(JsonLargeReaderTest, MultiBatchWithNulls) +{ + cudf::io::compression_type const comptype = GetParam(); + + // The goal of this test is to ensure that column schema from the first + // batch is enforced on all following batches in the JSON reader. The column + // ordering from the first batch is applied to batches 2 and 3. + std::string json_string_b1 = R"( + { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } + { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } + { "a": { "y" : 6}, "b" : [6 ], "c": 13 } + { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; + std::string json_string_b2 = R"( + { "a": { "y" : 6}, "c": 11 } + { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } + { "a": { "y" : 6}, "b" : [6 ], "c": 13 } + { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; + std::string json_string_b3 = R"( + { "b" : [1, 2, 3], "a": { "y" : 6}} + { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } + { "a": { "y" : 6}, "b" : [6 ], "c": 13 } + { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; + + // Set the batch size to the size of the first json string, `json_string_b1`. 
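+ // With this bound each of the three strings lands in its own batch, so
+ // batches 2 and 3 must adopt the column order established by batch 1.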
+ std::size_t const batch_size_upper_bound = json_string_b1.size(); + // set smaller batch_size to reduce file size and execution time + this->set_batch_size(batch_size_upper_bound); + + auto json_string = json_string_b1 + json_string_b2 + json_string_b3; + std::vector cdata; + if (comptype != cudf::io::compression_type::NONE) { + cdata = cudf::io::detail::compress( + comptype, + cudf::host_span(reinterpret_cast(json_string.data()), + json_string.size()), + cudf::get_default_stream()); + } else + cdata = std::vector( + reinterpret_cast(json_string.data()), + reinterpret_cast(json_string.data()) + json_string.size()); - // go back to normal batch_size - unsetenv("LIBCUDF_JSON_BATCH_SIZE"); + constexpr int num_sources = 2; + std::vector> chostbufs( + num_sources, + cudf::host_span(reinterpret_cast(cdata.data()), cdata.size())); + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options cjson_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{ + cudf::host_span>(chostbufs.data(), chostbufs.size())}) + .lines(true) + .compression(comptype) + .recovery_mode(cudf::io::json_recovery_mode_t::FAIL); + + // Read full test data via existing, nested JSON lines reader + CUDF_EXPECT_NO_THROW(cudf::io::read_json(cjson_lines_options)); } From 39bcd166023ebf3f815a3b9f356c1f4b2eb06a00 Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Tue, 28 Jan 2025 13:19:26 -0800 Subject: [PATCH 29/35] Fix typo in exception raised when attempting to convert a string column to cupy (#17800) Closes #17799 Fix typo in exception raised when attempting to convert a string column to cupy Authors: - David Gardner (https://github.com/dagardner-nv) - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17800 --- python/cudf/cudf/core/column/string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 1c82fa28d4e..074da57c470 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5929,7 +5929,7 @@ def values(self) -> cupy.ndarray: """ Return a CuPy representation of the StringColumn. """ - raise TypeError("String Arrays is not yet implemented in cudf") + raise TypeError("String arrays are not supported by cupy") def to_pandas( self, From 95c69c332b66bd7a77d3d75fb84c662140fffee2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 28 Jan 2025 19:18:35 -0800 Subject: [PATCH 30/35] Fix possible OOB mem access in Parquet decoder (#17841) Fixes #17838. Related to #17702 This PR fixes a possible OOB in parquet string decoder when writing initial offset for nested large string cols. Existing tests should have been throwing segfaults in decoder kernels but somehow weren't. The decoder was producing correct results even without this change as the initial offsets are written from the first decoded ColumnChunk of each input column. 
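For reference, a toy sketch of the index arithmetic the fix relies on (sizes hypothetical): `chunk_idx` runs over all column chunks, row group by row group with one chunk per input column, while `initial_str_offsets` holds just one slot per input column, so indexing it with the raw `chunk_idx` can run off the end.

```python
# Toy model with 2 row groups x 3 input columns (hypothetical sizes).
chunks_per_rowgroup = 3  # == len(initial_str_offsets)
for chunk_idx in range(6):  # chunk_idx 3..5 would overrun the array
    input_col_idx = chunk_idx % chunks_per_rowgroup
    print(chunk_idx, "->", input_col_idx)
# 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 0, 4 -> 1, 5 -> 2
```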
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17841 --- cpp/src/io/parquet/decode_fixed.cu | 6 ++++-- cpp/src/io/parquet/page_delta_decode.cu | 12 ++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 84f751dea6b..dd0ae2c087f 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -1167,8 +1167,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) // For large strings, update the initial string buffer offset to be used during large string // column construction. Otherwise, convert string sizes to final offsets. if (s->col.is_large_string_col) { - compute_initial_large_strings_offset( - s, initial_str_offsets[pages[page_idx].chunk_idx], has_lists_t); + // page.chunk_idx are ordered by input_col_idx and row_group_idx respectively. + auto const chunks_per_rowgroup = initial_str_offsets.size(); + auto const input_col_idx = pages[page_idx].chunk_idx % chunks_per_rowgroup; + compute_initial_large_strings_offset(s, initial_str_offsets[input_col_idx], has_lists_t); } else { convert_small_string_lengths_to_offsets(s, has_lists_t); } diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 4c98a08006c..a95958c2704 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -583,8 +583,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) // For large strings, update the initial string buffer offset to be used during large string // column construction. Otherwise, convert string sizes to final offsets. if (s->col.is_large_string_col) { - compute_initial_large_strings_offset( - s, initial_str_offsets[pages[page_idx].chunk_idx], has_repetition); + // page.chunk_idx are ordered by input_col_idx and row_group_idx respectively. + auto const chunks_per_rowgroup = initial_str_offsets.size(); + auto const input_col_idx = pages[page_idx].chunk_idx % chunks_per_rowgroup; + compute_initial_large_strings_offset(s, initial_str_offsets[input_col_idx], has_repetition); } else { convert_small_string_lengths_to_offsets(s, has_repetition); } @@ -742,8 +744,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) // For large strings, update the initial string buffer offset to be used during large string // column construction. Otherwise, convert string sizes to final offsets. if (s->col.is_large_string_col) { - compute_initial_large_strings_offset( - s, initial_str_offsets[pages[page_idx].chunk_idx], has_repetition); + // page.chunk_idx are ordered by input_col_idx and row_group_idx respectively. 
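+ // Taking chunk_idx modulo the chunks-per-row-group count therefore
+ // recovers the input column index and keeps the access in bounds.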
+ auto const chunks_per_rowgroup = initial_str_offsets.size(); + auto const input_col_idx = pages[page_idx].chunk_idx % chunks_per_rowgroup; + compute_initial_large_strings_offset(s, initial_str_offsets[input_col_idx], has_repetition); } else { convert_small_string_lengths_to_offsets(s, has_repetition); } From 367405ff4eb3a7e42ecd19cf68947936a64394e7 Mon Sep 17 00:00:00 2001 From: Taurean Dyer <46935140+taureandyernv@users.noreply.github.com> Date: Tue, 28 Jan 2025 19:33:17 -0800 Subject: [PATCH 31/35] Update cudf.pandas colab link in docs (#17846) I changed the link to the cudf-pandas notebook in the docs from the secured link to the new public one Closes #17845 Authors: - Taurean Dyer (https://github.com/taureandyernv) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17846 --- docs/cudf/source/cudf_pandas/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/cudf_pandas/index.rst b/docs/cudf/source/cudf_pandas/index.rst index f98c04cc383..e9f36e0f4ea 100644 --- a/docs/cudf/source/cudf_pandas/index.rst +++ b/docs/cudf/source/cudf_pandas/index.rst @@ -20,7 +20,7 @@ automatically **falling back to pandas** for other operations. .. figure:: ../_static/colab.png :width: 200px - :target: https://nvda.ws/rapids-cudf + :target: https://nvda.ws/3BnjYjN Try it on Google Colab! From ed2f3c3033c5616a1d3b888f78ef10187add92d3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 28 Jan 2025 22:01:18 -0600 Subject: [PATCH 32/35] Add public APIs to Access Underlying `cudf` and `pandas` Objects from `cudf.pandas` Proxy Objects (#17629) Fixes: #17524 Fixes: https://github.com/rapidsai/cuml/issues/6232 This PR introduces methods to access the real underlying `cudf` and `pandas` objects from `cudf.pandas` proxy objects. These methods ensure compatibility with libraries that are `cudf` or `pandas` aware. This PR also gives a performance boost to `cudf-pandas` workflows; timings from the script posted in https://github.com/rapidsai/cuml/issues/6232: `branch-25.02`: ``` cuML Label Encoder with cuDF-Pandas took 2.00794 seconds ``` `This PR`: ``` cuML Label Encoder with cuDF-Pandas took 0.09284 seconds ``` Changes: - [x] Added `as_gpu_object()` and `as_cpu_object()` methods. - [x] Updated faq.md with a section explaining how to use these methods. 
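A minimal usage sketch of the new accessors (assumes the accelerator is active):

```python
# Run with cudf.pandas active, e.g. `python -m cudf.pandas script.py`.
import pandas as pd  # proxied by cudf.pandas

s = pd.Series([1, 2, 3])

gpu = s.as_gpu_object()  # the real cudf.Series; data stays on the GPU
cpu = s.as_cpu_object()  # the real pandas.Series on the host

# GPU-aware libraries can branch on the true object type and take the
# cudf fast path, skipping the proxy's per-call dispatch overhead.
```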
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17629 --- docs/cudf/source/cudf_pandas/faq.md | 47 +++++++++++++++++++ python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/pandas/__init__.py | 3 +- python/cudf/cudf/pandas/_wrappers/pandas.py | 15 +++++- python/cudf/cudf/pandas/fast_slow_proxy.py | 10 +++- .../cudf_pandas_tests/test_cudf_pandas.py | 34 ++++++++++++++ 6 files changed, 107 insertions(+), 4 deletions(-) diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md index 222b698a78d..4e3cc319605 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ b/docs/cudf/source/cudf_pandas/faq.md @@ -142,6 +142,53 @@ cuDF (learn more in [this blog](https://medium.com/rapids-ai/easy-cpu-gpu-arrays-and-dataframes-run-your-dask-code-where-youd-like-e349d92351d)) and the [RAPIDS Accelerator for Apache Spark](https://nvidia.github.io/spark-rapids/) provides a similar configuration-based plugin for Spark. +## How do I know if an object is a `cudf.pandas` proxy object? + +To determine if an object is a `cudf.pandas` proxy object, you can use the `isinstance_cudf_pandas` API. This function checks if the given object is a proxy object that wraps either a `cudf` or `pandas` object. Here is an example of how to use this API: + +```python +from cudf.pandas import isinstance_cudf_pandas + +obj = ... # Your object here +if isinstance_cudf_pandas(obj, pd.Series): + print("The object is a cudf.pandas proxy Series object.") +else: + print("The object is not a cudf.pandas proxy Series object.") +``` + +To detect `Series`, `DataFrame`, `Index`, and `ndarray` objects separately, you can pass the type names as the second parameter: + +* `isinstance_cudf_pandas(obj, pd.Series)`: Detects if the object is a `cudf.pandas` proxy `Series`. +* `isinstance_cudf_pandas(obj, pd.DataFrame)`: Detects if the object is a `cudf.pandas` proxy `DataFrame`. +* `isinstance_cudf_pandas(obj, pd.Index)`: Detects if the object is a `cudf.pandas` proxy `Index`. +* `isinstance_cudf_pandas(obj, np.ndarray)`: Detects if the object is a `cudf.pandas` proxy `ndarray`. + +## How can I access the underlying GPU or CPU objects? + +When working with `cudf.pandas` proxy objects, it is sometimes necessary to get true `cudf` or `pandas` objects that reside on GPU or CPU. +For example, this can be used to ensure that GPU-aware libraries that support both `cudf` and `pandas` can use the `cudf`-optimized code paths that keep data on GPU when processing `cudf.pandas` objects. +Otherwise, the library might use less-optimized CPU code because it thinks that the `cudf.pandas` object is a plain `pandas` dataframe. + +The following methods can be used to retrieve the actual `cudf` or `pandas` objects: + +- `as_gpu_object()`: This method returns the `cudf` object from the proxy. +- `as_cpu_object()`: This method returns the `pandas` object from the proxy. + +If `as_gpu_object()` is called on a proxy array, it will return a `cupy` array and `as_cpu_object` will return a `numpy` array. 
+ +Here is an example of how to use these methods: + +```python +# Assuming `proxy_obj` is a cudf.pandas proxy object +cudf_obj = proxy_obj.as_gpu_object() +pandas_obj = proxy_obj.as_cpu_object() + +# Now you can use `cudf_obj` and `pandas_obj` with libraries that are cudf or pandas aware +``` + +Be aware that if `cudf.pandas` objects are converted to their underlying `cudf` or `pandas` types, the `cudf.pandas` proxy no longer controls them. +This means that automatic conversion between GPU and CPU types and automatic fallback from GPU to CPU functionality will not occur. + (are-there-any-known-limitations)= ## Are there any known limitations? diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index be0758041f4..57d1ad56f82 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1251,7 +1251,7 @@ def as_categorical_column(self, dtype) -> ColumnBase: ) # Categories must be unique and sorted in ascending order. - cats = self.unique().sort_values().astype(self.dtype) + cats = self.unique().sort_values() label_dtype = min_unsigned_type(len(cats)) labels = self._label_encoding( cats=cats, dtype=label_dtype, na_sentinel=cudf.Scalar(1) diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index fec181e85d7..70ab7d48879 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -8,6 +8,7 @@ import pylibcudf import rmm.mr +from ._wrappers.pandas import isinstance_cudf_pandas from .fast_slow_proxy import is_proxy_object from .magics import load_ipython_extension from .profiler import Profiler diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index e763875adb8..41d1789116c 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 import abc @@ -35,7 +35,9 @@ _fast_slow_function_call, _FastSlowAttribute, _FunctionProxy, + _maybe_wrap_result, _Unusable, + is_proxy_object, make_final_proxy_type as _make_final_proxy_type, make_intermediate_proxy_type as _make_intermediate_proxy_type, register_proxy_func, @@ -266,6 +268,12 @@ def custom_repr_html(obj): html_formatter.for_type(DataFrame, custom_repr_html) +def _Series_dtype(self): + # Fast-path to extract dtype from the current + # object without round-tripping through the slow<->fast + return _maybe_wrap_result(self._fsproxy_wrapped.dtype, None) + + Series = make_final_proxy_type( "Series", cudf.Series, @@ -285,6 +293,7 @@ def custom_repr_html(obj): "_constructor": _FastSlowAttribute("_constructor"), "_constructor_expanddim": _FastSlowAttribute("_constructor_expanddim"), "_accessors": set(), + "dtype": _Series_dtype, }, ) @@ -1704,6 +1713,10 @@ def holiday_calendar_factory_wrapper(*args, **kwargs): ) +def isinstance_cudf_pandas(obj, type): + return is_proxy_object(obj) and obj.__class__.__name__ == type.__name__ + + # timestamps and timedeltas are not proxied, but non-proxied # pandas types are currently not picklable. 
Thus, we define # custom reducer/unpicker functions for these types: diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index d32d388b975..c189280be09 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -204,6 +204,12 @@ def _fsproxy_fast_to_slow(self): return fast_to_slow(self._fsproxy_wrapped) return self._fsproxy_wrapped + def as_gpu_object(self): + return self._fsproxy_slow_to_fast() + + def as_cpu_object(self): + return self._fsproxy_fast_to_slow() + @property # type: ignore def _fsproxy_state(self) -> _State: return ( @@ -221,6 +227,8 @@ def _fsproxy_state(self) -> _State: "_fsproxy_slow_type": slow_type, "_fsproxy_slow_to_fast": _fsproxy_slow_to_fast, "_fsproxy_fast_to_slow": _fsproxy_fast_to_slow, + "as_gpu_object": as_gpu_object, + "as_cpu_object": as_cpu_object, "_fsproxy_state": _fsproxy_state, } diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 1528ed2973b..17225ba893f 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -65,6 +65,10 @@ get_calendar, ) +from cudf.pandas import ( + isinstance_cudf_pandas, +) + # Accelerated pandas has the real pandas and cudf modules as attributes pd = xpd._fsproxy_slow cudf = xpd._fsproxy_fast @@ -1885,3 +1889,33 @@ def test_dataframe_setitem(): new_df = df + 1 df[df.columns] = new_df tm.assert_equal(df, new_df) + + +def test_dataframe_get_fast_slow_methods(): + df = xpd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + assert isinstance(df.as_gpu_object(), cudf.DataFrame) + assert isinstance(df.as_cpu_object(), pd.DataFrame) + + +def test_is_cudf_pandas(): + s = xpd.Series([1, 2, 3]) + df = xpd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + index = xpd.Index([1, 2, 3]) + + assert isinstance_cudf_pandas(s, pd.Series) + assert isinstance_cudf_pandas(df, pd.DataFrame) + assert isinstance_cudf_pandas(index, pd.Index) + assert isinstance_cudf_pandas(index.values, np.ndarray) + + for obj in [s, df, index, index.values]: + assert not isinstance_cudf_pandas(obj._fsproxy_slow, pd.Series) + assert not isinstance_cudf_pandas(obj._fsproxy_fast, pd.Series) + + assert not isinstance_cudf_pandas(obj._fsproxy_slow, pd.DataFrame) + assert not isinstance_cudf_pandas(obj._fsproxy_fast, pd.DataFrame) + + assert not isinstance_cudf_pandas(obj._fsproxy_slow, pd.Index) + assert not isinstance_cudf_pandas(obj._fsproxy_fast, pd.Index) + + assert not isinstance_cudf_pandas(obj._fsproxy_slow, np.ndarray) + assert not isinstance_cudf_pandas(obj._fsproxy_fast, np.ndarray) From 98e1696dff737b15b83f71242a23412c89bad0e4 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 29 Jan 2025 09:26:43 -0500 Subject: [PATCH 33/35] Add `verify-codeowners` hook (#17840) Issue: https://github.com/rapidsai/pre-commit-hooks/issues/61 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17840 --- .github/CODEOWNERS | 12 ++++++------ .pre-commit-config.yaml | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/.github/CODEOWNERS 
b/.github/CODEOWNERS index e0b315f34fc..42c0dce4811 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -18,11 +18,11 @@ java/ @rapidsai/cudf-java-codeowners #CI code owners /.github/ @rapidsai/ci-codeowners /ci/ @rapidsai/ci-codeowners -/.pre-commit-config.yaml @rapidsai/ci-codeowners #packaging code owners -/.devcontainer/ @rapidsai/packaging-codeowners -/conda/ @rapidsai/packaging-codeowners -/dependencies.yaml @rapidsai/packaging-codeowners -/build.sh @rapidsai/packaging-codeowners -pyproject.toml @rapidsai/packaging-codeowners +/.pre-commit-config.yaml @rapidsai/packaging-codeowners +/.devcontainer/ @rapidsai/packaging-codeowners +/conda/ @rapidsai/packaging-codeowners +dependencies.yaml @rapidsai/packaging-codeowners +/build.sh @rapidsai/packaging-codeowners +pyproject.toml @rapidsai/packaging-codeowners diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 718353d48e9..0a124cac2b3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -152,7 +152,7 @@ repos: args: ["--fix"] - id: ruff-format - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v0.4.0 + rev: v0.5.0 hooks: - id: verify-copyright exclude: | @@ -163,6 +163,8 @@ repos: cpp/cmake/Modules/FindCUDAToolkit[.]cmake$ ) - id: verify-alpha-spec + - id: verify-codeowners + args: [--fix, --project-prefix=cudf] - repo: https://github.com/rapidsai/dependency-file-generator rev: v1.17.0 hooks: From a6f90f0737d6306a364671dee59c05a2cf3d33b4 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 29 Jan 2025 08:35:23 -0600 Subject: [PATCH 34/35] Add multi-partition `Shuffle` operation to cuDF Polars (#17744) This PR pulls out the `Shuffle` logic from https://github.com/rapidsai/cudf/pull/17518 to simplify the review process. The goal is to establish the shuffle groundwork for multi-partition `Join` and `Sort` operations. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17744 --- .../cudf_polars/experimental/base.py | 26 ++- .../cudf_polars/experimental/parallel.py | 5 +- .../cudf_polars/experimental/shuffle.py | 204 ++++++++++++++++++ .../tests/experimental/test_shuffle.py | 66 ++++++ 4 files changed, 289 insertions(+), 12 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/experimental/shuffle.py create mode 100644 python/cudf_polars/tests/experimental/test_shuffle.py diff --git a/python/cudf_polars/cudf_polars/experimental/base.py b/python/cudf_polars/cudf_polars/experimental/base.py index 8f660632df2..36c7745c3f4 100644 --- a/python/cudf_polars/cudf_polars/experimental/base.py +++ b/python/cudf_polars/cudf_polars/experimental/base.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Multi-partition base classes.""" @@ -12,20 +12,26 @@ from collections.abc import Iterator, Sequence from cudf_polars.containers import DataFrame + from cudf_polars.dsl.expr import NamedExpr from cudf_polars.dsl.nodebase import Node class PartitionInfo: - """ - Partitioning information. - - This class only tracks the partition count (for now). - """ - - __slots__ = ("count",) - - def __init__(self, count: int): + """Partitioning information.""" + + __slots__ = ("count", "partitioned_on") + count: int + """Partition count.""" + partitioned_on: tuple[NamedExpr, ...] 
+ """Columns the data is hash-partitioned on.""" + + def __init__( + self, + count: int, + partitioned_on: tuple[NamedExpr, ...] = (), + ): self.count = count + self.partitioned_on = partitioned_on def keys(self, node: Node) -> Iterator[tuple[str, int]]: """Return the partitioned keys for a given node.""" diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py b/python/cudf_polars/cudf_polars/experimental/parallel.py index 6843ed9ee2e..5a5eaab8b2f 100644 --- a/python/cudf_polars/cudf_polars/experimental/parallel.py +++ b/python/cudf_polars/cudf_polars/experimental/parallel.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Multi-partition Dask execution.""" @@ -10,7 +10,8 @@ from typing import TYPE_CHECKING, Any import cudf_polars.experimental.io -import cudf_polars.experimental.select # noqa: F401 +import cudf_polars.experimental.select +import cudf_polars.experimental.shuffle # noqa: F401 from cudf_polars.dsl.ir import IR, Cache, Filter, HStack, Projection, Select, Union from cudf_polars.dsl.traversal import CachingVisitor, traversal from cudf_polars.experimental.base import PartitionInfo, _concat, get_key_name diff --git a/python/cudf_polars/cudf_polars/experimental/shuffle.py b/python/cudf_polars/cudf_polars/experimental/shuffle.py new file mode 100644 index 00000000000..d49f13375ed --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/shuffle.py @@ -0,0 +1,204 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +"""Shuffle Logic.""" + +from __future__ import annotations + +import json +import operator +from typing import TYPE_CHECKING, Any + +import pyarrow as pa + +import pylibcudf as plc + +from cudf_polars.containers import DataFrame +from cudf_polars.dsl.ir import IR +from cudf_polars.experimental.base import _concat, get_key_name +from cudf_polars.experimental.dispatch import generate_ir_tasks, lower_ir_node + +if TYPE_CHECKING: + from collections.abc import Hashable, MutableMapping + + from cudf_polars.dsl.expr import NamedExpr + from cudf_polars.experimental.dispatch import LowerIRTransformer + from cudf_polars.experimental.parallel import PartitionInfo + from cudf_polars.typing import Schema + + +class Shuffle(IR): + """ + Shuffle multi-partition data. + + Notes + ----- + Only hash-based partitioning is supported (for now). + """ + + __slots__ = ("keys", "options") + _non_child = ("schema", "keys", "options") + keys: tuple[NamedExpr, ...] 
+ """Keys to shuffle on.""" + options: dict[str, Any] + """Shuffling options.""" + + def __init__( + self, + schema: Schema, + keys: tuple[NamedExpr, ...], + options: dict[str, Any], + df: IR, + ): + self.schema = schema + self.keys = keys + self.options = options + self._non_child_args = (schema, keys, options) + self.children = (df,) + + def get_hashable(self) -> Hashable: + """Hashable representation of the node.""" + return ( + type(self), + tuple(self.schema.items()), + self.keys, + json.dumps(self.options), + self.children, + ) + + @classmethod + def do_evaluate( + cls, + schema: Schema, + keys: tuple[NamedExpr, ...], + options: dict[str, Any], + df: DataFrame, + ): # pragma: no cover + """Evaluate and return a dataframe.""" + # Single-partition Shuffle evaluation is a no-op + return df + + +def _partition_dataframe( + df: DataFrame, + keys: tuple[NamedExpr, ...], + count: int, +) -> dict[int, DataFrame]: + """ + Partition an input DataFrame for shuffling. + + Notes + ----- + This utility only supports hash partitioning (for now). + + Parameters + ---------- + df + DataFrame to partition. + keys + Shuffle key(s). + count + Total number of output partitions. + + Returns + ------- + A dictionary mapping between int partition indices and + DataFrame fragments. + """ + # Hash the specified keys to calculate the output + # partition for each row + partition_map = plc.binaryop.binary_operation( + plc.hashing.murmurhash3_x86_32( + DataFrame([expr.evaluate(df) for expr in keys]).table + ), + plc.interop.from_arrow(pa.scalar(count, type="uint32")), + plc.binaryop.BinaryOperator.PYMOD, + plc.types.DataType(plc.types.TypeId.UINT32), + ) + + # Apply partitioning + t, offsets = plc.partitioning.partition( + df.table, + partition_map, + count, + ) + + # Split and return the partitioned result + return { + i: DataFrame.from_table( + split, + df.column_names, + ) + for i, split in enumerate(plc.copying.split(t, offsets[1:-1])) + } + + +def _simple_shuffle_graph( + name_out: str, + name_in: str, + keys: tuple[NamedExpr, ...], + count_in: int, + count_out: int, +) -> MutableMapping[Any, Any]: + """Make a simple all-to-all shuffle graph.""" + split_name = f"split-{name_out}" + inter_name = f"inter-{name_out}" + + graph: MutableMapping[Any, Any] = {} + for part_out in range(count_out): + _concat_list = [] + for part_in in range(count_in): + graph[(split_name, part_in)] = ( + _partition_dataframe, + (name_in, part_in), + keys, + count_out, + ) + _concat_list.append((inter_name, part_out, part_in)) + graph[_concat_list[-1]] = ( + operator.getitem, + (split_name, part_in), + part_out, + ) + graph[(name_out, part_out)] = (_concat, _concat_list) + return graph + + +@lower_ir_node.register(Shuffle) +def _( + ir: Shuffle, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + # Simple lower_ir_node handling for the default hash-based shuffle. + # More-complex logic (e.g. joining and sorting) should + # be handled separately. 
+    from cudf_polars.experimental.parallel import PartitionInfo
+
+    (child,) = ir.children
+
+    new_child, pi = rec(child)
+    if pi[new_child].count == 1 or ir.keys == pi[new_child].partitioned_on:
+        # Already shuffled
+        return new_child, pi
+    new_node = ir.reconstruct([new_child])
+    pi[new_node] = PartitionInfo(
+        # Default shuffle preserves partition count
+        count=pi[new_child].count,
+        # Add partitioned_on info
+        partitioned_on=ir.keys,
+    )
+    return new_node, pi
+
+
+@generate_ir_tasks.register(Shuffle)
+def _(
+    ir: Shuffle, partition_info: MutableMapping[IR, PartitionInfo]
+) -> MutableMapping[Any, Any]:
+    # Use a simple all-to-all shuffle graph.
+
+    # TODO: Optionally use rapidsmp.
+    return _simple_shuffle_graph(
+        get_key_name(ir),
+        get_key_name(ir.children[0]),
+        ir.keys,
+        partition_info[ir.children[0]].count,
+        partition_info[ir].count,
+    )
diff --git a/python/cudf_polars/tests/experimental/test_shuffle.py b/python/cudf_polars/tests/experimental/test_shuffle.py
new file mode 100644
index 00000000000..294557fd0d6
--- /dev/null
+++ b/python/cudf_polars/tests/experimental/test_shuffle.py
@@ -0,0 +1,66 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+from polars.testing import assert_frame_equal
+
+from cudf_polars import Translator
+from cudf_polars.dsl.expr import Col, NamedExpr
+from cudf_polars.experimental.parallel import evaluate_dask, lower_ir_graph
+from cudf_polars.experimental.shuffle import Shuffle
+
+
+@pytest.fixture(scope="module")
+def engine():
+    return pl.GPUEngine(
+        raise_on_fail=True,
+        executor="dask-experimental",
+        executor_options={"max_rows_per_partition": 4},
+    )
+
+
+@pytest.fixture(scope="module")
+def df():
+    return pl.LazyFrame(
+        {
+            "x": [1, 2, 3, 4, 5, 6, 7],
+            "y": [1, 1, 1, 1, 1, 1, 1],
+            "z": ["a", "b", "c", "d", "e", "f", "g"],
+        }
+    )
+
+
+def test_hash_shuffle(df, engine):
+    # Extract translated IR
+    qir = Translator(df._ldf.visit(), engine).translate_ir()
+
+    # Add first Shuffle node
+    keys = (NamedExpr("x", Col(qir.schema["x"], "x")),)
+    options = {}
+    qir1 = Shuffle(qir.schema, keys, options, qir)
+
+    # Add second Shuffle node (on the same keys)
+    qir2 = Shuffle(qir.schema, keys, options, qir1)
+
+    # Check that sequential shuffles on the same keys
+    # are replaced with a single shuffle node
+    partition_info = lower_ir_graph(qir2)[1]
+    assert len([node for node in partition_info if isinstance(node, Shuffle)]) == 1
+
+    # Add a third Shuffle node (on different keys)
+    keys2 = (NamedExpr("z", Col(qir.schema["z"], "z")),)
+    qir3 = Shuffle(qir2.schema, keys2, options, qir2)
+
+    # Check that we have an additional shuffle
+    # node after shuffling on different keys
+    partition_info = lower_ir_graph(qir3)[1]
+    assert len([node for node in partition_info if isinstance(node, Shuffle)]) == 2
+
+    # Check that Dask evaluation works
+    result = evaluate_dask(qir3).to_polars()
+    expect = df.collect(engine="cpu")
+    assert_frame_equal(result, expect, check_row_order=False)
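To make the shuffle mechanics in the patch above concrete: _partition_dataframe assigns each row to an output partition by hashing the key columns and taking the result modulo the partition count, and _simple_shuffle_graph then builds, for each output partition i, a concat of the i-th fragment produced by every input partition (an all-to-all exchange of count_in x count_out fragments). Below is a minimal CPU-side sketch of the row-to-partition mapping; it is illustrative only and not part of the patch: partition_rows is a hypothetical helper, and zlib.crc32 stands in for the murmurhash3_x86_32 call that runs on the GPU.

import zlib

def partition_rows(rows: list[tuple[str, int]], count: int) -> dict[int, list]:
    """Group rows by hash(key) % count, mirroring _partition_dataframe."""
    out: dict[int, list] = {i: [] for i in range(count)}
    for key, value in rows:
        # Deterministic stand-in for hashing the key column on the GPU
        out[zlib.crc32(key.encode()) % count].append((key, value))
    return out

# Rows with equal keys always land in the same output partition; that
# invariant is what PartitionInfo.partitioned_on records after lowering.
print(partition_rows([("a", 1), ("b", 2), ("a", 3), ("c", 4)], count=2))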
From 3d0725bfd112d5f8a0e6a91a82edfb2518a33f18 Mon Sep 17 00:00:00 2001
From: Gil Forsyth
Date: Wed, 29 Jan 2025 09:38:03 -0500
Subject: [PATCH 35/35] Add shellcheck to pre-commit and fix warnings (#17778)

`shellcheck` is a fast, static analysis tool for shell scripts. It's good
at flagging up unused variables, unintentional glob expansions, and other
potential execution and security headaches that arise from the wonders of
`bash` (and other shlangs).

This PR adds a `pre-commit` hook to run `shellcheck` on all of the
`sh-lang` files in the `ci/` directory, and the changes requested by
`shellcheck` to make the existing files pass the check.

xref: rapidsai/build-planning#135

Authors:
  - Gil Forsyth (https://github.com/gforsyth)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17778
---
 .pre-commit-config.yaml                     |  6 ++++++
 ci/build_docs.sh                            | 11 ++++++----
 ci/build_wheel_cudf.sh                      |  4 ++--
 ci/build_wheel_cudf_polars.sh               |  4 ++--
 ci/build_wheel_dask_cudf.sh                 |  4 ++--
 ci/build_wheel_libcudf.sh                   |  2 +-
 ci/build_wheel_pylibcudf.sh                 |  4 ++--
 ci/check_style.sh                           |  6 +++---
 ci/checks/doxygen.sh                        | 16 ++++++++-------
 ci/cpp_linters.sh                           |  4 +---
 ci/cudf_pandas_scripts/pandas-tests/diff.sh |  5 ++---
 ci/cudf_pandas_scripts/pandas-tests/run.sh  | 18 ++++++++---------
 ci/cudf_pandas_scripts/run_tests.sh         | 13 ++++++------
 .../run-library-tests.sh                    | 20 +++++++++----------
 .../third-party-integration/test.sh         | 16 ++++++---------
 ci/release/update-version.sh                | 16 +++++++--------
 ci/run_cudf_examples.sh                     |  6 ++++--
 ci/run_cudf_memcheck_ctests.sh              |  6 +++---
 ci/run_cudf_polars_polars_tests.sh          | 10 +++++++---
 ci/test_cpp.sh                              |  8 ++++----
 ci/test_cpp_memcheck.sh                     |  3 ++-
 ci/test_cudf_polars_polars_tests.sh         | 13 ++++++------
 ci/test_notebooks.sh                        | 12 ++++++-----
 ci/test_python_common.sh                    |  4 ++--
 ci/test_wheel_cudf.sh                       | 10 +++++-----
 ci/test_wheel_cudf_polars.sh                | 11 +++++-----
 ci/test_wheel_dask_cudf.sh                  | 10 +++++-----
 ci/utils/nbtest.sh                          | 20 +++++++++----------
 ci/validate_wheel.sh                        |  6 +++---
 29 files changed, 140 insertions(+), 128 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0a124cac2b3..08187dbd1f5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -170,6 +170,12 @@ repos:
       hooks:
         - id: rapids-dependency-file-generator
           args: ["--clean"]
+  - repo: https://github.com/shellcheck-py/shellcheck-py
+    rev: v0.10.0.1
+    hooks:
+      - id: shellcheck
+        args: ["--severity=warning"]
+        files: ^ci/

 default_language_version:
       python: python3
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index e8a054842c8..c24a58b0232 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -3,8 +3,10 @@

 set -euo pipefail

-export RAPIDS_VERSION="$(rapids-version)"
-export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+RAPIDS_VERSION="$(rapids-version)"
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+export RAPIDS_VERSION
+export RAPIDS_VERSION_MAJOR_MINOR

 rapids-logger "Create test conda environment"
 . /opt/conda/etc/profile.d/conda.sh
@@ -33,7 +35,8 @@ rapids-mamba-retry install \
   "cudf=${RAPIDS_VERSION}" \
   "dask-cudf=${RAPIDS_VERSION}"

-export RAPIDS_DOCS_DIR="$(mktemp -d)"
+RAPIDS_DOCS_DIR="$(mktemp -d)"
+export RAPIDS_DOCS_DIR

 EXITCODE=0
 trap "EXITCODE=1" ERR
@@ -41,7 +44,7 @@ set +e

 rapids-logger "Build CPP docs"
 pushd cpp/doxygen
-aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . || echo "Failed to download rmm Doxygen tag"
+aws s3 cp s3://rapidsai-docs/librmm/html/"${RAPIDS_VERSION_MAJOR_MINOR}"/rmm.tag . || echo "Failed to download rmm Doxygen tag"
|| echo "Failed to download rmm Doxygen tag" doxygen Doxyfile mkdir -p "${RAPIDS_DOCS_DIR}/libcudf/html" mv html/* "${RAPIDS_DOCS_DIR}/libcudf/html" diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index 32dd5a7fa62..0f373547660 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -1,11 +1,11 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -euo pipefail package_dir="python/cudf" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" # Downloads libcudf and pylibcudf wheels from this current build, # then ensures 'cudf' wheel builds always use the 'libcudf' and 'pylibcudf' just built in the same CI run. diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh index 38048125247..898ef3e3f45 100755 --- a/ci/build_wheel_cudf_polars.sh +++ b/ci/build_wheel_cudf_polars.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -euo pipefail @@ -8,5 +8,5 @@ package_dir="python/cudf_polars" ./ci/build_wheel.sh cudf-polars ${package_dir} ./ci/validate_wheel.sh ${package_dir} dist -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh index b0ae2f23abc..168625f1205 100755 --- a/ci/build_wheel_dask_cudf.sh +++ b/ci/build_wheel_dask_cudf.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -euo pipefail @@ -8,5 +8,5 @@ package_dir="python/dask_cudf" ./ci/build_wheel.sh dask-cudf ${package_dir} ./ci/validate_wheel.sh ${package_dir} dist -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index d80e4fef0d0..11fc5058500 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -6,7 +6,7 @@ set -euo pipefail package_name="libcudf" package_dir="python/libcudf" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" rapids-logger "Generating build requirements" diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index 5a8f3397714..9091f59d57b 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -1,11 +1,11 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -euo pipefail package_dir="python/pylibcudf" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" # Downloads libcudf wheel from this current build, # then ensures 'pylibcudf' wheel builds always use the 'libcudf' just built in the same CI run. 
diff --git a/ci/check_style.sh b/ci/check_style.sh index 634d8b0d702..70daff8f504 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. set -euo pipefail @@ -20,8 +20,8 @@ RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" FORMAT_FILE_URL="https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION_MAJOR_MINOR}/cmake-format-rapids-cmake.json" export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json -mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) -wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} +mkdir -p "$(dirname "${RAPIDS_CMAKE_FORMAT_FILE}")" +wget -O ${RAPIDS_CMAKE_FORMAT_FILE} "${FORMAT_FILE_URL}" # Run pre-commit checks pre-commit run --all-files --show-diff-on-failure diff --git a/ci/checks/doxygen.sh b/ci/checks/doxygen.sh index f4d97f91aa8..6870a1beac5 100755 --- a/ci/checks/doxygen.sh +++ b/ci/checks/doxygen.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. ############################### # cuDF doxygen warnings check # ############################### @@ -14,8 +14,8 @@ fi function version { echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4); }'; } # doxygen supported version 1.9.1 -DOXYGEN_VERSION=`doxygen --version` -if [ ! $(version "$DOXYGEN_VERSION") -eq $(version "1.9.1") ] ; then +DOXYGEN_VERSION=$(doxygen --version) +if [ ! "$(version "$DOXYGEN_VERSION")" -eq "$(version "1.9.1")" ] ; then echo -e "warning: Unsupported doxygen version $DOXYGEN_VERSION" echo -e "Expecting doxygen version 1.9.1" exit 0 @@ -23,16 +23,18 @@ fi # Set variables for doxygen # We can't use gha-tools' rapids-version and rapids-version-major-minor here because this script can run outside of CI -export RAPIDS_VERSION="$(sed -E -e "s/^([0-9]{2})\.([0-9]{2})\.([0-9]{2}).*$/\1.\2.\3/" VERSION)" -export RAPIDS_VERSION_MAJOR_MINOR="$(sed -E -e "s/^([0-9]{2})\.([0-9]{2})\.([0-9]{2}).*$/\1.\2/" VERSION)" +RAPIDS_VERSION="$(sed -E -e "s/^([0-9]{2})\.([0-9]{2})\.([0-9]{2}).*$/\1.\2.\3/" VERSION)" +RAPIDS_VERSION_MAJOR_MINOR="$(sed -E -e "s/^([0-9]{2})\.([0-9]{2})\.([0-9]{2}).*$/\1.\2/" VERSION)" +export RAPIDS_VERSION +export RAPIDS_VERSION_MAJOR_MINOR # Run doxygen, ignore missing tag files error TAG_ERROR1="error: Tag file '.*.tag' does not exist or is not a file. Skipping it..." TAG_ERROR2="error: cannot open tag file .*.tag for writing" -DOXYGEN_STDERR=`cd cpp/doxygen && { cat Doxyfile ; echo QUIET = YES; echo GENERATE_HTML = NO; } | doxygen - 2>&1 | sed "/\($TAG_ERROR1\|$TAG_ERROR2\)/d"` +DOXYGEN_STDERR=$(cd cpp/doxygen && { cat Doxyfile ; echo QUIET = YES; echo GENERATE_HTML = NO; } | doxygen - 2>&1 | sed "/\($TAG_ERROR1\|$TAG_ERROR2\)/d") RETVAL=$? -if [ "$RETVAL" != "0" ] || [ ! -z "$DOXYGEN_STDERR" ]; then +if [ "$RETVAL" != "0" ] || [ -n "$DOXYGEN_STDERR" ]; then echo -e "$DOXYGEN_STDERR" RETVAL=1 #because return value is not generated by doxygen 1.8.20 fi diff --git a/ci/cpp_linters.sh b/ci/cpp_linters.sh index 9702b055512..95b153061d2 100755 --- a/ci/cpp_linters.sh +++ b/ci/cpp_linters.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
set -euo pipefail @@ -20,8 +20,6 @@ set +u conda activate clang_tidy set -u -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" - source rapids-configure-sccache # Run the build via CMake, which will run clang-tidy when CUDF_STATIC_LINTERS is enabled. diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh index 5dbb4ba991c..bdf695d5eb4 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -13,7 +13,6 @@ rapids-logger "Github job name: ${GH_JOB_NAME}" rapids-logger "Rapids version: ${RAPIDS_FULL_VERSION}" PY_VER="310" -MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json rapids-logger "Fetching latest available results from nightly" @@ -22,7 +21,7 @@ COMPARE_ENV=$(tail -n 1 s3_output.txt) rapids-logger "Latest available results from nightly: ${COMPARE_ENV}" aws s3 cp "s3://rapids-downloads/${COMPARE_ENV}" main-results.json -aws s3 cp $PR_ARTIFACT pr-results.json +aws s3 cp "$PR_ARTIFACT" pr-results.json # Compute the diff and prepare job summary: python -m pip install pandas tabulate diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index e5cd4436a3a..15970a4185c 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -10,7 +10,7 @@ RAPIDS_FULL_VERSION=$(<./VERSION) rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch and rapids-version $RAPIDS_FULL_VERSION" rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" # Download the cudf, libcudf, and pylibcudf built in the previous step RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist @@ -19,9 +19,9 @@ RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ - "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,pandas-tests]" \ - "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ - "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" + "$(echo ./dist/cudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test,pandas-tests]" \ + "$(echo ./dist/libcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ + "$(echo ./dist/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ @@ -34,12 +34,12 @@ bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \ --max-worker-restart=3 \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-pandas.xml" \ --dist worksteal \ - --report-log=${PANDAS_TESTS_BRANCH}.json 2>&1 + --report-log="${PANDAS_TESTS_BRANCH}.json" 2>&1 SUMMARY_FILE_NAME=${PANDAS_TESTS_BRANCH}-${RAPIDS_FULL_VERSION}-results.json # summarize the results and save them to artifacts: -python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${SUMMARY_FILE_NAME} +python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/"${PANDAS_TESTS_BRANCH}.json" > "pandas-testing/${SUMMARY_FILE_NAME}" RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"} mkdir -p "${RAPIDS_ARTIFACTS_DIR}" -mv pandas-testing/${SUMMARY_FILE_NAME} ${RAPIDS_ARTIFACTS_DIR}/ -rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${SUMMARY_FILE_NAME} "${RAPIDS_ARTIFACTS_DIR}" +mv pandas-testing/"${SUMMARY_FILE_NAME}" "${RAPIDS_ARTIFACTS_DIR}"/ +rapids-upload-to-s3 "${RAPIDS_ARTIFACTS_DIR}"/"${SUMMARY_FILE_NAME}" "${RAPIDS_ARTIFACTS_DIR}" diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 61361fffb07..2c8d5969bdc 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -1,5 +1,5 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 @@ -10,7 +10,6 @@ RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"} mkdir -p "${RAPIDS_TESTS_DIR}" "${RAPIDS_COVERAGE_DIR}" DEPENDENCIES_PATH="dependencies.yaml" -package_name="pandas" # Use grep to find the line containing the package name and version constraint pandas_version_constraint=$(grep -oP "pandas>=\d+\.\d+,\<\d+\.\d+\.\d+dev\d+" $DEPENDENCIES_PATH) @@ -47,7 +46,7 @@ done if [ "$no_cudf" = true ]; then echo "Skipping cudf install" else - RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" # Download the cudf, libcudf, and pylibcudf built in the previous step RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist @@ -60,9 +59,9 @@ else python -m pip install \ -v \ --constraint ./constraints.txt \ - "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,cudf-pandas-tests]" \ - "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ - "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" + "$(echo ./dist/cudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test,cudf-pandas-tests]" \ + "$(echo ./dist/libcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ + "$(echo ./dist/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" fi python -m pip install ipykernel @@ -77,7 +76,7 @@ python -m pytest -p cudf.pandas \ --cov-report=term \ ./python/cudf/cudf_pandas_tests/ -output=$(python ci/cudf_pandas_scripts/fetch_pandas_versions.py $pandas_version_constraint) +output=$(python ci/cudf_pandas_scripts/fetch_pandas_versions.py "$pandas_version_constraint") # Convert the comma-separated list into an array IFS=',' read -r -a versions <<< "$output" diff --git a/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh b/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh index d44d25d658c..ce3291b864a 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh @@ -1,10 +1,10 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 cleanup() { - rm ${TEST_DIR}/results-*.pickle + rm "${TEST_DIR}"/results-*.pickle } trap cleanup EXIT @@ -19,21 +19,21 @@ runtest() { fi pytest \ - $plugin \ + "$plugin" \ -v \ --continue-on-collection-errors \ --cache-clear \ - --numprocesses=${NUM_PROCESSES} \ + --numprocesses="${NUM_PROCESSES}" \ --dist=worksteal \ - ${TEST_DIR}/test_${lib}*.py + "${TEST_DIR}"/test_"${lib}"*.py } main() { local lib=$1 # generation phase - runtest ${lib} "gold" - runtest ${lib} "cudf" + runtest "${lib}" "gold" + runtest "${lib}" "cudf" # assertion phase pytest \ @@ -42,9 +42,9 @@ main() { -v \ --continue-on-collection-errors \ --cache-clear \ - --numprocesses=${NUM_PROCESSES} \ + --numprocesses="${NUM_PROCESSES}" \ --dist=worksteal \ - ${TEST_DIR}/test_${lib}*.py + "${TEST_DIR}"/test_"${lib}"*.py } -main $@ +main "$@" diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh index 30e3ffc9a43..43ed3594917 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/test.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh @@ -1,22 +1,18 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.

 # Common setup steps shared by Python test jobs

 set -euo pipefail

-write_output() {
-    local key="$1"
-    local value="$2"
-    echo "$key=$value" | tee --append "${GITHUB_OUTPUT:-/dev/null}"
-}

 extract_lib_from_dependencies_yaml() {
     local file=$1
     # Parse all keys in dependencies.yaml under the "files" section,
     # extract all the keys that start with "test_", and extract the rest
-    local extracted_libs="$(yq -o json $file | jq -rc '.files | with_entries(select(.key | contains("test_"))) | keys | map(sub("^test_"; ""))')"
-    echo $extracted_libs
+    local extracted_libs
+    extracted_libs="$(yq -o json "$file" | jq -rc '.files | with_entries(select(.key | contains("test_"))) | keys | map(sub("^test_"; ""))')"
+    echo "$extracted_libs"
 }

 main() {
@@ -40,7 +36,7 @@ main() {
     rapids-dependency-file-generator \
       --config "$dependencies_yaml" \
       --output conda \
-      --file-key test_${lib} \
+      --file-key "test_${lib}" \
       --matrix "cuda=${CUDA_MAJOR};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml

     rapids-mamba-retry env create --yes -f env.yaml -n test
@@ -74,7 +70,7 @@ main() {
     trap "EXITCODE=1" ERR
     set +e

-    TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh ${lib}
+    TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh "${lib}"

     set -e
     rapids-logger "Test script exiting with value: ${EXITCODE}"
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 95f36653c2c..f4f31dfbb6f 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 ########################
 # cuDF Version Updater #
 ########################
@@ -13,19 +13,17 @@ NEXT_FULL_TAG=$1

 # Get current version
 CURRENT_TAG=$(git tag --merged HEAD | grep -xE '^v.*' | sort --version-sort | tail -n 1 | tr -d 'v')
-CURRENT_MAJOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[1]}')
-CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}')
-CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}')
+CURRENT_MAJOR=$(echo "$CURRENT_TAG" | awk '{split($0, a, "."); print a[1]}')
+CURRENT_MINOR=$(echo "$CURRENT_TAG" | awk '{split($0, a, "."); print a[2]}')
 CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR}

 # Get <major>.<minor> for next version
-NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}')
-NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}')
-NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}')
+NEXT_MAJOR=$(echo "$NEXT_FULL_TAG" | awk '{split($0, a, "."); print a[1]}')
+NEXT_MINOR=$(echo "$NEXT_FULL_TAG" | awk '{split($0, a, "."); print a[2]}')
+NEXT_PATCH=$(echo "$NEXT_FULL_TAG" | awk '{split($0, a, "."); print a[3]}')
 NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}

 # Need to distutils-normalize the versions for some use cases
-CURRENT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${CURRENT_SHORT_TAG}'))")
 NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))")
 PATCH_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_PATCH}'))")

@@ -33,7 +31,7 @@ echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"

 # Inplace sed replace; workaround for Linux and Mac
 function sed_runner() {
-    sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak
+    sed -i.bak ''"$1"'' "$2" && rm -f "${2}".bak
 }

 # Centralized version file update
diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh
index 2439af5b644..adacc304139 100755
--- a/ci/run_cudf_examples.sh
+++ b/ci/run_cudf_examples.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.

 set -uo pipefail

@@ -7,9 +7,11 @@ EXITCODE=0
 trap "EXITCODE=1" ERR

 # Support customizing the examples' install location
-cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/examples/libcudf/";
+cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/examples/libcudf/" || exit

 # compute-sanitizer not available before CUDA 11.6
+# Using -lt with decimals doesn't work in `bash` _except_ when comparing version strings
+# shellcheck disable=SC2072
 if [[ "${RAPIDS_CUDA_VERSION%.*}" < "11.6" ]]; then
     echo "compute-sanitizer unavailable pre 11.6"
     exit 0
diff --git a/ci/run_cudf_memcheck_ctests.sh b/ci/run_cudf_memcheck_ctests.sh
index 391579b6c59..fac95b5ad8d 100755
--- a/ci/run_cudf_memcheck_ctests.sh
+++ b/ci/run_cudf_memcheck_ctests.sh
@@ -7,17 +7,17 @@ EXITCODE=0
 trap "EXITCODE=1" ERR

 # Support customizing the ctests' install location
-cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/gtests/libcudf/";
+cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/gtests/libcudf/" || exit

 export GTEST_CUDF_RMM_MODE=cuda
 export GTEST_BRIEF=1
 # compute-sanitizer bug 4553815
 export LIBCUDF_MEMCHECK_ENABLED=1
 for gt in ./*_TEST ; do
-    test_name=$(basename ${gt})
+    test_name=$(basename "${gt}")
     # Run gtests with compute-sanitizer
     echo "Running compute-sanitizer on $test_name"
-    compute-sanitizer --tool memcheck ${gt} "$@"
+    compute-sanitizer --tool memcheck "${gt}" "$@"
 done
 unset GTEST_BRIEF
 unset GTEST_CUDF_RMM_MODE
diff --git a/ci/run_cudf_polars_polars_tests.sh b/ci/run_cudf_polars_polars_tests.sh
index c851f65d4f6..dfabe6093a9 100755
--- a/ci/run_cudf_polars_polars_tests.sh
+++ b/ci/run_cudf_polars_polars_tests.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
set -euo pipefail @@ -38,7 +38,11 @@ else fi fi -DESELECTED_TESTS=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}") +DESELECTED_TESTS_STR=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}") + +# Don't quote the `DESELECTED_...` variable because `pytest` can't handle +# multiple quoted arguments inline +# shellcheck disable=SC2086 python -m pytest \ --import-mode=importlib \ --cache-clear \ @@ -46,6 +50,6 @@ python -m pytest \ -p cudf_polars.testing.plugin \ -v \ --tb=native \ - ${DESELECTED_TESTS} \ + $DESELECTED_TESTS_STR \ "$@" \ py-polars/tests diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 7865849bb74..8f33a78948b 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Support invoking test_cpp.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ @@ -17,13 +17,13 @@ rapids-logger "Run libcudf gtests" ./ci/run_cudf_ctests.sh -j20 SUITEERROR=$? -if (( ${SUITEERROR} == 0 )); then +if (( SUITEERROR == 0 )); then rapids-logger "Run libcudf examples" ./ci/run_cudf_examples.sh SUITEERROR=$? fi -if (( ${SUITEERROR} == 0 )); then +if (( SUITEERROR == 0 )); then rapids-logger "Run libcudf_kafka gtests" ./ci/run_cudf_kafka_ctests.sh -j20 SUITEERROR=$? @@ -32,7 +32,7 @@ fi # Ensure that benchmarks are runnable rapids-logger "Run tests of libcudf benchmarks" -if (( ${SUITEERROR} == 0 )); then +if (( SUITEERROR == 0 )); then ./ci/run_cudf_benchmark_smoketests.sh SUITEERROR=$? fi diff --git a/ci/test_cpp_memcheck.sh b/ci/test_cpp_memcheck.sh index fda11c64155..6317e7e2730 100755 --- a/ci/test_cpp_memcheck.sh +++ b/ci/test_cpp_memcheck.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. # Support invoking test_cpp.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ @@ -11,4 +11,5 @@ rapids-logger "Memcheck gtests with rmm_mode=cuda" ./ci/run_cudf_memcheck_ctests.sh && EXITCODE=$? || EXITCODE=$?; rapids-logger "Test script exiting with value: $EXITCODE" +# shellcheck disable=SC2086 exit ${EXITCODE} diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index fefe26984cb..2f6fd6faca7 100755 --- a/ci/test_cudf_polars_polars_tests.sh +++ b/ci/test_cudf_polars_polars_tests.sh @@ -1,11 +1,11 @@ #!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
set -eou pipefail rapids-logger "Download wheels" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist # Download libcudf and pylibcudf built in the previous step @@ -15,19 +15,20 @@ RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels rapids-logger "Install libcudf, pylibcudf and cudf_polars" python -m pip install \ -v \ - "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ - "$(echo ./local-libcudf-dep/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ - "$(echo ./local-pylibcudf-dep/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" + "$(echo ./dist/cudf_polars_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" \ + "$(echo ./local-libcudf-dep/libcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ + "$(echo ./local-pylibcudf-dep/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') rapids-logger "Clone polars to ${TAG}" -git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1 +git clone https://github.com/pola-rs/polars.git --branch "${TAG}" --depth 1 # Install requirements for running polars tests rapids-logger "Install polars test requirements" python -m pip install -r polars/py-polars/requirements-dev.txt -r polars/py-polars/requirements-ci.txt +# shellcheck disable=SC2317 function set_exitcode() { EXITCODE=$? diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh index 4197dc5617f..329246ef9d7 100755 --- a/ci/test_notebooks.sh +++ b/ci/test_notebooks.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. set -euo pipefail @@ -45,11 +45,13 @@ SKIPNBS="performance-comparisons.ipynb" EXITCODE=0 trap "EXITCODE=1" ERR set +e +# Loops over `find` are fragile but this seems to be working +# shellcheck disable=SC2044 for nb in $(find . -name "*.ipynb"); do - nbBasename=$(basename ${nb}) + nbBasename=$(basename "${nb}") # Skip all notebooks that use dask (in the code or even in their name) - if ((echo ${nb} | grep -qi dask) || \ - (grep -q dask ${nb})); then + if (echo "${nb}" | grep -qi dask) || \ + (grep -q dask "${nb}"); then echo "--------------------------------------------------------------------------------" echo "SKIPPING: ${nb} (suspected Dask usage, not currently automatable)" echo "--------------------------------------------------------------------------------" @@ -59,7 +61,7 @@ for nb in $(find . -name "*.ipynb"); do echo "--------------------------------------------------------------------------------" else nvidia-smi - ${NBTEST} ${nbBasename} + ${NBTEST} "${nbBasename}" fi done diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh index 4327bfff3af..c6c6a3957b9 100755 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
# Common setup steps shared by Python test jobs @@ -15,7 +15,7 @@ ENV_YAML_DIR="$(mktemp -d)" FILE_KEY=$1 rapids-dependency-file-generator \ --output conda \ - --file-key ${FILE_KEY} \ + --file-key "${FILE_KEY}" \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ | tee "${ENV_YAML_DIR}/env.yaml" diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index ce12744c9e3..6eaaf7bb657 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -1,9 +1,9 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -eou pipefail -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" # Download the cudf, libcudf, and pylibcudf built in the previous step RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist @@ -19,9 +19,9 @@ rapids-generate-pip-constraints py_test_cudf ./constraints.txt python -m pip install \ -v \ --constraint ./constraints.txt \ - "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ - "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ - "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" + "$(echo ./dist/cudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" \ + "$(echo ./dist/libcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ + "$(echo ./dist/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 3f818867d49..5827077e826 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -1,11 +1,11 @@ #!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. set -eou pipefail rapids-logger "Download wheels" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist # Download libcudf and pylibcudf built in the previous step @@ -21,12 +21,13 @@ rapids-generate-pip-constraints py_test_cudf_polars ./constraints.txt python -m pip install \ -v \ --constraint ./constraints.txt \ - "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,experimental]" \ - "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ - "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" + "$(echo ./dist/cudf_polars_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test,experimental]" \ + "$(echo ./dist/libcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ + "$(echo ./dist/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" rapids-logger "Run cudf_polars tests" +# shellcheck disable=SC2317 function set_exitcode() { EXITCODE=$? 
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 44f430ce98d..41c59ef59d1 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -3,7 +3,7 @@ set -eou pipefail -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist # Download the cudf, libcudf, and pylibcudf built in the previous step @@ -20,10 +20,10 @@ rapids-generate-pip-constraints py_test_dask_cudf ./constraints.txt python -m pip install \ -v \ --constraint ./constraints.txt \ - "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ - "$(echo ./dist/dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ - "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ - "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" + "$(echo ./dist/cudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ + "$(echo ./dist/dask_cudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" \ + "$(echo ./dist/libcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ + "$(echo ./dist/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ diff --git a/ci/utils/nbtest.sh b/ci/utils/nbtest.sh index 2a94e2d0695..9fc37d25697 100755 --- a/ci/utils/nbtest.sh +++ b/ci/utils/nbtest.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. MAGIC_OVERRIDE_CODE=" def my_run_line_magic(*args, **kwargs): @@ -24,22 +24,22 @@ get_ipython().run_cell_magic=my_run_cell_magic NO_COLORS=--colors=NoColor EXITCODE=0 NBTMPDIR="$WORKSPACE/tmp" -mkdir -p ${NBTMPDIR} +mkdir -p "${NBTMPDIR}" -for nb in $*; do - NBFILENAME=$1 +for nb in "$@"; do + NBFILENAME=$nb NBNAME=${NBFILENAME%.*} NBNAME=${NBNAME##*/} NBTESTSCRIPT=${NBTMPDIR}/${NBNAME}-test.py shift echo -------------------------------------------------------------------------------- - echo STARTING: ${NBNAME} + echo STARTING: "${NBNAME}" echo -------------------------------------------------------------------------------- - jupyter nbconvert --to script ${NBFILENAME} --output ${NBTMPDIR}/${NBNAME}-test - echo "${MAGIC_OVERRIDE_CODE}" > ${NBTMPDIR}/tmpfile - cat ${NBTESTSCRIPT} >> ${NBTMPDIR}/tmpfile - mv ${NBTMPDIR}/tmpfile ${NBTESTSCRIPT} + jupyter nbconvert --to script "${NBFILENAME}" --output "${NBTMPDIR}"/"${NBNAME}"-test + echo "${MAGIC_OVERRIDE_CODE}" > "${NBTMPDIR}"/tmpfile + cat "${NBTESTSCRIPT}" >> "${NBTMPDIR}"/tmpfile + mv "${NBTMPDIR}"/tmpfile "${NBTESTSCRIPT}" echo "Running \"ipython ${NO_COLORS} ${NBTESTSCRIPT}\" on $(date)" echo @@ -47,7 +47,7 @@ for nb in $*; do NBEXITCODE=$? echo EXIT CODE: ${NBEXITCODE} echo - EXITCODE=$((EXITCODE | ${NBEXITCODE})) + EXITCODE=$((EXITCODE | "${NBEXITCODE}")) done exit ${EXITCODE} diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh index 5910a5c59fe..40bb27feeb3 100755 --- a/ci/validate_wheel.sh +++ b/ci/validate_wheel.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
set -euo pipefail @@ -12,10 +12,10 @@ rapids-logger "validate packages with 'pydistcheck'" pydistcheck \ --inspect \ - "$(echo ${wheel_dir_relative_path}/*.whl)" + "$(echo "${wheel_dir_relative_path}"/*.whl)" rapids-logger "validate packages with 'twine'" twine check \ --strict \ - "$(echo ${wheel_dir_relative_path}/*.whl)" + "$(echo "${wheel_dir_relative_path}"/*.whl)"
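With the hook in place, the same check can be reproduced locally before pushing. A usage sketch; the hook id and severity come from the `.pre-commit-config.yaml` entry added above:

# Run only the new shellcheck hook across the repository:
pre-commit run shellcheck --all-files

# Or call shellcheck directly on the top-level CI scripts at the same
# severity the hook uses:
shellcheck --severity=warning ci/*.sh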