Skip to content

Commit

Permalink
Merge pull request #17868 from rapidsai/branch-25.02
Browse files Browse the repository at this point in the history
Forward-merge branch-25.02 into branch-25.04
  • Loading branch information
GPUtester authored Jan 29, 2025
2 parents 9937424 + 1417f36 commit f4ed519
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 65 deletions.
43 changes: 26 additions & 17 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,14 @@
from cudf.core.dtypes import IntervalDtype
from cudf.core.join._join_helpers import _match_join_keys
from cudf.core.mixins import BinaryOperand
from cudf.core.scalar import pa_scalar_to_plc_scalar
from cudf.core.single_column_frame import SingleColumnFrame
from cudf.utils.docutils import copy_docstring
from cudf.utils.dtypes import (
SIZE_TYPE_DTYPE,
_maybe_convert_to_default_type,
cudf_dtype_from_pa_type,
cudf_dtype_to_pa_type,
find_common_type,
is_mixed_with_object_dtype,
)
Expand Down Expand Up @@ -3346,50 +3349,56 @@ def interval_range(
"freq, exactly three must be specified"
)

start = cudf.Scalar(start) if start is not None else start
end = cudf.Scalar(end) if end is not None else end
if periods is not None and not cudf.api.types.is_integer(periods):
warnings.warn(
"Non-integer 'periods' in cudf.date_range, and cudf.interval_range"
" are deprecated and will raise in a future version.",
FutureWarning,
)
periods = cudf.Scalar(int(periods)) if periods is not None else periods
freq = cudf.Scalar(freq) if freq is not None else freq

if start is None:
start = end - freq * periods
elif freq is None:
quotient, remainder = divmod((end - start).value, periods.value)
quotient, remainder = divmod(end - start, periods)
if remainder:
freq = (end - start) / periods
else:
freq = cudf.Scalar(int(quotient))
freq = int(quotient)
elif periods is None:
periods = cudf.Scalar(int((end - start) / freq))
periods = int((end - start) / freq)
elif end is None:
end = start + periods * freq

pa_start = pa.scalar(start)
pa_end = pa.scalar(end)
pa_freq = pa.scalar(freq)

if any(
not _is_non_decimal_numeric_dtype(x.dtype)
for x in (start, periods, freq, end)
not _is_non_decimal_numeric_dtype(cudf_dtype_from_pa_type(x.type))
for x in (pa_start, pa.scalar(periods), pa_freq, pa_end)
):
raise ValueError("start, end, periods, freq must be numeric values.")

periods = periods.astype("int64")
common_dtype = find_common_type((start.dtype, freq.dtype, end.dtype))
start = start.astype(common_dtype)
freq = freq.astype(common_dtype)
common_dtype = find_common_type(
(
cudf_dtype_from_pa_type(pa_start.type),
cudf_dtype_from_pa_type(pa_freq.type),
cudf_dtype_from_pa_type(pa_end.type),
)
)
pa_start = pa_start.cast(cudf_dtype_to_pa_type(common_dtype))
pa_freq = pa_freq.cast(cudf_dtype_to_pa_type(common_dtype))

with acquire_spill_lock():
bin_edges = libcudf.column.Column.from_pylibcudf(
plc.filling.sequence(
size=periods + 1,
init=start.device_value,
step=freq.device_value,
init=pa_scalar_to_plc_scalar(pa_start),
step=pa_scalar_to_plc_scalar(pa_freq),
)
)
return IntervalIndex.from_breaks(bin_edges, closed=closed, name=name)
return IntervalIndex.from_breaks(
bin_edges.astype(common_dtype), closed=closed, name=name
)


class IntervalIndex(Index):
Expand Down
71 changes: 23 additions & 48 deletions python/cudf/cudf/tests/indexes/test_interval.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
import numpy as np
import pandas as pd
import pyarrow as pa
Expand Down Expand Up @@ -30,7 +30,6 @@ def test_interval_to_arrow():
np.int64,
np.float32,
np.float64,
cudf.Scalar,
]

PERIODS_TYPES = [
Expand All @@ -39,10 +38,23 @@ def test_interval_to_arrow():
np.int16,
np.int32,
np.int64,
cudf.Scalar,
]


def assert_with_pandas_2_bug(pindex, gindex):
    """Assert that a pandas and a cudf interval index hold equal values.

    pandas upcasts to 64 bit
    (https://github.com/pandas-dev/pandas/issues/57268), so when the cudf
    interval subtype is a float or a signed integer it is first cast to the
    64-bit subtype of the same kind, keeping the ``closed`` side unchanged.
    Any other subtype kind is compared as-is.
    """
    # Subtype kind code -> 64-bit subtype to cast to before comparing.
    widened_subtypes = {"f": "float64", "i": "int64"}
    target_subtype = widened_subtypes.get(gindex.dtype.subtype.kind)
    if target_subtype is not None:
        widened_dtype = cudf.IntervalDtype(
            subtype=target_subtype, closed=gindex.dtype.closed
        )
        gindex = gindex.astype(widened_dtype)

    # Both sides are wrapped in Series so that check_dtype can be used.
    expected = pd.Series(pindex)
    actual = cudf.Series(gindex)
    assert_eq(expected, actual, check_dtype=False)


@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"])
@pytest.mark.parametrize("start", [0, 1, 2, 3])
@pytest.mark.parametrize("end", [4, 5, 6, 7])
Expand All @@ -57,9 +69,7 @@ def test_interval_range_basic(start, end, closed):
@pytest.mark.parametrize("end_t", INTERVAL_BOUNDARY_TYPES)
def test_interval_range_dtype_basic(start_t, end_t):
start, end = start_t(24), end_t(42)
start_val = start.value if isinstance(start, cudf.Scalar) else start
end_val = end.value if isinstance(end, cudf.Scalar) else end
pindex = pd.interval_range(start=start_val, end=end_val, closed="left")
pindex = pd.interval_range(start=start, end=end, closed="left")
gindex = cudf.interval_range(start=start, end=end, closed="left")

assert_eq(pindex, gindex)
Expand Down Expand Up @@ -91,27 +101,11 @@ def test_interval_range_freq_basic(start, end, freq, closed):
@pytest.mark.parametrize("freq_t", INTERVAL_BOUNDARY_TYPES)
def test_interval_range_freq_basic_dtype(start_t, end_t, freq_t):
start, end, freq = start_t(5), end_t(70), freq_t(3)
start_val = start.value if isinstance(start, cudf.Scalar) else start
end_val = end.value if isinstance(end, cudf.Scalar) else end
freq_val = freq.value if isinstance(freq, cudf.Scalar) else freq
pindex = pd.interval_range(
start=start_val, end=end_val, freq=freq_val, closed="left"
)
pindex = pd.interval_range(start=start, end=end, freq=freq, closed="left")
gindex = cudf.interval_range(
start=start, end=end, freq=freq, closed="left"
)
if gindex.dtype.subtype.kind == "f":
gindex = gindex.astype(
cudf.IntervalDtype(subtype="float64", closed=gindex.dtype.closed)
)
elif gindex.dtype.subtype.kind == "i":
gindex = gindex.astype(
cudf.IntervalDtype(subtype="int64", closed=gindex.dtype.closed)
)

# pandas upcasts to 64 bit https://github.com/pandas-dev/pandas/issues/57268
# using Series to use check_dtype
assert_eq(pd.Series(pindex), cudf.Series(gindex), check_dtype=False)
assert_with_pandas_2_bug(pindex, gindex)


@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"])
Expand All @@ -134,13 +128,8 @@ def test_interval_range_periods_basic(start, end, periods, closed):
@pytest.mark.parametrize("periods_t", PERIODS_TYPES)
def test_interval_range_periods_basic_dtype(start_t, end_t, periods_t):
start, end, periods = start_t(0), end_t(4), periods_t(1)
start_val = start.value if isinstance(start, cudf.Scalar) else start
end_val = end.value if isinstance(end, cudf.Scalar) else end
periods_val = (
periods.value if isinstance(periods, cudf.Scalar) else periods
)
pindex = pd.interval_range(
start=start_val, end=end_val, periods=periods_val, closed="left"
start=start, end=end, periods=periods, closed="left"
)
gindex = cudf.interval_range(
start=start, end=end, periods=periods, closed="left"
Expand Down Expand Up @@ -188,19 +177,13 @@ def test_interval_range_periods_freq_end(end, freq, periods, closed):
@pytest.mark.parametrize("end_t", INTERVAL_BOUNDARY_TYPES)
def test_interval_range_periods_freq_end_dtype(periods_t, freq_t, end_t):
periods, freq, end = periods_t(2), freq_t(3), end_t(10)
freq_val = freq.value if isinstance(freq, cudf.Scalar) else freq
end_val = end.value if isinstance(end, cudf.Scalar) else end
periods_val = (
periods.value if isinstance(periods, cudf.Scalar) else periods
)
pindex = pd.interval_range(
end=end_val, freq=freq_val, periods=periods_val, closed="left"
end=end, freq=freq, periods=periods, closed="left"
)
gindex = cudf.interval_range(
end=end, freq=freq, periods=periods, closed="left"
)

assert_eq(pindex, gindex)
assert_with_pandas_2_bug(pindex, gindex)


@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"])
Expand All @@ -223,21 +206,13 @@ def test_interval_range_periods_freq_start(start, freq, periods, closed):
@pytest.mark.parametrize("start_t", INTERVAL_BOUNDARY_TYPES)
def test_interval_range_periods_freq_start_dtype(periods_t, freq_t, start_t):
periods, freq, start = periods_t(2), freq_t(3), start_t(9)
freq_val = freq.value if isinstance(freq, cudf.Scalar) else freq
start_val = start.value if isinstance(start, cudf.Scalar) else start
periods_val = (
periods.value if isinstance(periods, cudf.Scalar) else periods
)
pindex = pd.interval_range(
start=start_val, freq=freq_val, periods=periods_val, closed="left"
start=start, freq=freq, periods=periods, closed="left"
)
gindex = cudf.interval_range(
start=start, freq=freq, periods=periods, closed="left"
)

# pandas upcasts to 64 bit https://github.com/pandas-dev/pandas/issues/57268
# using Series to use check_dtype
assert_eq(pd.Series(pindex), cudf.Series(gindex), check_dtype=False)
assert_with_pandas_2_bug(pindex, gindex)


@pytest.mark.parametrize("closed", ["right", "left", "both", "neither"])
Expand Down

0 comments on commit f4ed519

Please sign in to comment.