Skip to content

Commit

Permalink
[Data] Relax type check in add_column (#48918)
Browse files Browse the repository at this point in the history
Previously, you could add a column with a list like this:
```
ds.add_column("zeros", lambda batch: [0] * len(batch))
```

However, after #48140, this
behavior isn't supported.

To avoid breaking tests and user code, this PR re-adds support for
lists.

---------

Signed-off-by: Balaji Veeramani <[email protected]>
  • Loading branch information
bveeramani authored Nov 25, 2024
1 parent 61b033d commit 2514aff
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 18 deletions.
20 changes: 9 additions & 11 deletions python/ray/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import logging
import time
import warnings
from collections.abc import Sequence
from typing import (
TYPE_CHECKING,
Any,
Expand Down Expand Up @@ -764,19 +765,20 @@ def add_column(
f"got: {batch_format}"
)

def _raise_duplicate_column_error(col: str):
raise ValueError(f"Trying to add an existing column with name {col!r}")

def add_column(batch: DataBatch) -> DataBatch:
column = fn(batch)
if batch_format == "pandas":
import pandas as pd

assert isinstance(column, pd.Series), (
assert isinstance(column, (pd.Series, Sequence)), (
f"For pandas batch format, the function must return a pandas "
f"Series, got: {type(column)}"
f"Series or sequence, got: {type(column)}"
)
if col in batch:
raise ValueError(
f"Trying to add an existing column with name" f" {col}"
)
_raise_duplicate_column_error(col)
batch.loc[:, col] = column
return batch
elif batch_format == "pyarrow":
Expand All @@ -798,9 +800,7 @@ def add_column(batch: DataBatch) -> DataBatch:
# Append the column to the table
return batch.append_column(col, column)
else:
raise ValueError(
f"Trying to add an existing column with name {col}"
)
_raise_duplicate_column_error(col)

else:
# batch format is assumed to be numpy since we checked at the
Expand All @@ -810,9 +810,7 @@ def add_column(batch: DataBatch) -> DataBatch:
f"numpy.ndarray, got: {type(column)}"
)
if col in batch:
raise ValueError(
f"Trying to add an existing column with name" f" {col}"
)
_raise_duplicate_column_error(col)
batch[col] = column
return batch

Expand Down
16 changes: 9 additions & 7 deletions python/ray/data/tests/test_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def test_add_column(ray_start_regular_shared):
# Adding a column that is already there should result in an error
with pytest.raises(
ray.exceptions.UserCodeException,
match="Trying to add an existing column with name id",
match="Trying to add an existing column with name 'id'",
):
ds = ray.data.range(5).add_column(
"id", lambda x: pc.add(x["id"], 1), batch_format="pyarrow"
Expand All @@ -362,7 +362,7 @@ def test_add_column(ray_start_regular_shared):

# Adding a column in the wrong format should result in an error
with pytest.raises(
ray.exceptions.UserCodeException, match="For pyarrow batch " "format"
ray.exceptions.UserCodeException, match="For pyarrow batch format"
):
ds = ray.data.range(5).add_column("id", lambda x: [1], batch_format="pyarrow")
assert ds.take(2) == [{"id": 1}, {"id": 2}]
Expand All @@ -381,7 +381,7 @@ def test_add_column(ray_start_regular_shared):
# Adding a column that is already there should result in an error
with pytest.raises(
ray.exceptions.UserCodeException,
match="Trying to add an existing column with name id",
match="Trying to add an existing column with name 'id'",
):
ds = ray.data.range(5).add_column(
"id", lambda x: np.add(x["id"], 1), batch_format="numpy"
Expand All @@ -390,7 +390,7 @@ def test_add_column(ray_start_regular_shared):

# Adding a column in the wrong format should result in an error
with pytest.raises(
ray.exceptions.UserCodeException, match="For numpy batch " "format"
ray.exceptions.UserCodeException, match="For numpy batch format"
):
ds = ray.data.range(5).add_column("id", lambda x: [1], batch_format="numpy")
assert ds.take(2) == [{"id": 1}, {"id": 2}]
Expand All @@ -405,16 +405,18 @@ def test_add_column(ray_start_regular_shared):
# Adding a column that is already there should result in an error
with pytest.raises(
ray.exceptions.UserCodeException,
match="Trying to add an existing column with name id",
match="Trying to add an existing column with name 'id'",
):
ds = ray.data.range(5).add_column("id", lambda x: x["id"] + 1)
assert ds.take(2) == [{"id": 1}, {"id": 2}]

# Adding a column in the wrong format should result in an error
with pytest.raises(
ray.exceptions.UserCodeException, match="For pandas batch " "format"
ray.exceptions.UserCodeException, match="For pandas batch format"
):
ds = ray.data.range(5).add_column("id", lambda x: [1], batch_format="pandas")
ds = ray.data.range(5).add_column(
"id", lambda x: np.array([1]), batch_format="pandas"
)
assert ds.take(2) == [{"id": 1}, {"id": 2}]

with pytest.raises(ValueError):
Expand Down

0 comments on commit 2514aff

Please sign in to comment.