Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement first-class List type #60629

Draft
wants to merge 21 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
PeriodDtype,
IntervalDtype,
DatetimeTZDtype,
ListDtype,
StringDtype,
BooleanDtype,
# missing
Expand Down Expand Up @@ -261,6 +262,7 @@
"Interval",
"IntervalDtype",
"IntervalIndex",
"ListDtype",
"MultiIndex",
"NaT",
"NamedAgg",
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -834,6 +834,8 @@ cpdef ndarray[object] ensure_string_array(
if isinstance(val, bytes):
# GH#49658 discussion of desired behavior here
result[i] = val.decode()
elif util.is_array(val):
result[i] = str(val.tolist())
elif not util.is_float_object(val):
# f"{val}" is faster than str(val)
result[i] = f"{val}"
Expand Down
6 changes: 6 additions & 0 deletions pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
TimedeltaArray,
)
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
from pandas.core.arrays.list_ import ListDtype
from pandas.core.arrays.string_ import StringDtype
from pandas.core.indexes.api import safe_sort_index

Expand Down Expand Up @@ -824,6 +825,11 @@ def assert_extension_array_equal(
[np.isnan(val) for val in right._ndarray[right_na]] # type: ignore[attr-defined]
), "wrong missing value sentinels"

# TODO: not every array type may be convertible to NumPy; should catch here
if isinstance(left.dtype, ListDtype) and isinstance(right.dtype, ListDtype):
assert left._pa_array == right._pa_array
return

left_valid = left[~left_na].to_numpy(dtype=object)
right_valid = right[~right_na].to_numpy(dtype=object)
if check_exact:
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
UInt32Dtype,
UInt64Dtype,
)
from pandas.core.arrays.list_ import ListDtype
from pandas.core.arrays.string_ import StringDtype
from pandas.core.construction import array # noqa: ICN001
from pandas.core.flags import Flags
Expand Down Expand Up @@ -103,6 +104,7 @@
"Interval",
"IntervalDtype",
"IntervalIndex",
"ListDtype",
"MultiIndex",
"NaT",
"NamedAgg",
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/arrays/arrow/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

from pandas.core.dtypes.common import is_list_like

from pandas.core.arrays.list_ import ListDtype

if not pa_version_under10p1:
import pyarrow as pa
import pyarrow.compute as pc
Expand Down Expand Up @@ -106,7 +108,7 @@ def len(self) -> Series:
... [1, 2, 3],
... [3],
... ],
... dtype=pd.ArrowDtype(pa.list_(pa.int64())),
... dtype=pd.ListDtype(pa.int64()),
... )
>>> s.list.len()
0 3
Expand Down Expand Up @@ -189,7 +191,7 @@ def __getitem__(self, key: int | slice) -> Series:
sliced = pc.list_slice(self._pa_array, start, stop, step)
return Series(
sliced,
dtype=ArrowDtype(sliced.type),
dtype=ListDtype(sliced.type.value_type),
index=self._data.index,
name=self._data.name,
)
Expand Down
13 changes: 11 additions & 2 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
"""
if isinstance(value, pa.Scalar):
pa_scalar = value
elif isna(value):
elif not is_list_like(value) and isna(value):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comment explaining why this is necessary? E.g., off the top of my head I don't know if pa.ListScalar subclasses pa.Scalar.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this particular case the isna(value) check fails when value is a list, since isna then returns an element-wise result rather than a single boolean.

The "not is_list_like" check was a quick way to prevent this branch from throwing an exception, but I'm open to better ways of expressing that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As to your question, pa.ListScalar does inherit from pa.Scalar (all Scalars in pyarrow do), but that is not the type hitting this branch, since it is already caught by the preceding branch.

pa_scalar = pa.scalar(None, type=pa_type)
else:
# Workaround https://github.com/apache/arrow/issues/37291
Expand Down Expand Up @@ -1350,7 +1350,16 @@ def take(
# TODO(ARROW-9433): Treat negative indices as NULL
indices_array = pa.array(indices_array, mask=fill_mask)
result = self._pa_array.take(indices_array)
if isna(fill_value):
if is_list_like(fill_value):
# TODO: this should be hit by ListArray. Ideally we do:
# pc.replace_with_mask(result, fill_mask, pa.scalar(fill_value))
# but pyarrow does not yet implement that for list types
new_values = [
fill_value if should_fill else x.as_py()
for x, should_fill in zip(result, fill_mask)
]
return type(self)(new_values)
elif isna(fill_value):
return type(self)(result)
# TODO: ArrowNotImplementedError: Function fill_null has no
# kernel matching input types (array[string], scalar[string])
Expand Down
Loading
Loading