pandas-dev · WillAyd · Dec 30, 2024 · Dec 31, 2024 · Dec 31, 2024 · Dec 31, 2024
diff --git a/pandas/__init__.py b/pandas/__init__.py
@@ -61,6 +61,7 @@
     PeriodDtype,
     IntervalDtype,
     DatetimeTZDtype,
+    ListDtype,
     StringDtype,
     BooleanDtype,
     # missing
@@ -261,6 +262,7 @@
     "Interval",
     "IntervalDtype",
     "IntervalIndex",
+    "ListDtype",
     "MultiIndex",
     "NaT",
     "NamedAgg",

@@ -834,6 +834,8 @@ cpdef ndarray[object] ensure_string_array(
             if isinstance(val, bytes):
                 # GH#49658 discussion of desired behavior here
                 result[i] = val.decode()
+            elif util.is_array(val):
+                result[i] = str(val.tolist())
             elif not util.is_float_object(val):
                 # f"{val}" is faster than str(val)
                 result[i] = f"{val}"

diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
@@ -54,6 +54,7 @@
     TimedeltaArray,
 )
 from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
+from pandas.core.arrays.list_ import ListDtype
 from pandas.core.arrays.string_ import StringDtype
 from pandas.core.indexes.api import safe_sort_index
 
@@ -824,6 +825,11 @@ def assert_extension_array_equal(
             [np.isnan(val) for val in right._ndarray[right_na]]  # type: ignore[attr-defined]
         ), "wrong missing value sentinels"
 
+    # TODO: some array types may not convert to NumPy; handle them before to_numpy
+    if isinstance(left.dtype, ListDtype) and isinstance(right.dtype, ListDtype):
+        assert left._pa_array == right._pa_array
+        return
+
     left_valid = left[~left_na].to_numpy(dtype=object)
     right_valid = right[~right_na].to_numpy(dtype=object)
     if check_exact:

diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -40,6 +40,7 @@
     UInt32Dtype,
     UInt64Dtype,
 )
+from pandas.core.arrays.list_ import ListDtype
 from pandas.core.arrays.string_ import StringDtype
 from pandas.core.construction import array  # noqa: ICN001
 from pandas.core.flags import Flags
@@ -103,6 +104,7 @@
     "Interval",
     "IntervalDtype",
     "IntervalIndex",
+    "ListDtype",
     "MultiIndex",
     "NaT",
     "NamedAgg",

diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py
@@ -18,6 +18,8 @@
 
 from pandas.core.dtypes.common import is_list_like
 
+from pandas.core.arrays.list_ import ListDtype
+
 if not pa_version_under10p1:
     import pyarrow as pa
     import pyarrow.compute as pc
@@ -106,7 +108,7 @@ def len(self) -> Series:
         ...         [1, 2, 3],
         ...         [3],
         ...     ],
-        ...     dtype=pd.ArrowDtype(pa.list_(pa.int64())),
+        ...     dtype=pd.ListDtype(pa.int64()),
         ... )
         >>> s.list.len()
         0    3
@@ -189,7 +191,7 @@ def __getitem__(self, key: int | slice) -> Series:
             sliced = pc.list_slice(self._pa_array, start, stop, step)
             return Series(
                 sliced,
-                dtype=ArrowDtype(sliced.type),
+                dtype=ListDtype(sliced.type.value_type),
                 index=self._data.index,
                 name=self._data.name,
             )

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -428,7 +428,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
         """
         if isinstance(value, pa.Scalar):
             pa_scalar = value
-        elif isna(value):
+        elif not is_list_like(value) and isna(value):
             pa_scalar = pa.scalar(None, type=pa_type)
         else:
             # Workaround https://github.com/apache/arrow/issues/37291
@@ -1350,7 +1350,16 @@ def take(
                 # TODO(ARROW-9433): Treat negative indices as NULL
                 indices_array = pa.array(indices_array, mask=fill_mask)
                 result = self._pa_array.take(indices_array)
-                if isna(fill_value):
+                if is_list_like(fill_value):
+                    # TODO: this should be hit by ListArray. Ideally we do:
+                    # pc.replace_with_mask(result, fill_mask, pa.scalar(fill_value))
+                    # but pyarrow does not yet implement that for list types
+                    new_values = [
+                        fill_value if should_fill else x.as_py()
+                        for x, should_fill in zip(result, fill_mask)
+                    ]
+                    return type(self)(new_values)
+                elif isna(fill_value):
                     return type(self)(result)
                 # TODO: ArrowNotImplementedError: Function fill_null has no
                 # kernel matching input types (array[string], scalar[string])