pandas-dev · rhshadrach · Dec 31, 2024 · Dec 31, 2024 · Dec 31, 2024 · Dec 31, 2024
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -1317,6 +1317,22 @@ def nullable_string_dtype(request):
     return request.param
 
 
+@pytest.fixture(
+    params=[
+        pytest.param("str[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
+    ]
+)
+def pyarrow_string_dtype(request):
+    """
+    Parametrized fixture for string dtypes backed by Pyarrow.
+
+    * 'str[pyarrow]'
+    * 'string[pyarrow]'
+    """
+    return request.param
+
+
 @pytest.fixture(
     params=[
         "python",

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -41,6 +41,7 @@
     is_list_like,
     is_numeric_dtype,
     is_scalar,
+    is_string_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
@@ -1619,6 +1620,9 @@ def _accumulate(
         ------
         NotImplementedError : subclass does not define accumulations
         """
+        if is_string_dtype(self):
+            return self._str_accumulate(name=name, skipna=skipna, **kwargs)
+
         pyarrow_name = {
             "cummax": "cumulative_max",
             "cummin": "cumulative_min",
@@ -1654,6 +1658,64 @@ def _accumulate(
 
         return type(self)(result)
 
+    def _str_accumulate(
+        self, name: str, *, skipna: bool = True, **kwargs
+    ) -> ArrowExtensionArray | ExtensionArray:
+        """
+        Accumulate implementation for strings, see `_accumulate` docstring for details.
+
+        pyarrow.compute does not implement these methods for strings.
+        """
+        if name == "cumprod":
+            msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
+            raise TypeError(msg)
+
+        # We may need to strip out leading / trailing NA values
+        head: pa.array | None = None
+        tail: pa.array | None = None
+        pa_array = self._pa_array
+        np_func = {
+            "cumsum": np.cumsum,
+            "cummin": np.minimum.accumulate,
+            "cummax": np.maximum.accumulate,
+        }[name]
+
+        if self._hasna:
+            if skipna:
+                if name == "cumsum":
+                    pa_array = pc.fill_null(pa_array, "")
+                else:
+                    # After the first non-NA value we can retain the running min/max
+                    # by forward filling.
+                    pa_array = pc.fill_null_forward(pa_array)
+                    # But any leading NA values should result in "".
+                    nulls = pc.is_null(pa_array)
+                    idx = pc.index(nulls, False).as_py()
+                    if idx == -1:
+                        idx = len(pa_array)
+                    if idx > 0:
+                        head = pa.array([""] * idx, type=pa_array.type)
+                        pa_array = pa_array[idx:].combine_chunks()
+            else:
+                # When not skipping NA values, the result should be null from
+                # the first NA value onward.
+                nulls = pc.is_null(pa_array)
+                idx = pc.index(nulls, True).as_py()
+                tail = pa.nulls(len(pa_array) - idx, type=pa_array.type)
+                pa_array = pa_array[:idx].combine_chunks()
+
+        # error: Cannot call function of unknown type
+        pa_result = pa.array(np_func(pa_array), type=pa_array.type)  # type: ignore[operator]
+
+        assert head is None or tail is None
+        if head is not None:
+            pa_result = pa.concat_arrays([head, pa_result])
+        elif tail is not None:
+            pa_result = pa.concat_arrays([pa_result, tail])
+
+        result = type(self)(pa_result)
+        return result
+
     def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar:
         """
         Return a pyarrow scalar result of performing the reduction operation.

diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
@@ -4,7 +4,10 @@
 import numpy as np
 import pytest
 
-from pandas.compat import WASM
+from pandas.compat import (
+    HAS_PYARROW,
+    WASM,
+)
 
 from pandas.core.dtypes.common import is_number
 
@@ -163,10 +166,10 @@ def test_agg_cython_table_transform_series(request, series, func, expected):
     # GH21224
     # test transforming functions in
     # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
-    if series.dtype == "string" and func == "cumsum":
+    if series.dtype == "string" and func == "cumsum" and not HAS_PYARROW:
         request.applymarker(
             pytest.mark.xfail(
-                raises=(TypeError, NotImplementedError),
+                raises=NotImplementedError,
                 reason="TODO(infer_string) cumsum not yet implemented for string",
             )
         )

diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py
@@ -18,8 +18,9 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
     def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool):
         try:
             alt = ser.astype("float64")
-        except TypeError:
-            # e.g. Period can't be cast to float64
+        except (TypeError, ValueError):
+            # e.g. Period can't be cast to float64 (TypeError)
+            #      String can't be cast to float64 (ValueError)
             alt = ser.astype(object)
 
         result = getattr(ser, op_name)(skipna=skipna)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
@@ -393,13 +393,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
         # attribute "pyarrow_dtype"
         pa_type = ser.dtype.pyarrow_dtype  # type: ignore[union-attr]
 
-        if (
-            pa.types.is_string(pa_type)
-            or pa.types.is_binary(pa_type)
-            or pa.types.is_decimal(pa_type)
-        ):
+        if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type):
             if op_name in ["cumsum", "cumprod", "cummax", "cummin"]:
                 return False
+        elif pa.types.is_string(pa_type):
+            if op_name == "cumprod":
+                return False
         elif pa.types.is_boolean(pa_type):
             if op_name in ["cumprod", "cummax", "cummin"]:
                 return False
@@ -414,6 +413,12 @@ def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
     def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request):
         pa_type = data.dtype.pyarrow_dtype
         op_name = all_numeric_accumulations
+
+        if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]:
+            # https://github.com/pandas-dev/pandas/pull/60633
+            # Doesn't fit test structure, tested in series/test_cumulative.py instead.
+            return
+
         ser = pd.Series(data)
 
         if not self._supports_accumulation(ser, op_name):

diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -24,6 +24,8 @@
 
 from pandas.compat import HAS_PYARROW
 
+from pandas.core.dtypes.base import StorageExtensionDtype
+
 import pandas as pd
 import pandas._testing as tm
 from pandas.api.types import is_string_dtype
@@ -192,6 +194,14 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
             and op_name in ("any", "all")
         )
 
+    def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
+        assert isinstance(ser.dtype, StorageExtensionDtype)
+        return ser.dtype.storage == "pyarrow" and op_name in [
+            "cummin",
+            "cummax",
+            "cumsum",
+        ]
+
     def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         dtype = cast(StringDtype, tm.get_dtype(obj))
         if op_name in ["__add__", "__radd__"]:

diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py
@@ -6,6 +6,8 @@
 tests.frame.test_cumulative
 """
 
+import re
+
 import numpy as np
 import pytest
 
@@ -227,3 +229,55 @@ def test_cumprod_timedelta(self):
         ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)])
         with pytest.raises(TypeError, match="cumprod not supported for Timedelta"):
             ser.cumprod()
+
+    @pytest.mark.parametrize(
+        "data, op, skipna, expected_data",
+        [
+            ([], "cumsum", True, []),
+            ([], "cumsum", False, []),
+            (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]),
+            (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]),
+            (["x", pd.NA, "y"], "cumsum", True, ["x", "x", "xy"]),
+            (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]),
+            ([pd.NA, "x", "y"], "cumsum", True, ["", "x", "xy"]),
+            ([pd.NA, "x", "y"], "cumsum", False, [pd.NA, pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], "cumsum", True, ["", "", ""]),
+            ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, pd.NA, pd.NA]),
+            ([], "cummin", True, []),
+            ([], "cummin", False, []),
+            (["y", "z", "x"], "cummin", True, ["y", "y", "x"]),
+            (["y", "z", "x"], "cummin", False, ["y", "y", "x"]),
+            (["y", pd.NA, "x"], "cummin", True, ["y", "y", "x"]),
+            (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]),
+            ([pd.NA, "y", "x"], "cummin", True, ["", "y", "x"]),
+            ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], "cummin", True, ["", "", ""]),
+            ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]),
+            ([], "cummax", True, []),
+            ([], "cummax", False, []),
+            (["x", "z", "y"], "cummax", True, ["x", "z", "z"]),
+            (["x", "z", "y"], "cummax", False, ["x", "z", "z"]),
+            (["x", pd.NA, "y"], "cummax", True, ["x", "x", "y"]),
+            (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]),
+            ([pd.NA, "x", "y"], "cummax", True, ["", "x", "y"]),
+            ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], "cummax", True, ["", "", ""]),
+            ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]),
+        ],
+    )
+    def test_cum_methods_pyarrow_strings(
+        self, pyarrow_string_dtype, data, op, skipna, expected_data
+    ):
+        # https://github.com/pandas-dev/pandas/pull/60633
+        ser = pd.Series(data, dtype=pyarrow_string_dtype)
+        method = getattr(ser, op)
+        expected = pd.Series(expected_data, dtype=pyarrow_string_dtype)
+        result = method(skipna=skipna)
+        tm.assert_series_equal(result, expected)
+
+    def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna):
+        # https://github.com/pandas-dev/pandas/pull/60633
+        ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype)
+        msg = re.escape(f"operation 'cumprod' not supported for dtype '{ser.dtype}'")
+        with pytest.raises(TypeError, match=msg):
+            ser.cumprod(skipna=skipna)