pyarrow + categorical

topper-123 · topper-123 · commit a1e338a770f8 · 2023-04-24T22:45:00.000+01:00
diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py
@@ -119,14 +119,13 @@ def _minmax(
             # min/max with empty array raise in numpy, pandas returns NA
             return libmissing.NA
         else:
-            return func(values)
+            return func(values, axis=axis)
     else:
         subset = values[~mask]
-        if subset.size:
-            return func(subset)
-        else:
+        if not subset.size:
             # min/max with empty array raise in numpy, pandas returns NA
             return libmissing.NA
+        return func(values, where=~mask, axis=axis, initial=subset[0])
 
 
 def min(
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1376,8 +1376,8 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
 
     def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs):
         """Takes the result of ``_reduce`` and wraps it an a ndarray/extensionArray."""
-        result = self._reduce(name, skipna=skipna, **kwargs)
-        result = pa.array([result])
+        result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)
+        result = pa.array([result.as_py()], type=result.type)
         return type(self)(result)
 
     def __setitem__(self, key, value) -> None:
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2082,6 +2082,10 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
     # ------------------------------------------------------------------
     # Reductions
 
+    def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs):
+        result = self._reduce(name, skipna=skipna, **kwargs)
+        return type(self)([result], dtype=self.dtype)
+
     def min(self, *, skipna: bool = True, **kwargs):
         """
         The minimum value of the object.
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -57,6 +57,7 @@
     notna,
 )
 
+import pandas as pd
 from pandas.core import (
     algorithms as algos,
     arraylike,
@@ -1078,29 +1079,48 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
 
         # median, skew, kurt, sem
         op = getattr(nanops, f"nan{name}")
-        result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
-
+        axis = kwargs.pop("axis", None)
+        result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs)
         if np.isnan(result):
-            return libmissing.NA
+            result = libmissing.NA
 
-        return result
+        return self._wrap_reduction_result(
+            name, result, skipna=skipna, axis=axis, **kwargs
+        )
 
     def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs):
-        res = self.reshape(-1, 1)._reduce(name=name, skipna=skipna, **kwargs)
+        res = self.reshape(-1, 1)._reduce(name=name, skipna=skipna, axis=0, **kwargs)
         return res
 
     def _wrap_reduction_result(self, name: str, result, skipna, **kwargs):
+        axis = kwargs["axis"]
         if isinstance(result, np.ndarray):
-            axis = kwargs["axis"]
             if skipna:
                 # we only retain mask for all-NA rows/columns
                 mask = self._mask.all(axis=axis)
             else:
                 mask = self._mask.any(axis=axis)
 
             return self._maybe_mask_result(result, mask)
+        elif result is libmissing.NA and self.ndim == 2:
+            # a scalar NA from a 2D reduction is expanded along the kept axis
+            return self._wrap_na_result(name=name, axis=axis)
         return result
 
+    def _wrap_na_result(self, *, name, axis):
+        """Build an all-masked placeholder result sized to the non-reduced axis."""
+        mask = np.ones(self.shape[1] if axis == 0 else self.shape[0], dtype=bool)
+
+        if name in ["mean", "median", "var", "std", "skew", "kurt", "sem"]:
+            np_dtype = "float64"
+        elif name in ["min", "max"]:
+            np_dtype = self.dtype.type
+        else:
+            np_dtype = {"i": "int64", "u": "uint64", "f": "float64"}[self.dtype.kind]
+        # placeholder data must be as long as the mask; every entry is masked anyway
+        value = np.ones(mask.shape, dtype=np_dtype)
+        return self._maybe_mask_result(value, mask=mask)
+
     def _wrap_min_count_reduction_result(
         self, name: str, result, skipna, min_count, **kwargs
     ):
@@ -1193,21 +1213,27 @@ def std(
 
     def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
         nv.validate_min((), kwargs)
-        return masked_reductions.min(
+        result = masked_reductions.min(
             self._data,
             self._mask,
             skipna=skipna,
             axis=axis,
         )
+        return self._wrap_reduction_result(
+            "min", result, skipna=skipna, axis=axis, **kwargs
+        )
 
     def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
         nv.validate_max((), kwargs)
-        return masked_reductions.max(
+        result = masked_reductions.max(
             self._data,
             self._mask,
             skipna=skipna,
             axis=axis,
         )
+        return self._wrap_reduction_result(
+            "max", result, skipna=skipna, axis=axis, **kwargs
+        )
 
     def any(self, *, skipna: bool = True, **kwargs):
         """
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -11177,7 +11177,7 @@ def idxmin(
         # indices will always be np.ndarray since axis is not None and
         # values is a 2d array for DataFrame
         # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
-        assert isinstance(indices, np.ndarray)  # for mypy
+        assert isinstance(indices, (np.ndarray, ExtensionArray))  # for mypy
 
         index = data._get_axis(axis)
         result = [index[i] if i >= 0 else np.nan for i in indices]
@@ -11202,7 +11202,7 @@ def idxmax(
         # indices will always be np.ndarray since axis is not None and
         # values is a 2d array for DataFrame
         # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
-        assert isinstance(indices, np.ndarray)  # for mypy
+        assert isinstance(indices, (np.ndarray, ExtensionArray))  # for mypy
 
         index = data._get_axis(axis)
         result = [index[i] if i >= 0 else np.nan for i in indices]
diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py
@@ -56,6 +56,18 @@ def test_min_max_ordered(self, index_or_series_or_array):
         assert np.minimum.reduce(obj) == "d"
         assert np.maximum.reduce(obj) == "a"
 
+    def test_min_max_reduce_with_wrap(self):
+        # GH52788
+        cat = Categorical(["a", "b", "c", "d"], ordered=True)
+
+        result_max = cat._reduce_with_wrap("max", kwargs={})
+        expected_max = Categorical(["d"], dtype=cat.dtype)
+        tm.assert_categorical_equal(result_max, expected_max)
+
+        result_min = cat._reduce_with_wrap("min", kwargs={})
+        expected_min = Categorical(["a"], dtype=cat.dtype)
+        tm.assert_categorical_equal(result_min, expected_min)
+
     @pytest.mark.parametrize(
         "categories,expected",
         [
diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py
@@ -42,4 +42,3 @@ def test_dataframe_reductions(op):
     df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
     result = getattr(df, op)()
     assert isinstance(result["a"], np.int64)
-
diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py
@@ -4,6 +4,7 @@
 
 import pandas as pd
 import pandas._testing as tm
+from pandas.api.types import is_numeric_dtype
 from pandas.tests.extension.base.base import BaseExtensionTests
 
 
@@ -66,6 +67,15 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna):
             warnings.simplefilter("ignore", RuntimeWarning)
             self.check_reduce(s, op_name, skipna)
 
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_with_wrap(self, data, all_numeric_reductions, skipna):
+        op_name = all_numeric_reductions
+        s = pd.Series(data)
+        if not is_numeric_dtype(s):
+            pytest.skip("not numeric dtype")
+
+        self.check_reduce_with_wrap(s, op_name, skipna)
+
 
 class BaseBooleanReduceTests(BaseReduceTests):
     @pytest.mark.parametrize("skipna", [True, False])
diff --git a/pandas/tests/extension/masked_shared.py b/pandas/tests/extension/masked_shared.py
@@ -64,6 +64,27 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
                 expected = pd.NA
         tm.assert_almost_equal(result, expected)
 
+    def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool):
+        if op_name in ["count", "kurt", "sem"]:
+            pytest.skip(f"{op_name} not an array method")
+
+        arr = ser.array
+
+        if op_name in ["mean", "median", "var", "std", "skew"]:
+            cmp_dtype = "Float64"
+        elif op_name in ["max", "min"]:
+            cmp_dtype = arr.dtype
+        else:
+            cmp_dtype = {"i": "Int64", "u": "UInt64", "f": "Float64"}[arr.dtype.kind]
+
+        result = arr._reduce_with_wrap(op_name, skipna=skipna, kwargs={})
+        if not skipna and ser.isna().any():
+            expected = pd.array([pd.NA], dtype=cmp_dtype)
+        else:
+            exp_value = getattr(ser.dropna().astype(cmp_dtype), op_name)()
+            expected = pd.array([exp_value], dtype=cmp_dtype)
+        tm.assert_extension_array_equal(result, expected)
+
 
 class Accumulation(base.BaseAccumulateTests):
     @pytest.mark.parametrize("skipna", [True, False])
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
@@ -505,6 +505,38 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request):
             request.node.add_marker(xfail_mark)
         super().test_reduce_series(data, all_numeric_reductions, skipna)
 
+    def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool):
+        if op_name in ["count", "kurt", "sem", "skew"]:
+            pytest.skip(f"{op_name} not an array method")
+
+        arr = ser.array
+        kwargs = {"ddof": 1} if op_name in ["var", "std"] else {}
+
+        if op_name in ["max", "min"]:
+            cmp_dtype = arr.dtype
+        elif arr.dtype.name == "decimal128(7, 3)[pyarrow]":
+            if op_name not in ["median", "var", "std"]:
+                cmp_dtype = arr.dtype
+            else:
+                cmp_dtype = "float64[pyarrow]"
+        elif op_name in ["median", "var", "std", "mean", "skew"]:
+            cmp_dtype = "float64[pyarrow]"
+        else:
+            cmp_dtype = {
+                "i": "int64[pyarrow]",
+                "u": "uint64[pyarrow]",
+                "f": "float64[pyarrow]",
+            }[arr.dtype.kind]
+        result = arr._reduce_with_wrap(op_name, skipna=skipna, kwargs=kwargs)
+
+        if not skipna and ser.isna().any():
+            expected = pd.array([pd.NA], dtype=cmp_dtype)
+        else:
+            exp_value = getattr(ser.dropna().astype(cmp_dtype), op_name)(**kwargs)
+            expected = pd.array([exp_value], dtype=cmp_dtype)
+
+        tm.assert_extension_array_equal(result, expected)
+
     @pytest.mark.parametrize("typ", ["int64", "uint64", "float64"])
     def test_median_not_approximate(self, typ):
         # GH 52679
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
@@ -1778,7 +1778,9 @@ def test_minmax_extensionarray(method, numeric_only):
     df = DataFrame({"Int64": ser})
     result = getattr(df, method)(numeric_only=numeric_only)
     expected = Series(
-        [getattr(int64_info, method)], index=Index(["Int64"], dtype="object")
+        [getattr(int64_info, method)],
+        dtype="Int64",
+        index=Index(["Int64"], dtype="object"),
     )
     tm.assert_series_equal(result, expected)