Skip to content

Commit a1e338a

Browse files
committed
pyarrow + categorical
1 parent 475ccd4 commit a1e338a

File tree

11 files changed

+123
-18
lines changed

11 files changed

+123
-18
lines changed

pandas/core/array_algos/masked_reductions.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -119,14 +119,13 @@ def _minmax(
119119
# min/max with empty array raise in numpy, pandas returns NA
120120
return libmissing.NA
121121
else:
122-
return func(values)
122+
return func(values, axis=axis)
123123
else:
124124
subset = values[~mask]
125-
if subset.size:
126-
return func(subset)
127-
else:
125+
if not subset.size:
128126
# min/max with empty array raise in numpy, pandas returns NA
129127
return libmissing.NA
128+
return func(values, where=~mask, axis=axis, initial=subset[0])
130129

131130

132131
def min(

pandas/core/arrays/arrow/array.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1376,8 +1376,8 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
13761376

13771377
def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs):
13781378
"""Takes the result of ``_reduce`` and wraps it in an ndarray/ExtensionArray."""
1379-
result = self._reduce(name, skipna=skipna, **kwargs)
1380-
result = pa.array([result])
1379+
result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)
1380+
result = pa.array([result.as_py()], type=result.type)
13811381
return type(self)(result)
13821382

13831383
def __setitem__(self, key, value) -> None:

pandas/core/arrays/categorical.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2082,6 +2082,10 @@ def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
20822082
# ------------------------------------------------------------------
20832083
# Reductions
20842084

2085+
def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs):
    """
    Run the reduction ``name`` and wrap its result in a length-1 array.

    Parameters
    ----------
    name : str
        Reduction name, forwarded to ``self._reduce``.
    skipna : bool, default True
        Forwarded to ``self._reduce``.
    kwargs : dict
        Extra keyword arguments, unpacked into the ``self._reduce`` call.

    Returns
    -------
    An instance of ``type(self)`` built from the single reduced value,
    using ``self.dtype``.
    """
    scalar = self._reduce(name, skipna=skipna, **kwargs)
    wrapper_cls = type(self)
    return wrapper_cls([scalar], dtype=self.dtype)
2088+
20852089
def min(self, *, skipna: bool = True, **kwargs):
20862090
"""
20872091
The minimum value of the object.

pandas/core/arrays/masked.py

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
notna,
5858
)
5959

60+
import pandas as pd
6061
from pandas.core import (
6162
algorithms as algos,
6263
arraylike,
@@ -1078,29 +1079,48 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
10781079

10791080
# median, skew, kurt, sem
10801081
op = getattr(nanops, f"nan{name}")
1081-
result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
1082-
1082+
axis = kwargs.pop("axis", None)
1083+
result = op(data, axis=axis, skipna=skipna, mask=mask, **kwargs)
10831084
if np.isnan(result):
1084-
return libmissing.NA
1085+
result = libmissing.NA
10851086

1086-
return result
1087+
return self._wrap_reduction_result(
1088+
name, result, skipna=skipna, axis=axis, **kwargs
1089+
)
10871090

10881091
def _reduce_with_wrap(self, name: str, *, skipna: bool = True, kwargs):
1089-
res = self.reshape(-1, 1)._reduce(name=name, skipna=skipna, **kwargs)
1092+
res = self.reshape(-1, 1)._reduce(name=name, skipna=skipna, axis=0, **kwargs)
10901093
return res
10911094

10921095
def _wrap_reduction_result(self, name: str, result, skipna, **kwargs):
1096+
axis = kwargs["axis"]
10931097
if isinstance(result, np.ndarray):
1094-
axis = kwargs["axis"]
10951098
if skipna:
10961099
# we only retain mask for all-NA rows/columns
10971100
mask = self._mask.all(axis=axis)
10981101
else:
10991102
mask = self._mask.any(axis=axis)
11001103

11011104
return self._maybe_mask_result(result, mask)
1105+
elif result is pd.NA and self.ndim == 2:
1106+
result = self._wrap_na_result(name=name, axis=axis)
1107+
return result
11021108
return result
11031109

1110+
def _wrap_na_result(self, *, name, axis):
    """
    Build the fully-masked array returned when a reduction yields only NA.

    Parameters
    ----------
    name : str
        Name of the reduction; it selects the numpy dtype of the
        placeholder data.
    axis : int
        Axis that was reduced. ``0`` sizes the result by ``self.shape[1]``,
        any other value by ``self.shape[0]``.

    Returns
    -------
    Whatever ``self._maybe_mask_result`` returns for placeholder values
    paired with an all-True mask.

    Raises
    ------
    KeyError
        If the reduction is not float-valued or min/max and
        ``self.dtype.kind`` is not one of ``"b"``, ``"i"``, ``"u"``, ``"f"``.
    """
    mask_size = self.shape[1] if axis == 0 else self.shape[0]
    mask = np.ones(mask_size, dtype=bool)

    # kurt and sem are float-valued like the other statistics listed here,
    # so they must not fall through to the integer branch below.
    if name in ("mean", "median", "var", "std", "skew", "kurt", "sem"):
        np_dtype = "float64"
    elif name in ("min", "max"):
        # min/max keep the array's own scalar type.
        np_dtype = self.dtype.type
    else:
        # Remaining reductions widen to 64 bits; boolean kinds are treated
        # as integers instead of raising KeyError.
        np_dtype = {
            "b": "int64",
            "i": "int64",
            "u": "uint64",
            "f": "float64",
        }[self.dtype.kind]

    # The data is only a placeholder (every entry is masked), but it must
    # have as many entries as the mask.
    value = np.ones(mask_size, dtype=np_dtype)
    return self._maybe_mask_result(value, mask=mask)
1123+
11041124
def _wrap_min_count_reduction_result(
11051125
self, name: str, result, skipna, min_count, **kwargs
11061126
):
@@ -1193,21 +1213,27 @@ def std(
11931213

11941214
def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
11951215
nv.validate_min((), kwargs)
1196-
return masked_reductions.min(
1216+
result = masked_reductions.min(
11971217
self._data,
11981218
self._mask,
11991219
skipna=skipna,
12001220
axis=axis,
12011221
)
1222+
return self._wrap_reduction_result(
1223+
"min", result, skipna=skipna, axis=axis, **kwargs
1224+
)
12021225

12031226
def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
12041227
nv.validate_max((), kwargs)
1205-
return masked_reductions.max(
1228+
result = masked_reductions.max(
12061229
self._data,
12071230
self._mask,
12081231
skipna=skipna,
12091232
axis=axis,
12101233
)
1234+
return self._wrap_reduction_result(
1235+
"max", result, skipna=skipna, axis=axis, **kwargs
1236+
)
12111237

12121238
def any(self, *, skipna: bool = True, **kwargs):
12131239
"""

pandas/core/frame.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11177,7 +11177,7 @@ def idxmin(
1117711177
# indices will always be np.ndarray since axis is not None and
1117811178
# values is a 2d array for DataFrame
1117911179
# error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
11180-
assert isinstance(indices, np.ndarray) # for mypy
11180+
assert isinstance(indices, (np.ndarray, ExtensionArray)) # for mypy
1118111181

1118211182
index = data._get_axis(axis)
1118311183
result = [index[i] if i >= 0 else np.nan for i in indices]
@@ -11202,7 +11202,7 @@ def idxmax(
1120211202
# indices will always be np.ndarray since axis is not None and
1120311203
# values is a 2d array for DataFrame
1120411204
# error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
11205-
assert isinstance(indices, np.ndarray) # for mypy
11205+
assert isinstance(indices, (np.ndarray, ExtensionArray)) # for mypy
1120611206

1120711207
index = data._get_axis(axis)
1120811208
result = [index[i] if i >= 0 else np.nan for i in indices]

pandas/tests/arrays/categorical/test_analytics.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,18 @@ def test_min_max_ordered(self, index_or_series_or_array):
5656
assert np.minimum.reduce(obj) == "d"
5757
assert np.maximum.reduce(obj) == "a"
5858

59+
def test_min_max_reduce_with_wrap(self):
    """Check that wrapped min/max return a length-1 Categorical of the same dtype."""
    # GH52788
    cat = Categorical(["a", "b", "c", "d"], ordered=True)

    # max is checked before min, each against a one-element Categorical
    # carrying the source dtype.
    for op_name, label in (("max", "d"), ("min", "a")):
        wrapped = cat._reduce_with_wrap(op_name, kwargs={})
        expected = Categorical([label], dtype=cat.dtype)
        tm.assert_categorical_equal(wrapped, expected)
70+
5971
@pytest.mark.parametrize(
6072
"categories,expected",
6173
[

pandas/tests/arrays/integer/test_reduction.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,3 @@ def test_dataframe_reductions(op):
4242
df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
4343
result = getattr(df, op)()
4444
assert isinstance(result["a"], np.int64)
45-

pandas/tests/extension/base/reduce.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import pandas as pd
66
import pandas._testing as tm
7+
from pandas.api.types import is_numeric_dtype
78
from pandas.tests.extension.base.base import BaseExtensionTests
89

910

@@ -66,6 +67,15 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna):
6667
warnings.simplefilter("ignore", RuntimeWarning)
6768
self.check_reduce(s, op_name, skipna)
6869

70+
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_with_wrap(self, data, all_numeric_reductions, skipna):
    """Run ``check_reduce_with_wrap`` for each numeric reduction on numeric data."""
    ser = pd.Series(data)
    if is_numeric_dtype(ser):
        self.check_reduce_with_wrap(ser, all_numeric_reductions, skipna)
    else:
        # Non-numeric data is skipped rather than checked.
        pytest.skip("not numeric dtype")
78+
6979

7080
class BaseBooleanReduceTests(BaseReduceTests):
7181
@pytest.mark.parametrize("skipna", [True, False])

pandas/tests/extension/masked_shared.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,27 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
6464
expected = pd.NA
6565
tm.assert_almost_equal(result, expected)
6666

67+
def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool):
    """
    Compare ``_reduce_with_wrap`` on ``ser.array`` with a length-1 expected array.

    Parameters
    ----------
    ser : pd.Series
        Series whose ``.array`` is reduced.
    op_name : str
        Reduction name; ``count``, ``kurt`` and ``sem`` are skipped.
    skipna : bool
        Forwarded to ``_reduce_with_wrap``.
    """
    if op_name in ["count", "kurt", "sem"]:
        pytest.skip(f"{op_name} not an array method")

    arr = ser.array

    # Expected dtype: min/max keep the array dtype, float-valued statistics
    # use Float64, everything else maps the dtype kind to a 64-bit dtype.
    if op_name in ["max", "min"]:
        cmp_dtype = arr.dtype
    elif op_name in ["mean", "median", "var", "std", "skew"]:
        cmp_dtype = "Float64"
    else:
        cmp_dtype = {"i": "Int64", "u": "UInt64", "f": "Float64"}[arr.dtype.kind]

    result = arr._reduce_with_wrap(op_name, skipna=skipna, kwargs={})

    if not skipna and ser.isna().any():
        # An unskipped missing value makes the expected result NA.
        exp_value = pd.NA
    else:
        reference = ser.dropna().astype(cmp_dtype)
        exp_value = getattr(reference, op_name)()
    expected = pd.array([exp_value], dtype=cmp_dtype)
    tm.assert_extension_array_equal(result, expected)
87+
6788

6889
class Accumulation(base.BaseAccumulateTests):
6990
@pytest.mark.parametrize("skipna", [True, False])

pandas/tests/extension/test_arrow.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,38 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request):
505505
request.node.add_marker(xfail_mark)
506506
super().test_reduce_series(data, all_numeric_reductions, skipna)
507507

508+
def check_reduce_with_wrap(self, ser: pd.Series, op_name: str, skipna: bool):
    """
    Compare ``_reduce_with_wrap`` on ``ser.array`` with a length-1 expected array.

    Parameters
    ----------
    ser : pd.Series
        Series whose ``.array`` is reduced.
    op_name : str
        Reduction name; ``count``, ``kurt``, ``sem`` and ``skew`` are skipped.
    skipna : bool
        Forwarded to ``_reduce_with_wrap``.
    """
    if op_name in ["count", "kurt", "sem", "skew"]:
        pytest.skip(f"{op_name} not an array method")

    arr = ser.array

    # var/std are invoked with an explicit ddof on both the array and the
    # reference Series.
    if op_name in ["var", "std"]:
        kwargs = {"ddof": 1}
    else:
        kwargs = {}

    if op_name in ["max", "min"]:
        cmp_dtype = arr.dtype
    elif arr.dtype.name == "decimal128(7, 3)[pyarrow]":
        # This decimal dtype is compared as float64 for median/var/std and
        # as itself for the other reductions.
        if op_name in ["median", "var", "std"]:
            cmp_dtype = "float64[pyarrow]"
        else:
            cmp_dtype = arr.dtype
    elif op_name in ["median", "var", "std", "mean", "skew"]:
        cmp_dtype = "float64[pyarrow]"
    else:
        kind_to_dtype = {
            "i": "int64[pyarrow]",
            "u": "uint64[pyarrow]",
            "f": "float64[pyarrow]",
        }
        cmp_dtype = kind_to_dtype[arr.dtype.kind]

    result = arr._reduce_with_wrap(op_name, skipna=skipna, kwargs=kwargs)

    if not skipna and ser.isna().any():
        # An unskipped missing value makes the expected result NA.
        exp_value = pd.NA
    else:
        reference = ser.dropna().astype(cmp_dtype)
        exp_value = getattr(reference, op_name)(**kwargs)
    expected = pd.array([exp_value], dtype=cmp_dtype)

    tm.assert_extension_array_equal(result, expected)
539+
508540
@pytest.mark.parametrize("typ", ["int64", "uint64", "float64"])
509541
def test_median_not_approximate(self, typ):
510542
# GH 52679

pandas/tests/frame/test_reductions.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1778,7 +1778,9 @@ def test_minmax_extensionarray(method, numeric_only):
17781778
df = DataFrame({"Int64": ser})
17791779
result = getattr(df, method)(numeric_only=numeric_only)
17801780
expected = Series(
1781-
[getattr(int64_info, method)], index=Index(["Int64"], dtype="object")
1781+
[getattr(int64_info, method)],
1782+
dtype="Int64",
1783+
index=Index(["Int64"], dtype="object"),
17821784
)
17831785
tm.assert_series_equal(result, expected)
17841786

0 commit comments

Comments
 (0)