
Commit ac0a7f1

merge with master
2 parents: b954874 + 6509028

File tree: 19 files changed (+166, -61 lines)


doc/source/whatsnew/v1.1.0.rst

Lines changed: 3 additions & 0 deletions
@@ -922,6 +922,7 @@ Datetimelike
     resolution which converted to object dtype instead of coercing to ``datetime64[ns]``
     dtype when within the timestamp bounds (:issue:`34843`).
 - The ``freq`` keyword in :class:`Period`, :func:`date_range`, :func:`period_range`, :func:`pd.tseries.frequencies.to_offset` no longer allows tuples, pass as string instead (:issue:`34703`)
+- Bug in :meth:`DataFrame.append` when appending a :class:`Series` containing a scalar tz-aware :class:`Timestamp` to an empty :class:`DataFrame` resulted in an object column instead of datetime64[ns, tz] dtype (:issue:`35038`)
 - ``OutOfBoundsDatetime`` issues an improved error message when timestamp is out of implementation bounds. (:issue:`32967`)

 Timedelta
@@ -953,6 +954,7 @@ Numeric
 - Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`)
 - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
 - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
+- Bug in arithmetic operations between ``DataFrame`` objects with non-overlapping columns with duplicate labels causing an infinite loop (:issue:`35194`)
 - Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`)
 - Bug in :meth:`Index.difference` incorrect results when comparing a :class:`Float64Index` and object :class:`Index` (:issue:`35217`)
 - Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`)
@@ -1118,6 +1120,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`)
 - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`)
 - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`)
+- Bug in :meth:`core.groupby.DataFrameGroupBy.quantile` raises ``TypeError`` for non-numeric types rather than dropping columns (:issue:`27892`)
 - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`)
 - Bug in :meth:`DataFrameGroupBy.first` and :meth:`DataFrameGroupBy.last` that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`)
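
For context (not part of the diff), a minimal sketch of the behavior fixed by GH 35038; the column names here are illustrative:

```python
import pandas as pd

# Appending a Series holding a scalar tz-aware Timestamp to an empty
# DataFrame previously produced an object column; with this fix the
# column should come out as datetime64[ns, tz].
df = pd.DataFrame(columns=["a", "date"])
s = pd.Series({"a": 1.0, "date": pd.Timestamp("2018-10-24 07:30:00", tz="UTC")})

result = df.append(s, ignore_index=True)
print(result.dtypes)  # expect "date" -> datetime64[ns, UTC]
```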

environment.yml

Lines changed: 2 additions & 1 deletion
@@ -3,7 +3,8 @@ channels:
   - conda-forge
 dependencies:
   # required
-  - numpy>=1.15
+  # Pin numpy<1.19 until MPL 3.3.0 is released.
+  - numpy>=1.15,<1.19.0
   - python=3
   - python-dateutil>=2.7.3
   - pytz

pandas/core/arrays/base.py

Lines changed: 1 addition & 1 deletion
@@ -1120,7 +1120,7 @@ def _concat_same_type(
     # of objects
     _can_hold_na = True

-    def _reduce(self, name, skipna=True, **kwargs):
+    def _reduce(self, name: str, skipna: bool = True, **kwargs):
         """
         Return a scalar result of performing the reduction operation.

pandas/core/arrays/categorical.py

Lines changed: 2 additions & 2 deletions
@@ -2076,11 +2076,11 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
         return result

     # reduction ops #
-    def _reduce(self, name, axis=0, **kwargs):
+    def _reduce(self, name: str, skipna: bool = True, **kwargs):
         func = getattr(self, name, None)
         if func is None:
             raise TypeError(f"Categorical cannot perform the operation {name}")
-        return func(**kwargs)
+        return func(skipna=skipna, **kwargs)

     @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
     def min(self, skipna=True, **kwargs):
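
Several files in this commit converge on the same `_reduce(name, skipna, **kwargs)` signature. A short sketch of the user-visible path (my own illustration, not from the patch) through `Categorical._reduce`, which now forwards `skipna` explicitly:

```python
import pandas as pd

cat = pd.Categorical(["a", None, "c"], categories=["a", "b", "c"], ordered=True)
s = pd.Series(cat)

# Series reductions dispatch to Categorical._reduce, which passes
# skipna through to Categorical.min / Categorical.max.
print(s.min())              # "a"  (missing value skipped)
print(s.min(skipna=False))  # NaN  (missing value propagates)
```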

pandas/core/arrays/datetimelike.py

Lines changed: 1 addition & 1 deletion
@@ -1552,7 +1552,7 @@ def __isub__(self, other):
     # --------------------------------------------------------------
     # Reductions

-    def _reduce(self, name, axis=0, skipna=True, **kwargs):
+    def _reduce(self, name: str, skipna: bool = True, **kwargs):
         op = getattr(self, name, None)
         if op:
             return op(skipna=skipna, **kwargs)

pandas/core/arrays/sparse/array.py

Lines changed: 1 addition & 1 deletion
@@ -1164,7 +1164,7 @@ def nonzero(self):
     # Reductions
     # ------------------------------------------------------------------------

-    def _reduce(self, name, skipna=True, **kwargs):
+    def _reduce(self, name: str, skipna: bool = True, **kwargs):
         method = getattr(self, name, None)

         if method is None:

pandas/core/arrays/string_.py

Lines changed: 1 addition & 1 deletion
@@ -291,7 +291,7 @@ def astype(self, dtype, copy=True):

         return super().astype(dtype, copy)

-    def _reduce(self, name, skipna=True, **kwargs):
+    def _reduce(self, name: str, skipna: bool = True, **kwargs):
         if name in ["min", "max"]:
             return getattr(self, name)(skipna=skipna)

pandas/core/dtypes/concat.py

Lines changed: 2 additions & 2 deletions
@@ -152,11 +152,11 @@ def is_nonempty(x) -> bool:
         target_dtype = find_common_type([x.dtype for x in to_concat])
         to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]

-        if isinstance(to_concat[0], ExtensionArray):
+        if isinstance(to_concat[0], ExtensionArray) and axis == 0:
             cls = type(to_concat[0])
             return cls._concat_same_type(to_concat)
         else:
-            return np.concatenate(to_concat)
+            return np.concatenate(to_concat, axis=axis)

     elif _contains_datetime or "timedelta" in typs:
         return concat_datetime(to_concat, axis=axis, typs=typs)
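
A NumPy-only illustration (my own, assuming `to_concat` holds 2-D blocks when `axis=1`) of why the `axis` argument has to be forwarded here:

```python
import numpy as np

left = np.zeros((2, 3))
right = np.ones((2, 2))

# Forwarding axis=1 appends columns: shape (2, 5).
print(np.concatenate([left, right], axis=1).shape)

# Without the keyword, np.concatenate defaults to axis=0, which raises
# here because the column counts differ.
try:
    np.concatenate([left, right])
except ValueError as err:
    print("axis=0 default fails:", err)
```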

pandas/core/groupby/groupby.py

Lines changed: 14 additions & 3 deletions
@@ -2403,7 +2403,7 @@ def _get_cythonized_result(
             signature
         needs_2d : bool, default False
             Whether the values and result of the Cython call signature
-            are at least 2-dimensional.
+            are 2-dimensional.
         min_count : int, default None
             When not None, min_count for the Cython call
         needs_mask : bool, default False
@@ -2419,7 +2419,9 @@ def _get_cythonized_result(
             Function should return a tuple where the first element is the
             values to be passed to Cython and the second element is an optional
             type which the values should be converted to after being returned
-            by the Cython operation. Raises if `needs_values` is False.
+            by the Cython operation. This function is also responsible for
+            raising a TypeError if the values have an invalid type. Raises
+            if `needs_values` is False.
         post_processing : function, default None
             Function to be applied to result of Cython function. Should accept
             an array of values as the first argument and type inferences as its
@@ -2451,6 +2453,7 @@ def _get_cythonized_result(
         output: Dict[base.OutputKey, np.ndarray] = {}
         base_func = getattr(libgroupby, how)

+        error_msg = ""
         for idx, obj in enumerate(self._iterate_slices()):
             name = obj.name
             values = obj._values
@@ -2477,7 +2480,11 @@ def _get_cythonized_result(
             if needs_values:
                 vals = values
                 if pre_processing:
-                    vals, inferences = pre_processing(vals)
+                    try:
+                        vals, inferences = pre_processing(vals)
+                    except TypeError as e:
+                        error_msg = str(e)
+                        continue
                 if needs_2d:
                     vals = vals.reshape((-1, 1))
                 vals = vals.astype(cython_dtype, copy=False)
@@ -2509,6 +2516,10 @@ def _get_cythonized_result(
             key = base.OutputKey(label=name, position=idx)
             output[key] = result

+        # error_msg is "" on a frame/series with no rows or columns
+        if len(output) == 0 and error_msg != "":
+            raise TypeError(error_msg)
+
         if aggregate:
             return self._wrap_aggregated_output(output)
         else:
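
The net effect of the `error_msg` bookkeeping is visible through `GroupBy.quantile`, mirroring the test added further down (GH 27892):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 1], "b": [2.0, 3.0], "c": ["x", "y"]})

# The object column "c" fails pre_processing with a TypeError; instead
# of aborting, the column is skipped and the numeric result is returned.
print(df.groupby("a").quantile(0.5))

# A TypeError is still raised when no column survives, e.g.:
#   df[["a", "c"]].groupby("a").quantile(0.5)
```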

pandas/core/internals/concat.py

Lines changed: 1 addition & 1 deletion
@@ -333,7 +333,7 @@ def _concatenate_join_units(join_units, concat_axis, copy):
         # concatting with at least one EA means we are concatting a single column
         # the non-EA values are 2D arrays with shape (1, n)
         to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
-        concat_values = concat_compat(to_concat, axis=concat_axis)
+        concat_values = concat_compat(to_concat, axis=0)
         if not isinstance(concat_values, ExtensionArray):
             # if the result of concat is not an EA but an ndarray, reshape to
             # 2D to put it in a non-EA Block
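
A small sketch (my own, following the `(1, n)` comment above) of why `axis=0` can be hard-coded here: after taking `t[0, :]`, every non-EA value is 1-D, so axis 0 is the only meaningful concatenation axis:

```python
import numpy as np

block = np.arange(6).reshape(1, 6)  # non-EA block with shape (1, n)
row = block[0, :]                   # shape (n,), like the listcomp above

print(np.concatenate([row, row], axis=0).shape)  # (12,)
```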

pandas/core/ops/__init__.py

Lines changed: 19 additions & 4 deletions
@@ -17,6 +17,7 @@
 from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
 from pandas.core.dtypes.missing import isna

+from pandas.core import algorithms
 from pandas.core.construction import extract_array
 from pandas.core.ops.array_ops import (
     arithmetic_op,
@@ -562,18 +563,32 @@ def _frame_arith_method_with_reindex(
     DataFrame
     """
     # GH#31623, only operate on shared columns
-    cols = left.columns.intersection(right.columns)
+    cols, lcols, rcols = left.columns.join(
+        right.columns, how="inner", level=None, return_indexers=True
+    )

-    new_left = left[cols]
-    new_right = right[cols]
+    new_left = left.iloc[:, lcols]
+    new_right = right.iloc[:, rcols]
     result = op(new_left, new_right)

     # Do the join on the columns instead of using _align_method_FRAME
     # to avoid constructing two potentially large/sparse DataFrames
     join_columns, _, _ = left.columns.join(
         right.columns, how="outer", level=None, return_indexers=True
     )
-    return result.reindex(join_columns, axis=1)
+
+    if result.columns.has_duplicates:
+        # Avoid reindexing with a duplicate axis.
+        # https://github.com/pandas-dev/pandas/issues/35194
+        indexer, _ = result.columns.get_indexer_non_unique(join_columns)
+        indexer = algorithms.unique1d(indexer)
+        result = result._reindex_with_indexers(
+            {1: [join_columns, indexer]}, allow_dups=True
+        )
+    else:
+        result = result.reindex(join_columns, axis=1)
+
+    return result


 def _maybe_align_series_as_frame(frame: "DataFrame", series: "Series", axis: int):
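
Mirroring the regression test added in this commit (GH 35194):

```python
import pandas as pd

df1 = pd.DataFrame([[0]], columns=["second"])
df2 = pd.DataFrame([[0, 0, 0]], columns=["first", "second", "second"])

# Before this change, the outer-join reindex looped forever on the
# duplicated "second" label; now the addition completes.
result = df1 + df2
print(result)  # NaN for "first", 0 for both "second" columns
```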

pandas/tests/extension/arrow/arrays.py

Lines changed: 2 additions & 2 deletions
@@ -162,14 +162,14 @@ def _concat_same_type(cls, to_concat):
     def __invert__(self):
         return type(self).from_scalars(~self._data.to_pandas())

-    def _reduce(self, method, skipna=True, **kwargs):
+    def _reduce(self, name: str, skipna: bool = True, **kwargs):
         if skipna:
             arr = self[~self.isna()]
         else:
             arr = self

         try:
-            op = getattr(arr, method)
+            op = getattr(arr, name)
         except AttributeError as err:
             raise TypeError from err
         return op(**kwargs)

pandas/tests/extension/decimal/array.py

Lines changed: 1 addition & 1 deletion
@@ -174,7 +174,7 @@ def _formatter(self, boxed=False):
     def _concat_same_type(cls, to_concat):
         return cls(np.concatenate([x._data for x in to_concat]))

-    def _reduce(self, name, skipna=True, **kwargs):
+    def _reduce(self, name: str, skipna: bool = True, **kwargs):

         if skipna:
             # If we don't have any NAs, we can ignore skipna

pandas/tests/frame/methods/test_replace.py

Lines changed: 80 additions & 0 deletions
@@ -1493,3 +1493,83 @@ def test_replace_period_ignore_float(self):
         result = df.replace(1.0, 0.0)
         expected = pd.DataFrame({"Per": [pd.Period("2020-01")] * 3})
         tm.assert_frame_equal(expected, result)
+
+    def test_replace_value_category_type(self):
+        """
+        Test for #23305: to ensure category dtypes are maintained
+        after replace with direct values
+        """
+
+        # create input data
+        input_dict = {
+            "col1": [1, 2, 3, 4],
+            "col2": ["a", "b", "c", "d"],
+            "col3": [1.5, 2.5, 3.5, 4.5],
+            "col4": ["cat1", "cat2", "cat3", "cat4"],
+            "col5": ["obj1", "obj2", "obj3", "obj4"],
+        }
+        # explicitly cast columns as category and order them
+        input_df = pd.DataFrame(data=input_dict).astype(
+            {"col2": "category", "col4": "category"}
+        )
+        input_df["col2"] = input_df["col2"].cat.reorder_categories(
+            ["a", "b", "c", "d"], ordered=True
+        )
+        input_df["col4"] = input_df["col4"].cat.reorder_categories(
+            ["cat1", "cat2", "cat3", "cat4"], ordered=True
+        )
+
+        # create expected dataframe
+        expected_dict = {
+            "col1": [1, 2, 3, 4],
+            "col2": ["a", "b", "c", "z"],
+            "col3": [1.5, 2.5, 3.5, 4.5],
+            "col4": ["cat1", "catX", "cat3", "cat4"],
+            "col5": ["obj9", "obj2", "obj3", "obj4"],
+        }
+        # explicitly cast columns as category and order them
+        expected = pd.DataFrame(data=expected_dict).astype(
+            {"col2": "category", "col4": "category"}
+        )
+        expected["col2"] = expected["col2"].cat.reorder_categories(
+            ["a", "b", "c", "z"], ordered=True
+        )
+        expected["col4"] = expected["col4"].cat.reorder_categories(
+            ["cat1", "catX", "cat3", "cat4"], ordered=True
+        )
+
+        # replace values in input dataframe
+        input_df = input_df.replace("d", "z")
+        input_df = input_df.replace("obj1", "obj9")
+        result = input_df.replace("cat2", "catX")
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.xfail(
+        reason="category dtype gets changed to object type after replace, see #35268",
+        strict=True,
+    )
+    def test_replace_dict_category_type(self, input_category_df, expected_category_df):
+        """
+        Test to ensure category dtypes are maintained
+        after replace with dict values
+        """
+
+        # create input dataframe
+        input_dict = {"col1": ["a"], "col2": ["obj1"], "col3": ["cat1"]}
+        # explicitly cast columns as category
+        input_df = pd.DataFrame(data=input_dict).astype(
+            {"col1": "category", "col2": "category", "col3": "category"}
+        )
+
+        # create expected dataframe
+        expected_dict = {"col1": ["z"], "col2": ["obj9"], "col3": ["catX"]}
+        # explicitly cast columns as category
+        expected = pd.DataFrame(data=expected_dict).astype(
+            {"col1": "category", "col2": "category", "col3": "category"}
+        )
+
+        # replace values in input dataframe using a dict
+        result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"})
+
+        tm.assert_frame_equal(result, expected)

pandas/tests/frame/test_arithmetic.py

Lines changed: 9 additions & 0 deletions
@@ -1552,3 +1552,12 @@ def test_dataframe_operation_with_non_numeric_types(df, col_dtype):
     expected = expected.astype({"b": col_dtype})
     result = df + pd.Series([-1.0], index=list("a"))
     tm.assert_frame_equal(result, expected)
+
+
+def test_arith_reindex_with_duplicates():
+    # https://github.com/pandas-dev/pandas/issues/35194
+    df1 = pd.DataFrame(data=[[0]], columns=["second"])
+    df2 = pd.DataFrame(data=[[0, 0, 0]], columns=["first", "second", "second"])
+    result = df1 + df2
+    expected = pd.DataFrame([[np.nan, 0, 0]], columns=["first", "second", "second"])
+    tm.assert_frame_equal(result, expected)

pandas/tests/groupby/test_quantile.py

Lines changed: 8 additions & 0 deletions
@@ -232,3 +232,11 @@ def test_groupby_quantile_nullable_array(values, q):

     expected = pd.Series(true_quantiles * 2, index=idx, name="b")
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
+def test_groupby_quantile_skips_invalid_dtype(q):
+    df = pd.DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
+    result = df.groupby("a").quantile(q)
+    expected = df.groupby("a")[["b"]].quantile(q)
+    tm.assert_frame_equal(result, expected)

pandas/tests/reshape/test_concat.py

Lines changed: 18 additions & 11 deletions
@@ -1087,20 +1087,27 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self):
         date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
         s = Series({"date": date, "a": 1.0, "b": 2.0})
         df = DataFrame(columns=["c", "d"])
-        result = df.append(s, ignore_index=True)
-        # n.b. it's not clear to me that expected is correct here.
-        # It's possible that the `date` column should have
-        # datetime64[ns, tz] dtype for both result and expected.
-        # that would be more consistent with new columns having
-        # their own dtype (float for a and b, datetime64ns, tz for date).
+        result_a = df.append(s, ignore_index=True)
         expected = DataFrame(
-            [[np.nan, np.nan, 1.0, 2.0, date]],
-            columns=["c", "d", "a", "b", "date"],
-            dtype=object,
+            [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
         )
         # These columns get cast to object after append
-        expected["a"] = expected["a"].astype(float)
-        expected["b"] = expected["b"].astype(float)
+        expected["c"] = expected["c"].astype(object)
+        expected["d"] = expected["d"].astype(object)
+        tm.assert_frame_equal(result_a, expected)
+
+        expected = DataFrame(
+            [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"]
+        )
+        expected["c"] = expected["c"].astype(object)
+        expected["d"] = expected["d"].astype(object)
+
+        result_b = result_a.append(s, ignore_index=True)
+        tm.assert_frame_equal(result_b, expected)
+
+        # column order is different
+        expected = expected[["c", "d", "date", "a", "b"]]
+        result = df.append([s, s], ignore_index=True)
         tm.assert_frame_equal(result, expected)