Skip to content

Commit b954874

Browse files
committed
Merge remote-tracking branch 'upstream/master'
2 parents b59831e + 4cf8c5f commit b954874

File tree

13 files changed

+105
-75
lines changed

13 files changed

+105
-75
lines changed

doc/source/whatsnew/v1.1.0.rst

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -954,6 +954,7 @@ Numeric
954954
- Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
955955
- Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
956956
- Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`)
957+
- Bug in :meth:`Index.difference` returning incorrect results when comparing a :class:`Float64Index` and object :class:`Index` (:issue:`35217`)
957958
- Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`)
958959

959960
Conversion
@@ -1113,10 +1114,6 @@ Groupby/resample/rolling
11131114
- Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`)
11141115
- Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`)
11151116
- Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`)
1116-
- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` where the output index shape for functions returning a DataFrame which is equally indexed
1117-
to the input DataFrame is inconsistent. An internal heuristic to detect index mutation would behave differently for equal but not identical
1118-
indices. In particular, the result index shape might change if a copy of the input would be returned.
1119-
The behaviour now is consistent, independent of internal heuristics. (:issue:`31612`, :issue:`14927`, :issue:`13056`)
11201117
- Bug in :meth:`SeriesGroupBy.agg` where any column name was previously accepted in the named aggregation of ``SeriesGroupBy``. Now only ``str`` and callables are allowed; anything else raises ``TypeError``. (:issue:`34422`)
11211118
- Bug in :meth:`DataFrame.groupby` losing the index when one of the ``agg`` keys referenced an empty list (:issue:`32580`)
11221119
- Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`)
@@ -1162,7 +1159,7 @@ Sparse
11621159
- Creating a :class:`SparseArray` from timezone-aware dtype will issue a warning before dropping timezone information, instead of doing so silently (:issue:`32501`)
11631160
- Bug in :meth:`arrays.SparseArray.from_spmatrix` incorrectly reading a scipy sparse matrix (:issue:`31991`)
11641161
- Bug in :meth:`Series.sum` with ``SparseArray`` raising ``TypeError`` (:issue:`25777`)
1165-
- Bug where :class:`DataFrame` containing :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`)
1162+
- Bug where a :class:`DataFrame` containing an all-sparse :class:`SparseArray` was filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`)
11661163
- The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. Previously it used ``fill_value``'s string representation (:issue:`34352`)
11671164
- Bug where empty :class:`DataFrame` could not be cast to :class:`SparseDtype` (:issue:`33113`)
11681165
- Bug in :meth:`arrays.SparseArray` returning the incorrect type when indexing a sparse dataframe with an iterable (:issue:`34526`, :issue:`34540`)

pandas/_libs/reduction.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,7 @@ def apply_frame_axis0(object frame, object f, object names,
366366
# Need to infer if low level index slider will cause segfaults
367367
require_slow_apply = i == 0 and piece is chunk
368368
try:
369-
if not piece.index.equals(chunk.index):
369+
if not piece.index is chunk.index:
370370
mutated = True
371371
except AttributeError:
372372
# `piece` might not have an index, could be e.g. an int

pandas/core/arrays/sparse/array.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -862,21 +862,26 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
862862
else:
863863
raise IndexError("cannot do a non-empty take from an empty axes.")
864864

865+
# sp_indexer may be -1 for two reasons
866+
# 1.) we took for an index of -1 (new)
867+
# 2.) we took a value that was self.fill_value (old)
865868
sp_indexer = self.sp_index.lookup_array(indices)
869+
new_fill_indices = indices == -1
870+
old_fill_indices = (sp_indexer == -1) & ~new_fill_indices
866871

867-
if self.sp_index.npoints == 0:
872+
if self.sp_index.npoints == 0 and old_fill_indices.all():
873+
# We've looked up all valid points on an all-sparse array.
874+
taken = np.full(
875+
sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
876+
)
877+
878+
elif self.sp_index.npoints == 0:
868879
# Avoid taking from the empty self.sp_values
869880
_dtype = np.result_type(self.dtype.subtype, type(fill_value))
870881
taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
871882
else:
872883
taken = self.sp_values.take(sp_indexer)
873884

874-
# sp_indexer may be -1 for two reasons
875-
# 1.) we took for an index of -1 (new)
876-
# 2.) we took a value that was self.fill_value (old)
877-
new_fill_indices = indices == -1
878-
old_fill_indices = (sp_indexer == -1) & ~new_fill_indices
879-
880885
# Fill in two steps.
881886
# Old fill values
882887
# New fill values

pandas/core/indexes/numeric.py

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -400,28 +400,6 @@ def _format_native_types(
400400
)
401401
return formatter.get_result_as_array()
402402

403-
def equals(self, other) -> bool:
404-
"""
405-
Determines if two Index objects contain the same elements.
406-
"""
407-
if self is other:
408-
return True
409-
410-
if not isinstance(other, Index):
411-
return False
412-
413-
# need to compare nans locations and make sure that they are the same
414-
# since nans don't compare equal this is a bit tricky
415-
try:
416-
if not isinstance(other, Float64Index):
417-
other = self._constructor(other)
418-
if not is_dtype_equal(self.dtype, other.dtype) or self.shape != other.shape:
419-
return False
420-
left, right = self._values, other._values
421-
return ((left == right) | (self._isnan & other._isnan)).all()
422-
except (TypeError, ValueError):
423-
return False
424-
425403
def __contains__(self, other: Any) -> bool:
426404
hash(other)
427405
if super().__contains__(other):

pandas/core/internals/blocks.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1636,10 +1636,7 @@ def _holder(self):
16361636
@property
16371637
def fill_value(self):
16381638
# Used in reindex_indexer
1639-
if is_sparse(self.values):
1640-
return self.values.dtype.fill_value
1641-
else:
1642-
return self.values.dtype.na_value
1639+
return self.values.dtype.na_value
16431640

16441641
@property
16451642
def _can_hold_na(self):

pandas/tests/arrays/sparse/test_array.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,11 @@ def test_take(self):
281281
exp = SparseArray(np.take(self.arr_data, [0, 1, 2]))
282282
tm.assert_sp_array_equal(self.arr.take([0, 1, 2]), exp)
283283

284+
def test_take_all_empty(self):
285+
a = pd.array([0, 0], dtype=pd.SparseDtype("int64"))
286+
result = a.take([0, 1], allow_fill=True, fill_value=np.nan)
287+
tm.assert_sp_array_equal(a, result)
288+
284289
def test_take_fill_value(self):
285290
data = np.array([1, np.nan, 0, 3, 0])
286291
sparse = SparseArray(data, fill_value=0)

pandas/tests/extension/base/getitem.py

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -399,31 +399,3 @@ def test_item(self, data):
399399

400400
with pytest.raises(ValueError, match=msg):
401401
s.item()
402-
403-
def test_boolean_mask_frame_fill_value(self, data):
404-
# https://github.com/pandas-dev/pandas/issues/27781
405-
df = pd.DataFrame({"A": data})
406-
407-
mask = np.random.choice([True, False], df.shape[0])
408-
result = pd.isna(df.iloc[mask]["A"])
409-
expected = pd.isna(df["A"].iloc[mask])
410-
self.assert_series_equal(result, expected)
411-
412-
mask = pd.Series(mask, index=df.index)
413-
result = pd.isna(df.loc[mask]["A"])
414-
expected = pd.isna(df["A"].loc[mask])
415-
self.assert_series_equal(result, expected)
416-
417-
def test_fancy_index_frame_fill_value(self, data):
418-
# https://github.com/pandas-dev/pandas/issues/29563
419-
df = pd.DataFrame({"A": data})
420-
421-
mask = np.random.choice(df.shape[0], df.shape[0])
422-
result = pd.isna(df.iloc[mask]["A"])
423-
expected = pd.isna(df["A"].iloc[mask])
424-
self.assert_series_equal(result, expected)
425-
426-
mask = pd.Series(mask, index=df.index)
427-
result = pd.isna(df.loc[mask]["A"])
428-
expected = pd.isna(df["A"].loc[mask])
429-
self.assert_series_equal(result, expected)

pandas/tests/extension/test_sparse.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,6 @@ def data_for_twos(request):
4141
return SparseArray(np.ones(100) * 2)
4242

4343

44-
@pytest.fixture(params=[0, np.nan])
45-
def data_zeros(request):
46-
return SparseArray(np.zeros(100, dtype=int), fill_value=request.param)
47-
48-
4944
@pytest.fixture(params=[0, np.nan])
5045
def data_missing(request):
5146
"""Length 2 array with [NA, Valid]"""

pandas/tests/frame/indexing/test_sparse.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,23 @@ def test_locindexer_from_spmatrix(self, spmatrix_t, dtype):
4949
result = df.loc[itr_idx].dtypes.values
5050
expected = np.full(cols, SparseDtype(dtype, fill_value=0))
5151
tm.assert_numpy_array_equal(result, expected)
52+
53+
def test_reindex(self):
54+
# https://github.com/pandas-dev/pandas/issues/35286
55+
df = pd.DataFrame(
56+
{"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))}
57+
)
58+
result = df.reindex([0, 2])
59+
expected = pd.DataFrame(
60+
{
61+
"A": [0.0, np.nan],
62+
"B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)),
63+
},
64+
index=[0, 2],
65+
)
66+
tm.assert_frame_equal(result, expected)
67+
68+
def test_all_sparse(self):
69+
df = pd.DataFrame({"A": pd.array([0, 0], dtype=pd.SparseDtype("int64"))})
70+
result = df.loc[[0, 1]]
71+
tm.assert_frame_equal(result, df)

pandas/tests/groupby/test_apply.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ def test_group_apply_once_per_group2(capsys):
211211
assert result == expected
212212

213213

214+
@pytest.mark.xfail(reason="GH-34998")
214215
def test_apply_fast_slow_identical():
215216
# GH 31613
216217

@@ -234,9 +235,11 @@ def fast(group):
234235
"func",
235236
[
236237
lambda x: x,
237-
lambda x: x[:],
238+
pytest.param(lambda x: x[:], marks=pytest.mark.xfail(reason="GH-34998")),
238239
lambda x: x.copy(deep=False),
239-
lambda x: x.copy(deep=True),
240+
pytest.param(
241+
lambda x: x.copy(deep=True), marks=pytest.mark.xfail(reason="GH-34998")
242+
),
240243
],
241244
)
242245
def test_groupby_apply_identity_maybecopy_index_identical(func):
@@ -997,6 +1000,7 @@ def test_apply_function_with_indexing_return_column():
9971000
tm.assert_frame_equal(result, expected)
9981001

9991002

1003+
@pytest.mark.xfail(reason="GH-34998")
10001004
def test_apply_with_timezones_aware():
10011005
# GH: 27212
10021006

pandas/tests/groupby/test_function.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,24 @@ def test_max_min_non_numeric():
8585
assert "ss" in result
8686

8787

88+
def test_min_date_with_nans():
89+
# GH26321
90+
dates = pd.to_datetime(
91+
pd.Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d"
92+
).dt.date
93+
df = pd.DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates})
94+
95+
result = df.groupby("b", as_index=False)["c"].min()["c"]
96+
expected = pd.to_datetime(
97+
pd.Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d"
98+
).dt.date
99+
tm.assert_series_equal(result, expected)
100+
101+
result = df.groupby("b")["c"].min()
102+
expected.index.name = "b"
103+
tm.assert_series_equal(result, expected)
104+
105+
88106
def test_intercept_builtin_sum():
89107
s = Series([1.0, 2.0, np.nan, 3.0])
90108
grouped = s.groupby([0, 1, 2, 2])

pandas/tests/indexes/test_numeric.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,19 @@ def test_equals_numeric(self):
239239
i2 = Float64Index([1.0, np.nan])
240240
assert i.equals(i2)
241241

242+
@pytest.mark.parametrize(
243+
"other",
244+
(
245+
Int64Index([1, 2]),
246+
Index([1.0, 2.0], dtype=object),
247+
Index([1, 2], dtype=object),
248+
),
249+
)
250+
def test_equals_numeric_other_index_type(self, other):
251+
i = Float64Index([1.0, 2.0])
252+
assert i.equals(other)
253+
assert other.equals(i)
254+
242255
@pytest.mark.parametrize(
243256
"vals",
244257
[
@@ -635,3 +648,27 @@ def test_uint_index_does_not_convert_to_float64():
635648
tm.assert_index_equal(result.index, expected)
636649

637650
tm.assert_equal(result, series[:3])
651+
652+
653+
def test_float64_index_equals():
654+
# https://github.com/pandas-dev/pandas/issues/35217
655+
float_index = pd.Index([1.0, 2, 3])
656+
string_index = pd.Index(["1", "2", "3"])
657+
658+
result = float_index.equals(string_index)
659+
assert result is False
660+
661+
result = string_index.equals(float_index)
662+
assert result is False
663+
664+
665+
def test_float64_index_difference():
666+
# https://github.com/pandas-dev/pandas/issues/35217
667+
float_index = pd.Index([1.0, 2, 3])
668+
string_index = pd.Index(["1", "2", "3"])
669+
670+
result = float_index.difference(string_index)
671+
tm.assert_index_equal(result, float_index)
672+
673+
result = string_index.difference(float_index)
674+
tm.assert_index_equal(result, string_index)

pandas/tests/window/test_rolling.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import pandas.util._test_decorators as td
88

99
import pandas as pd
10-
from pandas import DataFrame, Series, date_range
10+
from pandas import DataFrame, Series, compat, date_range
1111
import pandas._testing as tm
1212
from pandas.core.window import Rolling
1313

@@ -150,6 +150,7 @@ def test_closed_one_entry(func):
150150

151151

152152
@pytest.mark.parametrize("func", ["min", "max"])
153+
@pytest.mark.xfail(not compat.IS64, reason="GH-35294")
153154
def test_closed_one_entry_groupby(func):
154155
# GH24718
155156
ser = pd.DataFrame(
@@ -682,6 +683,7 @@ def test_iter_rolling_datetime(expected, expected_index, window):
682683
),
683684
],
684685
)
686+
@pytest.mark.xfail(not compat.IS64, reason="GH-35294")
685687
def test_rolling_positional_argument(grouping, _index, raw):
686688
# GH 34605
687689

0 commit comments

Comments
 (0)