Merge remote-tracking branch 'upstream/master'

arw2019 · arw2019 · commit b954874fec74 · 2020-07-16T17:19:21.000Z
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -954,6 +954,7 @@ Numeric
 - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
 - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
 - Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`)
+- Bug in :meth:`Index.difference` returning incorrect results when comparing a :class:`Float64Index` and an object-dtype :class:`Index` (:issue:`35217`)
 - Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`)
 
 Conversion
@@ -1113,10 +1114,6 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`)
 - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`)
 - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`)
-- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` where the output index shape for functions returning a DataFrame which is equally indexed
-  to the input DataFrame is inconsistent. An internal heuristic to detect index mutation would behave differently for equal but not identical
-  indices. In particular, the result index shape might change if a copy of the input would be returned.
-  The behaviour now is consistent, independent of internal heuristics. (:issue:`31612`, :issue:`14927`, :issue:`13056`)
 - Bug in :meth:`SeriesGroupBy.agg` where any column name was accepted in the named aggregation of ``SeriesGroupBy`` previously. The behaviour now allows only ``str`` and callables else would raise ``TypeError``. (:issue:`34422`)
 - Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`)
 - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`)
@@ -1162,7 +1159,7 @@ Sparse
 - Creating a :class:`SparseArray` from timezone-aware dtype will issue a warning before dropping timezone information, instead of doing so silently (:issue:`32501`)
 - Bug in :meth:`arrays.SparseArray.from_spmatrix` wrongly read scipy sparse matrix (:issue:`31991`)
 - Bug in :meth:`Series.sum` with ``SparseArray`` raises ``TypeError`` (:issue:`25777`)
-- Bug where :class:`DataFrame` containing :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`)
+- Bug where a :class:`DataFrame` containing an all-sparse :class:`SparseArray` was filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`)
 - The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. Previously it used ``fill_value``'s  string representation (:issue:`34352`)
 - Bug where empty :class:`DataFrame` could not be cast to :class:`SparseDtype` (:issue:`33113`)
 - Bug in :meth:`arrays.SparseArray` was returning the incorrect type when indexing a sparse dataframe with an iterable (:issue:`34526`, :issue:`34540`)
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
@@ -366,7 +366,7 @@ def apply_frame_axis0(object frame, object f, object names,
             # Need to infer if low level index slider will cause segfaults
             require_slow_apply = i == 0 and piece is chunk
             try:
-                if not piece.index.equals(chunk.index):
+                if not piece.index is chunk.index:
                     mutated = True
             except AttributeError:
                 # `piece` might not have an index, could be e.g. an int
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -862,21 +862,26 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
             else:
                 raise IndexError("cannot do a non-empty take from an empty axes.")
 
+        # sp_indexer may be -1 for two reasons
+        # 1.) we took for an index of -1 (new)
+        # 2.) we took a value that was self.fill_value (old)
         sp_indexer = self.sp_index.lookup_array(indices)
+        new_fill_indices = indices == -1
+        old_fill_indices = (sp_indexer == -1) & ~new_fill_indices
 
-        if self.sp_index.npoints == 0:
+        if self.sp_index.npoints == 0 and old_fill_indices.all():
+            # We've looked up all valid points on an all-sparse array.
+            taken = np.full(
+                sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
+            )
+
+        elif self.sp_index.npoints == 0:
             # Avoid taking from the empty self.sp_values
             _dtype = np.result_type(self.dtype.subtype, type(fill_value))
             taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
         else:
             taken = self.sp_values.take(sp_indexer)
 
-            # sp_indexer may be -1 for two reasons
-            # 1.) we took for an index of -1 (new)
-            # 2.) we took a value that was self.fill_value (old)
-            new_fill_indices = indices == -1
-            old_fill_indices = (sp_indexer == -1) & ~new_fill_indices
-
             # Fill in two steps.
             # Old fill values
             # New fill values
diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
@@ -400,28 +400,6 @@ def _format_native_types(
         )
         return formatter.get_result_as_array()
 
-    def equals(self, other) -> bool:
-        """
-        Determines if two Index objects contain the same elements.
-        """
-        if self is other:
-            return True
-
-        if not isinstance(other, Index):
-            return False
-
-        # need to compare nans locations and make sure that they are the same
-        # since nans don't compare equal this is a bit tricky
-        try:
-            if not isinstance(other, Float64Index):
-                other = self._constructor(other)
-            if not is_dtype_equal(self.dtype, other.dtype) or self.shape != other.shape:
-                return False
-            left, right = self._values, other._values
-            return ((left == right) | (self._isnan & other._isnan)).all()
-        except (TypeError, ValueError):
-            return False
-
     def __contains__(self, other: Any) -> bool:
         hash(other)
         if super().__contains__(other):
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -1636,10 +1636,7 @@ def _holder(self):
     @property
     def fill_value(self):
         # Used in reindex_indexer
-        if is_sparse(self.values):
-            return self.values.dtype.fill_value
-        else:
-            return self.values.dtype.na_value
+        return self.values.dtype.na_value
 
     @property
     def _can_hold_na(self):
diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
@@ -281,6 +281,11 @@ def test_take(self):
         exp = SparseArray(np.take(self.arr_data, [0, 1, 2]))
         tm.assert_sp_array_equal(self.arr.take([0, 1, 2]), exp)
 
+    def test_take_all_empty(self):
+        a = pd.array([0, 0], dtype=pd.SparseDtype("int64"))
+        result = a.take([0, 1], allow_fill=True, fill_value=np.nan)
+        tm.assert_sp_array_equal(a, result)
+
     def test_take_fill_value(self):
         data = np.array([1, np.nan, 0, 3, 0])
         sparse = SparseArray(data, fill_value=0)
diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py
@@ -399,31 +399,3 @@ def test_item(self, data):
 
         with pytest.raises(ValueError, match=msg):
             s.item()
-
-    def test_boolean_mask_frame_fill_value(self, data):
-        # https://github.com/pandas-dev/pandas/issues/27781
-        df = pd.DataFrame({"A": data})
-
-        mask = np.random.choice([True, False], df.shape[0])
-        result = pd.isna(df.iloc[mask]["A"])
-        expected = pd.isna(df["A"].iloc[mask])
-        self.assert_series_equal(result, expected)
-
-        mask = pd.Series(mask, index=df.index)
-        result = pd.isna(df.loc[mask]["A"])
-        expected = pd.isna(df["A"].loc[mask])
-        self.assert_series_equal(result, expected)
-
-    def test_fancy_index_frame_fill_value(self, data):
-        # https://github.com/pandas-dev/pandas/issues/29563
-        df = pd.DataFrame({"A": data})
-
-        mask = np.random.choice(df.shape[0], df.shape[0])
-        result = pd.isna(df.iloc[mask]["A"])
-        expected = pd.isna(df["A"].iloc[mask])
-        self.assert_series_equal(result, expected)
-
-        mask = pd.Series(mask, index=df.index)
-        result = pd.isna(df.loc[mask]["A"])
-        expected = pd.isna(df["A"].loc[mask])
-        self.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
@@ -41,11 +41,6 @@ def data_for_twos(request):
     return SparseArray(np.ones(100) * 2)
 
 
-@pytest.fixture(params=[0, np.nan])
-def data_zeros(request):
-    return SparseArray(np.zeros(100, dtype=int), fill_value=request.param)
-
-
 @pytest.fixture(params=[0, np.nan])
 def data_missing(request):
     """Length 2 array with [NA, Valid]"""
diff --git a/pandas/tests/frame/indexing/test_sparse.py b/pandas/tests/frame/indexing/test_sparse.py
@@ -49,3 +49,23 @@ def test_locindexer_from_spmatrix(self, spmatrix_t, dtype):
         result = df.loc[itr_idx].dtypes.values
         expected = np.full(cols, SparseDtype(dtype, fill_value=0))
         tm.assert_numpy_array_equal(result, expected)
+
+    def test_reindex(self):
+        # https://github.com/pandas-dev/pandas/issues/35286
+        df = pd.DataFrame(
+            {"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))}
+        )
+        result = df.reindex([0, 2])
+        expected = pd.DataFrame(
+            {
+                "A": [0.0, np.nan],
+                "B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)),
+            },
+            index=[0, 2],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_all_sparse(self):
+        df = pd.DataFrame({"A": pd.array([0, 0], dtype=pd.SparseDtype("int64"))})
+        result = df.loc[[0, 1]]
+        tm.assert_frame_equal(result, df)
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -211,6 +211,7 @@ def test_group_apply_once_per_group2(capsys):
     assert result == expected
 
 
+@pytest.mark.xfail(reason="GH-34998")
 def test_apply_fast_slow_identical():
     # GH 31613
 
@@ -234,9 +235,11 @@ def fast(group):
     "func",
     [
         lambda x: x,
-        lambda x: x[:],
+        pytest.param(lambda x: x[:], marks=pytest.mark.xfail(reason="GH-34998")),
         lambda x: x.copy(deep=False),
-        lambda x: x.copy(deep=True),
+        pytest.param(
+            lambda x: x.copy(deep=True), marks=pytest.mark.xfail(reason="GH-34998")
+        ),
     ],
 )
 def test_groupby_apply_identity_maybecopy_index_identical(func):
@@ -997,6 +1000,7 @@ def test_apply_function_with_indexing_return_column():
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(reason="GH-34998")
 def test_apply_with_timezones_aware():
     # GH: 27212
 
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -85,6 +85,24 @@ def test_max_min_non_numeric():
     assert "ss" in result
 
 
+def test_min_date_with_nans():
+    # GH26321
+    dates = pd.to_datetime(
+        pd.Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d"
+    ).dt.date
+    df = pd.DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates})
+
+    result = df.groupby("b", as_index=False)["c"].min()["c"]
+    expected = pd.to_datetime(
+        pd.Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d"
+    ).dt.date
+    tm.assert_series_equal(result, expected)
+
+    result = df.groupby("b")["c"].min()
+    expected.index.name = "b"
+    tm.assert_series_equal(result, expected)
+
+
 def test_intercept_builtin_sum():
     s = Series([1.0, 2.0, np.nan, 3.0])
     grouped = s.groupby([0, 1, 2, 2])
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
@@ -239,6 +239,19 @@ def test_equals_numeric(self):
         i2 = Float64Index([1.0, np.nan])
         assert i.equals(i2)
 
+    @pytest.mark.parametrize(
+        "other",
+        (
+            Int64Index([1, 2]),
+            Index([1.0, 2.0], dtype=object),
+            Index([1, 2], dtype=object),
+        ),
+    )
+    def test_equals_numeric_other_index_type(self, other):
+        i = Float64Index([1.0, 2.0])
+        assert i.equals(other)
+        assert other.equals(i)
+
     @pytest.mark.parametrize(
         "vals",
         [
@@ -635,3 +648,27 @@ def test_uint_index_does_not_convert_to_float64():
     tm.assert_index_equal(result.index, expected)
 
     tm.assert_equal(result, series[:3])
+
+
+def test_float64_index_equals():
+    # https://github.com/pandas-dev/pandas/issues/35217
+    float_index = pd.Index([1.0, 2, 3])
+    string_index = pd.Index(["1", "2", "3"])
+
+    result = float_index.equals(string_index)
+    assert result is False
+
+    result = string_index.equals(float_index)
+    assert result is False
+
+
+def test_float64_index_difference():
+    # https://github.com/pandas-dev/pandas/issues/35217
+    float_index = pd.Index([1.0, 2, 3])
+    string_index = pd.Index(["1", "2", "3"])
+
+    result = float_index.difference(string_index)
+    tm.assert_index_equal(result, float_index)
+
+    result = string_index.difference(float_index)
+    tm.assert_index_equal(result, string_index)
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
@@ -7,7 +7,7 @@
 import pandas.util._test_decorators as td
 
 import pandas as pd
-from pandas import DataFrame, Series, date_range
+from pandas import DataFrame, Series, compat, date_range
 import pandas._testing as tm
 from pandas.core.window import Rolling
 
@@ -150,6 +150,7 @@ def test_closed_one_entry(func):
 
 
 @pytest.mark.parametrize("func", ["min", "max"])
+@pytest.mark.xfail(not compat.IS64, reason="GH-35294")
 def test_closed_one_entry_groupby(func):
     # GH24718
     ser = pd.DataFrame(
@@ -682,6 +683,7 @@ def test_iter_rolling_datetime(expected, expected_index, window):
         ),
     ],
 )
+@pytest.mark.xfail(not compat.IS64, reason="GH-35294")
 def test_rolling_positional_argument(grouping, _index, raw):
     # GH 34605