
Commit ac0a7f1

merge with master
2 parents: b954874 + 6509028

File tree: 19 files changed (+166, -61 lines)


doc/source/whatsnew/v1.1.0.rst

Lines changed: 3 additions & 0 deletions
@@ -922,6 +922,7 @@ Datetimelike
     resolution which converted to object dtype instead of coercing to ``datetime64[ns]``
     dtype when within the timestamp bounds (:issue:`34843`).
 - The ``freq`` keyword in :class:`Period`, :func:`date_range`, :func:`period_range`, :func:`pd.tseries.frequencies.to_offset` no longer allows tuples, pass as string instead (:issue:`34703`)
+- Bug in :meth:`DataFrame.append` when appending a :class:`Series` containing a scalar tz-aware :class:`Timestamp` to an empty :class:`DataFrame` resulted in an object column instead of datetime64[ns, tz] dtype (:issue:`35038`)
 - ``OutOfBoundsDatetime`` issues an improved error message when timestamp is out of implementation bounds. (:issue:`32967`)

 Timedelta
@@ -953,6 +954,7 @@ Numeric
 - Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`)
 - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
 - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
+- Bug in arithmetic operations between ``DataFrame`` objects with non-overlapping columns with duplicate labels causing an infinite loop (:issue:`35194`)
 - Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`)
 - Bug in :meth:`Index.difference` incorrect results when comparing a :class:`Float64Index` and object :class:`Index` (:issue:`35217`)
 - Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`)
@@ -1118,6 +1120,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`)
 - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`)
 - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`)
+- Bug in :meth:`core.groupby.DataFrameGroupBy.quantile` raises ``TypeError`` for non-numeric types rather than dropping columns (:issue:`27892`)
 - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`)
 - Bug in :meth:`DataFrameGroupBy.first` and :meth:`DataFrameGroupBy.last` that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`)
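
For context (not part of the diff), a minimal sketch of the behavior fixed by GH 35038; the column names here are illustrative:

```python
import pandas as pd

# Appending a Series holding a scalar tz-aware Timestamp to an empty
# DataFrame previously produced an object column; with this fix the
# column should come out as datetime64[ns, tz].
df = pd.DataFrame(columns=["a", "date"])
s = pd.Series({"a": 1.0, "date": pd.Timestamp("2018-10-24 07:30:00", tz="UTC")})

result = df.append(s, ignore_index=True)
print(result.dtypes)  # expect "date" -> datetime64[ns, UTC]
```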

environment.yml

Lines changed: 2 additions & 1 deletion
@@ -3,7 +3,8 @@ channels:
   - conda-forge
 dependencies:
   # required
-  - numpy>=1.15
+  # Pin numpy<1.19 until MPL 3.3.0 is released.
+  - numpy>=1.15,<1.19.0
   - python=3
   - python-dateutil>=2.7.3
   - pytz

pandas/core/arrays/base.py

Lines changed: 1 addition & 1 deletion
@@ -1120,7 +1120,7 @@ def _concat_same_type(
     # of objects
     _can_hold_na = True

-    def _reduce(self, name, skipna=True, **kwargs):
+    def _reduce(self, name: str, skipna: bool = True, **kwargs):
         """
         Return a scalar result of performing the reduction operation.

pandas/core/arrays/categorical.py

Lines changed: 2 additions & 2 deletions
@@ -2076,11 +2076,11 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
         return result

     # reduction ops #
-    def _reduce(self, name, axis=0, **kwargs):
+    def _reduce(self, name: str, skipna: bool = True, **kwargs):
         func = getattr(self, name, None)
         if func is None:
             raise TypeError(f"Categorical cannot perform the operation {name}")
-        return func(**kwargs)
+        return func(skipna=skipna, **kwargs)

     @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
     def min(self, skipna=True, **kwargs):
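
Several files in this commit converge on the same `_reduce(name, skipna, **kwargs)` signature. A short sketch of the user-visible path (my own illustration, not from the patch) through `Categorical._reduce`, which now forwards `skipna` explicitly:

```python
import pandas as pd

cat = pd.Categorical(["a", None, "c"], categories=["a", "b", "c"], ordered=True)
s = pd.Series(cat)

# Series reductions dispatch to Categorical._reduce, which passes
# skipna through to Categorical.min / Categorical.max.
print(s.min())              # "a"  (missing value skipped)
print(s.min(skipna=False))  # NaN  (missing value propagates)
```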

pandas/core/arrays/datetimelike.py

Lines changed: 1 addition & 1 deletion
@@ -1552,7 +1552,7 @@ def __isub__(self, other):
     # --------------------------------------------------------------
     # Reductions

-    def _reduce(self, name, axis=0, skipna=True, **kwargs):
+    def _reduce(self, name: str, skipna: bool = True, **kwargs):
         op = getattr(self, name, None)
         if op:
             return op(skipna=skipna, **kwargs)

pandas/core/arrays/sparse/array.py

Lines changed: 1 addition & 1 deletion
@@ -1164,7 +1164,7 @@ def nonzero(self):
     # Reductions
     # ------------------------------------------------------------------------

-    def _reduce(self, name, skipna=True, **kwargs):
+    def _reduce(self, name: str, skipna: bool = True, **kwargs):
         method = getattr(self, name, None)

         if method is None:

pandas/core/arrays/string_.py

Lines changed: 1 addition & 1 deletion
@@ -291,7 +291,7 @@ def astype(self, dtype, copy=True):

         return super().astype(dtype, copy)

-    def _reduce(self, name, skipna=True, **kwargs):
+    def _reduce(self, name: str, skipna: bool = True, **kwargs):
         if name in ["min", "max"]:
             return getattr(self, name)(skipna=skipna)

pandas/core/dtypes/concat.py

Lines changed: 2 additions & 2 deletions
@@ -152,11 +152,11 @@ def is_nonempty(x) -> bool:
         target_dtype = find_common_type([x.dtype for x in to_concat])
         to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]

-        if isinstance(to_concat[0], ExtensionArray):
+        if isinstance(to_concat[0], ExtensionArray) and axis == 0:
             cls = type(to_concat[0])
             return cls._concat_same_type(to_concat)
         else:
-            return np.concatenate(to_concat)
+            return np.concatenate(to_concat, axis=axis)

     elif _contains_datetime or "timedelta" in typs:
         return concat_datetime(to_concat, axis=axis, typs=typs)
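
A NumPy-only illustration (my own, assuming `to_concat` holds 2-D blocks when `axis=1`) of why the `axis` argument has to be forwarded here:

```python
import numpy as np

left = np.zeros((2, 3))
right = np.ones((2, 2))

# Forwarding axis=1 appends columns: shape (2, 5).
print(np.concatenate([left, right], axis=1).shape)

# Without the keyword, np.concatenate defaults to axis=0, which raises
# here because the column counts differ.
try:
    np.concatenate([left, right])
except ValueError as err:
    print("axis=0 default fails:", err)
```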

pandas/core/groupby/groupby.py

Lines changed: 14 additions & 3 deletions
@@ -2403,7 +2403,7 @@ def _get_cythonized_result(
             signature
         needs_2d : bool, default False
             Whether the values and result of the Cython call signature
-            are at least 2-dimensional.
+            are 2-dimensional.
         min_count : int, default None
             When not None, min_count for the Cython call
         needs_mask : bool, default False
@@ -2419,7 +2419,9 @@ def _get_cythonized_result(
             Function should return a tuple where the first element is the
             values to be passed to Cython and the second element is an optional
             type which the values should be converted to after being returned
-            by the Cython operation. Raises if `needs_values` is False.
+            by the Cython operation. This function is also responsible for
+            raising a TypeError if the values have an invalid type. Raises
+            if `needs_values` is False.
         post_processing : function, default None
             Function to be applied to result of Cython function. Should accept
             an array of values as the first argument and type inferences as its
@@ -2451,6 +2453,7 @@ def _get_cythonized_result(
         output: Dict[base.OutputKey, np.ndarray] = {}
         base_func = getattr(libgroupby, how)

+        error_msg = ""
         for idx, obj in enumerate(self._iterate_slices()):
             name = obj.name
             values = obj._values
@@ -2477,7 +2480,11 @@ def _get_cythonized_result(
             if needs_values:
                 vals = values
                 if pre_processing:
-                    vals, inferences = pre_processing(vals)
+                    try:
+                        vals, inferences = pre_processing(vals)
+                    except TypeError as e:
+                        error_msg = str(e)
+                        continue
                 if needs_2d:
                     vals = vals.reshape((-1, 1))
                 vals = vals.astype(cython_dtype, copy=False)
@@ -2509,6 +2516,10 @@ def _get_cythonized_result(
             key = base.OutputKey(label=name, position=idx)
             output[key] = result

+        # error_msg is "" on a frame/series with no rows or columns
+        if len(output) == 0 and error_msg != "":
+            raise TypeError(error_msg)
+
         if aggregate:
             return self._wrap_aggregated_output(output)
         else:
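
The net effect of the `error_msg` bookkeeping is visible through `GroupBy.quantile`, mirroring the test added further down (GH 27892):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 1], "b": [2.0, 3.0], "c": ["x", "y"]})

# The object column "c" fails pre_processing with a TypeError; instead
# of aborting, the column is skipped and the numeric result is returned.
print(df.groupby("a").quantile(0.5))

# A TypeError is still raised when no column survives, e.g.:
#   df[["a", "c"]].groupby("a").quantile(0.5)
```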

pandas/core/internals/concat.py

Lines changed: 1 addition & 1 deletion
@@ -333,7 +333,7 @@ def _concatenate_join_units(join_units, concat_axis, copy):
         # concatting with at least one EA means we are concatting a single column
         # the non-EA values are 2D arrays with shape (1, n)
         to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
-        concat_values = concat_compat(to_concat, axis=concat_axis)
+        concat_values = concat_compat(to_concat, axis=0)
         if not isinstance(concat_values, ExtensionArray):
             # if the result of concat is not an EA but an ndarray, reshape to
             # 2D to put it in a non-EA Block
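
A small sketch (my own, following the `(1, n)` comment above) of why `axis=0` can be hard-coded here: after taking `t[0, :]`, every non-EA value is 1-D, so axis 0 is the only meaningful concatenation axis:

```python
import numpy as np

block = np.arange(6).reshape(1, 6)  # non-EA block with shape (1, n)
row = block[0, :]                   # shape (n,), like the listcomp above

print(np.concatenate([row, row], axis=0).shape)  # (12,)
```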

pandas/core/ops/__init__.py

Lines changed: 19 additions & 4 deletions
@@ -17,6 +17,7 @@
 from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
 from pandas.core.dtypes.missing import isna

+from pandas.core import algorithms
 from pandas.core.construction import extract_array
 from pandas.core.ops.array_ops import (
     arithmetic_op,
@@ -562,18 +563,32 @@ def _frame_arith_method_with_reindex(
     DataFrame
     """
     # GH#31623, only operate on shared columns
-    cols = left.columns.intersection(right.columns)
+    cols, lcols, rcols = left.columns.join(
+        right.columns, how="inner", level=None, return_indexers=True
+    )

-    new_left = left[cols]
-    new_right = right[cols]
+    new_left = left.iloc[:, lcols]
+    new_right = right.iloc[:, rcols]
     result = op(new_left, new_right)

     # Do the join on the columns instead of using _align_method_FRAME
     # to avoid constructing two potentially large/sparse DataFrames
     join_columns, _, _ = left.columns.join(
         right.columns, how="outer", level=None, return_indexers=True
     )
-    return result.reindex(join_columns, axis=1)
+
+    if result.columns.has_duplicates:
+        # Avoid reindexing with a duplicate axis.
+        # https://github.com/pandas-dev/pandas/issues/35194
+        indexer, _ = result.columns.get_indexer_non_unique(join_columns)
+        indexer = algorithms.unique1d(indexer)
+        result = result._reindex_with_indexers(
+            {1: [join_columns, indexer]}, allow_dups=True
+        )
+    else:
+        result = result.reindex(join_columns, axis=1)
+
+    return result


 def _maybe_align_series_as_frame(frame: "DataFrame", series: "Series", axis: int):
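
Mirroring the regression test added in this commit (GH 35194):

```python
import pandas as pd

df1 = pd.DataFrame([[0]], columns=["second"])
df2 = pd.DataFrame([[0, 0, 0]], columns=["first", "second", "second"])

# Before this change, the outer-join reindex looped forever on the
# duplicated "second" label; now the addition completes.
result = df1 + df2
print(result)  # NaN for "first", 0 for both "second" columns
```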

pandas/tests/extension/arrow/arrays.py

Lines changed: 2 additions & 2 deletions
@@ -162,14 +162,14 @@ def _concat_same_type(cls, to_concat):
     def __invert__(self):
         return type(self).from_scalars(~self._data.to_pandas())

-    def _reduce(self, method, skipna=True, **kwargs):
+    def _reduce(self, name: str, skipna: bool = True, **kwargs):
         if skipna:
             arr = self[~self.isna()]
         else:
             arr = self

         try:
-            op = getattr(arr, method)
+            op = getattr(arr, name)
         except AttributeError as err:
             raise TypeError from err
         return op(**kwargs)

pandas/tests/extension/decimal/array.py

Lines changed: 1 addition & 1 deletion
@@ -174,7 +174,7 @@ def _formatter(self, boxed=False):
     def _concat_same_type(cls, to_concat):
         return cls(np.concatenate([x._data for x in to_concat]))

-    def _reduce(self, name, skipna=True, **kwargs):
+    def _reduce(self, name: str, skipna: bool = True, **kwargs):

         if skipna:
             # If we don't have any NAs, we can ignore skipna

pandas/tests/frame/methods/test_replace.py

Lines changed: 80 additions & 0 deletions
@@ -1493,3 +1493,83 @@ def test_replace_period_ignore_float(self):
         result = df.replace(1.0, 0.0)
         expected = pd.DataFrame({"Per": [pd.Period("2020-01")] * 3})
         tm.assert_frame_equal(expected, result)
+
+    def test_replace_value_category_type(self):
+        """
+        Test for #23305: to ensure category dtypes are maintained
+        after replace with direct values
+        """
+
+        # create input data
+        input_dict = {
+            "col1": [1, 2, 3, 4],
+            "col2": ["a", "b", "c", "d"],
+            "col3": [1.5, 2.5, 3.5, 4.5],
+            "col4": ["cat1", "cat2", "cat3", "cat4"],
+            "col5": ["obj1", "obj2", "obj3", "obj4"],
+        }
+        # explicitly cast columns as category and order them
+        input_df = pd.DataFrame(data=input_dict).astype(
+            {"col2": "category", "col4": "category"}
+        )
+        input_df["col2"] = input_df["col2"].cat.reorder_categories(
+            ["a", "b", "c", "d"], ordered=True
+        )
+        input_df["col4"] = input_df["col4"].cat.reorder_categories(
+            ["cat1", "cat2", "cat3", "cat4"], ordered=True
+        )
+
+        # create expected dataframe
+        expected_dict = {
+            "col1": [1, 2, 3, 4],
+            "col2": ["a", "b", "c", "z"],
+            "col3": [1.5, 2.5, 3.5, 4.5],
+            "col4": ["cat1", "catX", "cat3", "cat4"],
+            "col5": ["obj9", "obj2", "obj3", "obj4"],
+        }
+        # explicitly cast columns as category and order them
+        expected = pd.DataFrame(data=expected_dict).astype(
+            {"col2": "category", "col4": "category"}
+        )
+        expected["col2"] = expected["col2"].cat.reorder_categories(
+            ["a", "b", "c", "z"], ordered=True
+        )
+        expected["col4"] = expected["col4"].cat.reorder_categories(
+            ["cat1", "catX", "cat3", "cat4"], ordered=True
+        )
+
+        # replace values in input dataframe
+        input_df = input_df.replace("d", "z")
+        input_df = input_df.replace("obj1", "obj9")
+        result = input_df.replace("cat2", "catX")
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.xfail(
+        reason="category dtype gets changed to object type after replace, see #35268",
+        strict=True,
+    )
+    def test_replace_dict_category_type(self, input_category_df, expected_category_df):
+        """
+        Test to ensure category dtypes are maintained
+        after replace with dict values
+        """
+
+        # create input dataframe
+        input_dict = {"col1": ["a"], "col2": ["obj1"], "col3": ["cat1"]}
+        # explicitly cast columns as category
+        input_df = pd.DataFrame(data=input_dict).astype(
+            {"col1": "category", "col2": "category", "col3": "category"}
+        )
+
+        # create expected dataframe
+        expected_dict = {"col1": ["z"], "col2": ["obj9"], "col3": ["catX"]}
+        # explicitly cast columns as category
+        expected = pd.DataFrame(data=expected_dict).astype(
+            {"col1": "category", "col2": "category", "col3": "category"}
+        )
+
+        # replace values in input dataframe using a dict
+        result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"})
+
+        tm.assert_frame_equal(result, expected)

pandas/tests/frame/test_arithmetic.py

Lines changed: 9 additions & 0 deletions
@@ -1552,3 +1552,12 @@ def test_dataframe_operation_with_non_numeric_types(df, col_dtype):
     expected = expected.astype({"b": col_dtype})
     result = df + pd.Series([-1.0], index=list("a"))
     tm.assert_frame_equal(result, expected)
+
+
+def test_arith_reindex_with_duplicates():
+    # https://github.com/pandas-dev/pandas/issues/35194
+    df1 = pd.DataFrame(data=[[0]], columns=["second"])
+    df2 = pd.DataFrame(data=[[0, 0, 0]], columns=["first", "second", "second"])
+    result = df1 + df2
+    expected = pd.DataFrame([[np.nan, 0, 0]], columns=["first", "second", "second"])
+    tm.assert_frame_equal(result, expected)

pandas/tests/groupby/test_quantile.py

Lines changed: 8 additions & 0 deletions
@@ -232,3 +232,11 @@ def test_groupby_quantile_nullable_array(values, q):

     expected = pd.Series(true_quantiles * 2, index=idx, name="b")
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
+def test_groupby_quantile_skips_invalid_dtype(q):
+    df = pd.DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
+    result = df.groupby("a").quantile(q)
+    expected = df.groupby("a")[["b"]].quantile(q)
+    tm.assert_frame_equal(result, expected)

pandas/tests/reshape/test_concat.py

Lines changed: 18 additions & 11 deletions
@@ -1087,20 +1087,27 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self):
         date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
         s = Series({"date": date, "a": 1.0, "b": 2.0})
         df = DataFrame(columns=["c", "d"])
-        result = df.append(s, ignore_index=True)
-        # n.b. it's not clear to me that expected is correct here.
-        # It's possible that the `date` column should have
-        # datetime64[ns, tz] dtype for both result and expected.
-        # that would be more consistent with new columns having
-        # their own dtype (float for a and b, datetime64ns, tz for date).
+        result_a = df.append(s, ignore_index=True)
         expected = DataFrame(
-            [[np.nan, np.nan, 1.0, 2.0, date]],
-            columns=["c", "d", "a", "b", "date"],
-            dtype=object,
+            [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
         )
         # These columns get cast to object after append
-        expected["a"] = expected["a"].astype(float)
-        expected["b"] = expected["b"].astype(float)
+        expected["c"] = expected["c"].astype(object)
+        expected["d"] = expected["d"].astype(object)
+        tm.assert_frame_equal(result_a, expected)
+
+        expected = DataFrame(
+            [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"]
+        )
+        expected["c"] = expected["c"].astype(object)
+        expected["d"] = expected["d"].astype(object)
+
+        result_b = result_a.append(s, ignore_index=True)
+        tm.assert_frame_equal(result_b, expected)
+
+        # column order is different
+        expected = expected[["c", "d", "date", "a", "b"]]
+        result = df.append([s, s], ignore_index=True)
         tm.assert_frame_equal(result, expected)