pandas-dev · rhshadrach · May 8, 2024 · May 12, 2024 · May 15, 2024 · Jun 2, 2024
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -642,8 +642,8 @@ def sanitize_array(
         data = list(data)
 
         if len(data) == 0 and dtype is None:
-            # We default to float64, matching numpy
-            subarr = np.array([], dtype=np.float64)
+            # We default to object, diverging from NumPy
+            subarr = np.array([], dtype=np.object_)
 
         elif dtype is not None:
             subarr = _try_cast(data, dtype, copy)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -82,6 +82,7 @@
     can_hold_element,
     construct_1d_arraylike_from_scalar,
     construct_2d_arraylike_from_scalar,
+    ensure_dtype_can_hold_na,
     find_common_type,
     infer_dtype_from_scalar,
     invalidate_string_dtypes,
@@ -13052,6 +13053,8 @@ def quantile(
         C        1 days 12:00:00
         Name: 0.5, dtype: object
         """
+        from pandas.core.dtypes.common import is_object_dtype
+
         validate_percentile(q)
         axis = self._get_axis_number(axis)
 
@@ -13066,23 +13069,25 @@ def quantile(
                 interpolation=interpolation,
                 method=method,
             )
-            if method == "single":
-                res = res_df.iloc[0]
-            else:
-                # cannot directly iloc over sparse arrays
-                res = res_df.T.iloc[:, 0]
+            res = res_df.iloc[0]
             if axis == 1 and len(self) == 0:
                 # GH#41544 try to get an appropriate dtype
-                dtype = find_common_type(list(self.dtypes))
-                if needs_i8_conversion(dtype):
-                    return res.astype(dtype)
+                dtype = "float64"
+                cdtype = find_common_type(list(self.dtypes))
+                if needs_i8_conversion(cdtype) or is_object_dtype(cdtype):
+                    dtype = cdtype
+                return res.astype(dtype)
             return res
 
         q = Index(q, dtype=np.float64)
         data = self._get_numeric_data() if numeric_only else self
 
         if axis == 1:
             data = data.T
+            if data.shape[0] == 0:
+                # The transpose has no rows, so the original has no columns, meaning we
+                # have no dtype information. Since this is quantile, default to float64
+                data = data.astype("float64")
 
         if len(data.columns) == 0:
             # GH#23925 _get_numeric_data may have dropped all columns
@@ -13092,7 +13097,7 @@ def quantile(
             if axis == 1:
                 # GH#41544 try to get an appropriate dtype
                 cdtype = find_common_type(list(self.dtypes))
-                if needs_i8_conversion(cdtype):
+                if needs_i8_conversion(cdtype) or is_object_dtype(cdtype):
                     dtype = cdtype
 
             res = self._constructor([], index=q, columns=cols, dtype=dtype)
@@ -13103,6 +13108,21 @@ def quantile(
             raise ValueError(
                 f"Invalid method: {method}. Method must be in {valid_method}."
             )
+
+        # handle degenerate case
+        if len(data) == 0:
+            from pandas import array
+
+            result = self._constructor(
+                {
+                    idx: array(len(q) * [np.nan], dtype=ensure_dtype_can_hold_na(dtype))
+                    for idx, dtype in enumerate(data.dtypes)
+                },
+                index=q,
+            )
+            result.columns = data.columns
+            return result
+
         if method == "single":
             res = data._mgr.quantile(qs=q, interpolation=interpolation)
         elif method == "table":
@@ -13112,13 +13132,6 @@ def quantile(
                     f"Invalid interpolation: {interpolation}. "
                     f"Interpolation must be in {valid_interpolation}"
                 )
-            # handle degenerate case
-            if len(data) == 0:
-                if data.ndim == 2:
-                    dtype = find_common_type(list(self.dtypes))
-                else:
-                    dtype = self.dtype
-                return self._constructor([], index=q, columns=data.columns, dtype=dtype)
 
             q_idx = np.quantile(np.arange(len(data)), q, method=interpolation)
 

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -578,7 +578,7 @@ def _transform_general(
             concatenated = concat(results, ignore_index=True)
             result = self._set_result_index_ordered(concatenated)
         else:
-            result = self.obj._constructor(dtype=np.float64)
+            result = self.obj._constructor(dtype=self.obj.dtype)
 
         result.name = self.obj.name
         return result

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1778,7 +1778,7 @@ def as_array(
         passed_nan = lib.is_float(na_value) and isna(na_value)
 
         if len(self.blocks) == 0:
-            arr = np.empty(self.shape, dtype=float)
+            arr = np.empty(self.shape, dtype=object)
             return arr.transpose()
 
         if self.is_single_block:

diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py
@@ -122,9 +122,9 @@ def test_compare_categorical_with_missing(self, a1, a2, categories):
         "na_value, dtype",
         [
             (pd.NaT, "datetime64[s]"),
-            (None, "float64"),
+            (None, "object"),
             (np.nan, "float64"),
-            (pd.NA, "float64"),
+            (pd.NA, "object"),
         ],
     )
     def test_categorical_only_missing_values_no_cast(self, na_value, dtype):

diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
@@ -81,7 +81,7 @@ def test_quantile(self, datetime_frame, interp_method, request):
     def test_empty(self, interp_method):
         interpolation, method = interp_method
         q = DataFrame({"x": [], "y": []}).quantile(
-            0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method
+            0.1, axis=0, interpolation=interpolation, method=method
         )
         assert np.isnan(q["x"]) and np.isnan(q["y"])
 
@@ -320,7 +320,9 @@ def test_quantile_multi_empty(self, interp_method):
             [0.1, 0.9], axis=0, interpolation=interpolation, method=method
         )
         expected = DataFrame(
-            {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9]
+            {"x": [np.nan, np.nan], "y": [np.nan, np.nan]},
+            index=[0.1, 0.9],
+            dtype="object",
         )
         tm.assert_frame_equal(result, expected)
 
@@ -688,10 +690,8 @@ def test_quantile_empty_no_rows_dt64(self, interp_method):
         res = df.quantile(
             0.5, numeric_only=False, interpolation=interpolation, method=method
         )
-        exp = exp.astype(object)
-        if interpolation == "nearest":
-            # GH#18463 TODO: would we prefer NaTs here?
-            exp = exp.fillna(np.nan)
+        # GH#18463 expected is object dtype with missing values as NaT
+        exp = exp.astype(object).fillna(pd.NaT)
         tm.assert_series_equal(res, exp)
 
         # both dt64tz

diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py
@@ -77,7 +77,7 @@ def test_setitem_reset_index_dtypes(self):
         df1["d"] = []
         result = df1.reset_index()
         expected = DataFrame(columns=["a", "b", "c", "d"], index=range(0)).astype(
-            {"a": "datetime64[ns]", "b": np.int64, "c": np.float64, "d": np.float64}
+            {"a": "datetime64[ns]", "b": np.int64, "c": np.float64, "d": np.object_}
         )
         tm.assert_frame_equal(result, expected)
 

diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
@@ -1627,7 +1627,7 @@ def test_min_max_dt64_api_consistency_empty_df(self):
         # check DataFrame/Series api consistency when calling min/max on an empty
         # DataFrame/Series.
         df = DataFrame({"x": []})
-        expected_float_series = Series([], dtype=float)
+        expected_float_series = Series([], dtype=object)
         # check axis 0
         assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min())
         assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max())

diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py
@@ -1433,11 +1433,12 @@ def test_stack_timezone_aware_values(future_stack):
 def test_stack_empty_frame(dropna, future_stack):
     # GH 36113
     levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)]
-    expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []]))
+    expected = Series(dtype=np.object_, index=MultiIndex(levels=levels, codes=[[], []]))
     if future_stack and dropna is not lib.no_default:
         with pytest.raises(ValueError, match="dropna must be unspecified"):
             DataFrame(dtype=np.float64).stack(dropna=dropna, future_stack=future_stack)
     else:
+        # dtype=np.float64 is lost since there are no columns
         result = DataFrame(dtype=np.float64).stack(
             dropna=dropna, future_stack=future_stack
         )
@@ -1627,7 +1628,9 @@ def test_unstack(self, multiindex_year_month_day_dataframe_random_data):
             (
                 [[1, 1, None, None, 30.0], [2, None, None, None, 30.0]],
                 ["ix1", "ix2", "col1", "col2", "col3"],
-                None,
+                # Nones are used as floats in the presence of numeric data,
+                # resulting in np.nan for index level 1.
+                np.nan,
                 [None, None, 30.0],
             ),
         ],
@@ -1639,10 +1642,12 @@ def test_unstack_partial(
         # https://github.com/pandas-dev/pandas/issues/19351
         # make sure DataFrame.unstack() works when its run on a subset of the DataFrame
         # and the Index levels contain values that are not present in the subset
-        result = DataFrame(result_rows, columns=result_columns).set_index(
-            ["ix1", "ix2"]
+        data = (
+            DataFrame(result_rows, columns=result_columns)
+            .set_index(["ix1", "ix2"])
+            .iloc[1:2]
         )
-        result = result.iloc[1:2].unstack("ix2")
+        result = data.unstack("ix2")
         expected = DataFrame(
             [expected_row],
             columns=MultiIndex.from_product(

diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py
@@ -192,7 +192,12 @@ def test_quantile_missing_group_values_no_segfaults():
         ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
         (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
         ([0], [42], [0], [42.0]),
-        ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
+        (
+            np.array([], dtype="float64"),
+            np.array([], dtype="float64"),
+            np.array([], dtype="float64"),
+            np.array([], dtype="float64"),
+        ),
     ],
 )
 def test_quantile_missing_group_values_correct_results(

diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -1492,9 +1492,7 @@ def test_empty_df(method, op):
     group = getattr(gb, "b")
 
     result = getattr(group, method)(op)
-    expected = Series(
-        [], name="b", dtype="float64", index=Index([], dtype="float64", name="a")
-    )
+    expected = Series([], name="b", index=Index([], name="a"))
 
     tm.assert_series_equal(result, expected)
 

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -1116,10 +1116,10 @@ def convert_force_pure(x):
 def test_groupby_dtype_inference_empty():
     # GH 6733
     df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")})
-    assert df["x"].dtype == np.float64
+    assert df["x"].dtype == np.object_
 
     result = df.groupby("x").first()
-    exp_index = Index([], name="x", dtype=np.float64)
+    exp_index = Index([], name="x", dtype=np.object_)
     expected = DataFrame({"range": Series([], index=exp_index, dtype="int64")})
     tm.assert_frame_equal(result, expected, by_blocks=True)
 

diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
@@ -737,22 +737,9 @@ def test_list_grouper_with_nat(self):
     @pytest.mark.parametrize(
         "func,expected",
         [
-            (
-                "transform",
-                Series(name=2, dtype=np.float64),
-            ),
-            (
-                "agg",
-                Series(
-                    name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
-                ),
-            ),
-            (
-                "apply",
-                Series(
-                    name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
-                ),
-            ),
+            ("transform", Series(name=2)),
+            ("agg", Series(name=2, index=Index([], name=1))),
+            ("apply", Series(name=2, index=Index([], name=1))),
         ],
     )
     def test_evaluate_with_empty_groups(self, func, expected):

diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py
@@ -119,7 +119,6 @@ def test_partial_set_empty_frame3(self):
         expected = DataFrame(
             columns=Index(["foo"], dtype=object), index=Index([], dtype="int64")
         )
-        expected["foo"] = expected["foo"].astype("float64")
 
         df = DataFrame(index=Index([], dtype="int64"))
         df["foo"] = []
@@ -128,6 +127,11 @@ def test_partial_set_empty_frame3(self):
 
         df = DataFrame(index=Index([], dtype="int64"))
         df["foo"] = Series(np.arange(len(df)), dtype="float64")
+        expected = DataFrame(
+            columns=Index(["foo"], dtype=object),
+            index=Index([], dtype="int64"),
+            dtype="float64",
+        )
 
         tm.assert_frame_equal(df, expected)
 

diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
@@ -2010,7 +2010,7 @@ def test_resample_empty_series_with_tz():
     expected_idx = DatetimeIndex(
         [], freq="2MS", name="ts", dtype="datetime64[ns, Atlantic/Faroe]"
     )
-    expected = Series([], index=expected_idx, name="values", dtype="float64")
+    expected = Series([], index=expected_idx, name="values")
     tm.assert_series_equal(result, expected)
 
 

diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
@@ -572,7 +572,7 @@ def test_concat_empty_and_non_empty_frame_regression():
     # GH 18178 regression test
     df1 = DataFrame({"foo": [1]})
     df2 = DataFrame({"foo": []})
-    expected = DataFrame({"foo": [1.0]})
+    expected = DataFrame({"foo": [1]}, dtype="object")
     result = concat([df1, df2])
     tm.assert_frame_equal(result, expected)
 

diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py
@@ -90,7 +90,7 @@ def test_concat_empty_series_timelike(self, tz, values):
         expected = DataFrame(
             {
                 0: Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz),
-                1: values,
+                1: Series(values, dtype=dtype),
             }
         )
         result = concat([first, second], axis=1)

diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
@@ -944,7 +944,14 @@ def test_invalid_separator(self):
             "A": [],
             "B": [],
         }
-        expected = DataFrame(exp_data).astype({"year": np.int64})
+        expected = DataFrame(exp_data).astype(
+            {
+                "A2010": np.float64,
+                "A2011": np.float64,
+                "B2010": np.float64,
+                "year": np.int64,
+            }
+        )
         expected = expected.set_index(["id", "year"])[
             ["X", "A2010", "A2011", "B2010", "A", "B"]
         ]
@@ -1007,7 +1014,14 @@ def test_invalid_suffixtype(self):
             "A": [],
             "B": [],
         }
-        expected = DataFrame(exp_data).astype({"year": np.int64})
+        expected = DataFrame(exp_data).astype(
+            {
+                "Aone": np.float64,
+                "Atwo": np.float64,
+                "Bone": np.float64,
+                "year": np.int64,
+            }
+        )
 
         expected = expected.set_index(["id", "year"])
         expected.index = expected.index.set_levels([0, 1], level=0)
@@ -1231,7 +1245,7 @@ def test_missing_stubname(self, dtype):
             name=("id", "num"),
         )
         expected = DataFrame(
-            {"a": [100, 200, 300, 400], "b": [np.nan] * 4},
+            {"a": [100, 200, 300, 400], "b": pd.Series([np.nan] * 4, dtype="object")},
             index=index,
         )
         new_level = expected.index.levels[0].astype(dtype)

diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -1412,7 +1412,7 @@ def test_constructor_dict_tuple_indexer(self):
         data = {(1, 1, None): -1.0}
         result = Series(data)
         expected = Series(
-            -1.0, index=MultiIndex(levels=[[1], [1], [np.nan]], codes=[[0], [0], [-1]])
+            -1.0, index=MultiIndex(levels=[[1], [1], []], codes=[[0], [0], [-1]])
         )
         tm.assert_series_equal(result, expected)