Skip to content

BUG: Empty DataFrame constructor dtype #58669

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -642,8 +642,8 @@ def sanitize_array(
data = list(data)

if len(data) == 0 and dtype is None:
# We default to float64, matching numpy
subarr = np.array([], dtype=np.float64)
# We default to object, diverging from NumPy
subarr = np.array([], dtype=np.object_)

elif dtype is not None:
subarr = _try_cast(data, dtype, copy)
Expand Down
45 changes: 29 additions & 16 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
can_hold_element,
construct_1d_arraylike_from_scalar,
construct_2d_arraylike_from_scalar,
ensure_dtype_can_hold_na,
find_common_type,
infer_dtype_from_scalar,
invalidate_string_dtypes,
Expand Down Expand Up @@ -13052,6 +13053,8 @@ def quantile(
C 1 days 12:00:00
Name: 0.5, dtype: object
"""
from pandas.core.dtypes.common import is_object_dtype

validate_percentile(q)
axis = self._get_axis_number(axis)

Expand All @@ -13066,23 +13069,25 @@ def quantile(
interpolation=interpolation,
method=method,
)
if method == "single":
res = res_df.iloc[0]
else:
# cannot directly iloc over sparse arrays
res = res_df.T.iloc[:, 0]
res = res_df.iloc[0]
if axis == 1 and len(self) == 0:
# GH#41544 try to get an appropriate dtype
dtype = find_common_type(list(self.dtypes))
if needs_i8_conversion(dtype):
return res.astype(dtype)
dtype = "float64"
cdtype = find_common_type(list(self.dtypes))
if needs_i8_conversion(cdtype) or is_object_dtype(cdtype):
dtype = cdtype
return res.astype(dtype)
return res

q = Index(q, dtype=np.float64)
data = self._get_numeric_data() if numeric_only else self

if axis == 1:
data = data.T
if data.shape[0] == 0:
# The transpose has no rows, so the original has no columns, meaning we
# have no dtype information. Since this is quantile, default to float64
data = data.astype("float64")

if len(data.columns) == 0:
# GH#23925 _get_numeric_data may have dropped all columns
Expand All @@ -13092,7 +13097,7 @@ def quantile(
if axis == 1:
# GH#41544 try to get an appropriate dtype
cdtype = find_common_type(list(self.dtypes))
if needs_i8_conversion(cdtype):
if needs_i8_conversion(cdtype) or is_object_dtype(cdtype):
dtype = cdtype

res = self._constructor([], index=q, columns=cols, dtype=dtype)
Expand All @@ -13103,6 +13108,21 @@ def quantile(
raise ValueError(
f"Invalid method: {method}. Method must be in {valid_method}."
)

# handle degenerate case
if len(data) == 0:
from pandas import array

result = self._constructor(
{
idx: array(len(q) * [np.nan], dtype=ensure_dtype_can_hold_na(dtype))
for idx, dtype in enumerate(data.dtypes)
},
index=q,
)
result.columns = data.columns
return result

if method == "single":
res = data._mgr.quantile(qs=q, interpolation=interpolation)
elif method == "table":
Expand All @@ -13112,13 +13132,6 @@ def quantile(
f"Invalid interpolation: {interpolation}. "
f"Interpolation must be in {valid_interpolation}"
)
# handle degenerate case
if len(data) == 0:
if data.ndim == 2:
dtype = find_common_type(list(self.dtypes))
else:
dtype = self.dtype
return self._constructor([], index=q, columns=data.columns, dtype=dtype)

q_idx = np.quantile(np.arange(len(data)), q, method=interpolation)

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@ def _transform_general(
concatenated = concat(results, ignore_index=True)
result = self._set_result_index_ordered(concatenated)
else:
result = self.obj._constructor(dtype=np.float64)
result = self.obj._constructor(dtype=self.obj.dtype)

result.name = self.obj.name
return result
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1778,7 +1778,7 @@ def as_array(
passed_nan = lib.is_float(na_value) and isna(na_value)

if len(self.blocks) == 0:
arr = np.empty(self.shape, dtype=float)
arr = np.empty(self.shape, dtype=object)
return arr.transpose()

if self.is_single_block:
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/categorical/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,9 @@ def test_compare_categorical_with_missing(self, a1, a2, categories):
"na_value, dtype",
[
(pd.NaT, "datetime64[s]"),
(None, "float64"),
(None, "object"),
(np.nan, "float64"),
(pd.NA, "float64"),
(pd.NA, "object"),
],
)
def test_categorical_only_missing_values_no_cast(self, na_value, dtype):
Expand Down
12 changes: 6 additions & 6 deletions pandas/tests/frame/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_quantile(self, datetime_frame, interp_method, request):
def test_empty(self, interp_method):
interpolation, method = interp_method
q = DataFrame({"x": [], "y": []}).quantile(
0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method
0.1, axis=0, interpolation=interpolation, method=method
)
assert np.isnan(q["x"]) and np.isnan(q["y"])

Expand Down Expand Up @@ -320,7 +320,9 @@ def test_quantile_multi_empty(self, interp_method):
[0.1, 0.9], axis=0, interpolation=interpolation, method=method
)
expected = DataFrame(
{"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9]
{"x": [np.nan, np.nan], "y": [np.nan, np.nan]},
index=[0.1, 0.9],
dtype="object",
)
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -688,10 +690,8 @@ def test_quantile_empty_no_rows_dt64(self, interp_method):
res = df.quantile(
0.5, numeric_only=False, interpolation=interpolation, method=method
)
exp = exp.astype(object)
if interpolation == "nearest":
# GH#18463 TODO: would we prefer NaTs here?
exp = exp.fillna(np.nan)
# GH#18463 TODO: would we prefer NaTs here?
exp = exp.astype(object).fillna(pd.NaT)
tm.assert_series_equal(res, exp)

# both dt64tz
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def test_setitem_reset_index_dtypes(self):
df1["d"] = []
result = df1.reset_index()
expected = DataFrame(columns=["a", "b", "c", "d"], index=range(0)).astype(
{"a": "datetime64[ns]", "b": np.int64, "c": np.float64, "d": np.float64}
{"a": "datetime64[ns]", "b": np.int64, "c": np.float64, "d": np.object_}
)
tm.assert_frame_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1627,7 +1627,7 @@ def test_min_max_dt64_api_consistency_empty_df(self):
# check DataFrame/Series api consistency when calling min/max on an empty
# DataFrame/Series.
df = DataFrame({"x": []})
expected_float_series = Series([], dtype=float)
expected_float_series = Series([], dtype=object)
# check axis 0
assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min())
assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max())
Expand Down
15 changes: 10 additions & 5 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1433,11 +1433,12 @@ def test_stack_timezone_aware_values(future_stack):
def test_stack_empty_frame(dropna, future_stack):
# GH 36113
levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)]
expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []]))
expected = Series(dtype=np.object_, index=MultiIndex(levels=levels, codes=[[], []]))
if future_stack and dropna is not lib.no_default:
with pytest.raises(ValueError, match="dropna must be unspecified"):
DataFrame(dtype=np.float64).stack(dropna=dropna, future_stack=future_stack)
else:
# dtype=np.float64 is lost since there are no columns
result = DataFrame(dtype=np.float64).stack(
dropna=dropna, future_stack=future_stack
)
Expand Down Expand Up @@ -1627,7 +1628,9 @@ def test_unstack(self, multiindex_year_month_day_dataframe_random_data):
(
[[1, 1, None, None, 30.0], [2, None, None, None, 30.0]],
["ix1", "ix2", "col1", "col2", "col3"],
None,
# Nones are used as floats in the presence of numeric data,
# resulting in np.nan for index level 1.
np.nan,
[None, None, 30.0],
),
],
Expand All @@ -1639,10 +1642,12 @@ def test_unstack_partial(
# https://github.com/pandas-dev/pandas/issues/19351
# make sure DataFrame.unstack() works when its run on a subset of the DataFrame
# and the Index levels contain values that are not present in the subset
result = DataFrame(result_rows, columns=result_columns).set_index(
["ix1", "ix2"]
data = (
DataFrame(result_rows, columns=result_columns)
.set_index(["ix1", "ix2"])
.iloc[1:2]
)
result = result.iloc[1:2].unstack("ix2")
result = data.unstack("ix2")
expected = DataFrame(
[expected_row],
columns=MultiIndex.from_product(
Expand Down
7 changes: 6 additions & 1 deletion pandas/tests/groupby/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,12 @@ def test_quantile_missing_group_values_no_segfaults():
([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
(["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
([0], [42], [0], [42.0]),
([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
(
np.array([], dtype="float64"),
np.array([], dtype="float64"),
np.array([], dtype="float64"),
np.array([], dtype="float64"),
),
],
)
def test_quantile_missing_group_values_correct_results(
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1492,9 +1492,7 @@ def test_empty_df(method, op):
group = getattr(gb, "b")

result = getattr(group, method)(op)
expected = Series(
[], name="b", dtype="float64", index=Index([], dtype="float64", name="a")
)
expected = Series([], name="b", index=Index([], name="a"))

tm.assert_series_equal(result, expected)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1116,10 +1116,10 @@ def convert_force_pure(x):
def test_groupby_dtype_inference_empty():
# GH 6733
df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")})
assert df["x"].dtype == np.float64
assert df["x"].dtype == np.object_

result = df.groupby("x").first()
exp_index = Index([], name="x", dtype=np.float64)
exp_index = Index([], name="x", dtype=np.object_)
expected = DataFrame({"range": Series([], index=exp_index, dtype="int64")})
tm.assert_frame_equal(result, expected, by_blocks=True)

Expand Down
19 changes: 3 additions & 16 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -737,22 +737,9 @@ def test_list_grouper_with_nat(self):
@pytest.mark.parametrize(
"func,expected",
[
(
"transform",
Series(name=2, dtype=np.float64),
),
(
"agg",
Series(
name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
),
),
(
"apply",
Series(
name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
),
),
("transform", Series(name=2)),
("agg", Series(name=2, index=Index([], name=1))),
("apply", Series(name=2, index=Index([], name=1))),
],
)
def test_evaluate_with_empty_groups(self, func, expected):
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/indexing/test_partial.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ def test_partial_set_empty_frame3(self):
expected = DataFrame(
columns=Index(["foo"], dtype=object), index=Index([], dtype="int64")
)
expected["foo"] = expected["foo"].astype("float64")

df = DataFrame(index=Index([], dtype="int64"))
df["foo"] = []
Expand All @@ -128,6 +127,11 @@ def test_partial_set_empty_frame3(self):

df = DataFrame(index=Index([], dtype="int64"))
df["foo"] = Series(np.arange(len(df)), dtype="float64")
expected = DataFrame(
columns=Index(["foo"], dtype=object),
index=Index([], dtype="int64"),
dtype="float64",
)

tm.assert_frame_equal(df, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2010,7 +2010,7 @@ def test_resample_empty_series_with_tz():
expected_idx = DatetimeIndex(
[], freq="2MS", name="ts", dtype="datetime64[ns, Atlantic/Faroe]"
)
expected = Series([], index=expected_idx, name="values", dtype="float64")
expected = Series([], index=expected_idx, name="values")
tm.assert_series_equal(result, expected)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/reshape/concat/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ def test_concat_empty_and_non_empty_frame_regression():
# GH 18178 regression test
df1 = DataFrame({"foo": [1]})
df2 = DataFrame({"foo": []})
expected = DataFrame({"foo": [1.0]})
expected = DataFrame({"foo": [1]}, dtype="object")
result = concat([df1, df2])
tm.assert_frame_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/reshape/concat/test_empty.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def test_concat_empty_series_timelike(self, tz, values):
expected = DataFrame(
{
0: Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz),
1: values,
1: Series(values, dtype=dtype),
}
)
result = concat([first, second], axis=1)
Expand Down
20 changes: 17 additions & 3 deletions pandas/tests/reshape/test_melt.py
Original file line number Diff line number Diff line change
Expand Up @@ -944,7 +944,14 @@ def test_invalid_separator(self):
"A": [],
"B": [],
}
expected = DataFrame(exp_data).astype({"year": np.int64})
expected = DataFrame(exp_data).astype(
{
"A2010": np.float64,
"A2011": np.float64,
"B2010": np.float64,
"year": np.int64,
}
)
expected = expected.set_index(["id", "year"])[
["X", "A2010", "A2011", "B2010", "A", "B"]
]
Expand Down Expand Up @@ -1007,7 +1014,14 @@ def test_invalid_suffixtype(self):
"A": [],
"B": [],
}
expected = DataFrame(exp_data).astype({"year": np.int64})
expected = DataFrame(exp_data).astype(
{
"Aone": np.float64,
"Atwo": np.float64,
"Bone": np.float64,
"year": np.int64,
}
)

expected = expected.set_index(["id", "year"])
expected.index = expected.index.set_levels([0, 1], level=0)
Expand Down Expand Up @@ -1231,7 +1245,7 @@ def test_missing_stubname(self, dtype):
name=("id", "num"),
)
expected = DataFrame(
{"a": [100, 200, 300, 400], "b": [np.nan] * 4},
{"a": [100, 200, 300, 400], "b": pd.Series([np.nan] * 4, dtype="object")},
index=index,
)
new_level = expected.index.levels[0].astype(dtype)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1412,7 +1412,7 @@ def test_constructor_dict_tuple_indexer(self):
data = {(1, 1, None): -1.0}
result = Series(data)
expected = Series(
-1.0, index=MultiIndex(levels=[[1], [1], [np.nan]], codes=[[0], [0], [-1]])
-1.0, index=MultiIndex(levels=[[1], [1], []], codes=[[0], [0], [-1]])
)
tm.assert_series_equal(result, expected)

Expand Down
Loading
Loading