Skip to content

Commit 2c77567

Browse files
authored
DEPR: DataFrame dtype keyword match Series behavior (#49313)
1 parent 9c9789c commit 2c77567

File tree

6 files changed

+43
-103
lines changed

6 files changed

+43
-103
lines changed

doc/source/whatsnew/v2.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,8 @@ Removal of prior version deprecations/changes
252252
- Removed the ``display.column_space`` option in favor of ``df.to_string(col_space=...)`` (:issue:`47280`)
253253
- Removed the deprecated method ``mad`` from pandas classes (:issue:`11787`)
254254
- Removed the deprecated method ``tshift`` from pandas classes (:issue:`11631`)
255+
- Changed behavior of :class:`DataFrame` constructor given floating-point ``data`` and an integer ``dtype``: when the data cannot be cast losslessly, the floating-point dtype is now retained, matching :class:`Series` behavior (:issue:`41170`)
256+
- Changed behavior of :class:`DataFrame` constructor when passed a ``dtype`` (other than int) that the data cannot be cast to; it now raises instead of silently ignoring the dtype (:issue:`41733`)
255257
- Changed the behavior of :class:`Series` constructor; it will no longer infer a datetime64 or timedelta64 dtype from string entries (:issue:`41731`)
256258
- Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`)
257259

pandas/core/construction.py

Lines changed: 15 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,6 @@ def sanitize_array(
500500
index: Index | None,
501501
dtype: DtypeObj | None = None,
502502
copy: bool = False,
503-
raise_cast_failure: bool = True,
504503
*,
505504
allow_2d: bool = False,
506505
) -> ArrayLike:
@@ -514,19 +513,12 @@ def sanitize_array(
514513
index : Index or None, default None
515514
dtype : np.dtype, ExtensionDtype, or None, default None
516515
copy : bool, default False
517-
raise_cast_failure : bool, default True
518516
allow_2d : bool, default False
519517
If False, raise if we have a 2D Arraylike.
520518
521519
Returns
522520
-------
523521
np.ndarray or ExtensionArray
524-
525-
Notes
526-
-----
527-
raise_cast_failure=False is only intended to be passed when called from the
528-
DataFrame constructor, as the dtype keyword there may be interpreted as only
529-
applying to a subset of columns, see GH#24435.
530522
"""
531523
if isinstance(data, ma.MaskedArray):
532524
data = sanitize_masked_array(data)
@@ -564,7 +556,7 @@ def sanitize_array(
564556
# GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
565557
# casting aligning with IntCastingNaNError below
566558
with np.errstate(invalid="ignore"):
567-
subarr = _try_cast(data, dtype, copy, True)
559+
subarr = _try_cast(data, dtype, copy)
568560
except IntCastingNaNError:
569561
warnings.warn(
570562
"In a future version, passing float-dtype values containing NaN "
@@ -577,29 +569,18 @@ def sanitize_array(
577569
)
578570
subarr = np.array(data, copy=copy)
579571
except ValueError:
580-
if not raise_cast_failure:
581-
# i.e. called via DataFrame constructor
582-
warnings.warn(
583-
"In a future version, passing float-dtype values and an "
584-
"integer dtype to DataFrame will retain floating dtype "
585-
"if they cannot be cast losslessly (matching Series behavior). "
586-
"To retain the old behavior, use DataFrame(data).astype(dtype)",
587-
FutureWarning,
588-
stacklevel=find_stack_level(),
589-
)
590-
# GH#40110 until the deprecation is enforced, we _dont_
591-
# ignore the dtype for DataFrame, and _do_ cast even though
592-
# it is lossy.
593-
dtype = cast(np.dtype, dtype)
594-
return np.array(data, dtype=dtype, copy=copy)
572+
# Pre-2.0, we would have different behavior for Series vs DataFrame.
573+
# DataFrame would call np.array(data, dtype=dtype, copy=copy),
574+
# which would cast to the integer dtype even if the cast is lossy.
575+
# See GH#40110.
595576

596577
# We ignore the dtype arg and return floating values,
597578
# e.g. test_constructor_floating_data_int_dtype
598579
# TODO: where is the discussion that documents the reason for this?
599580
subarr = np.array(data, copy=copy)
600581
else:
601582
# we will try to copy by-definition here
602-
subarr = _try_cast(data, dtype, copy, raise_cast_failure)
583+
subarr = _try_cast(data, dtype, copy)
603584

604585
elif isinstance(data, ABCExtensionArray):
605586
# it is already ensured above this is not a PandasArray
@@ -624,7 +605,7 @@ def sanitize_array(
624605

625606
if dtype is not None or len(data) == 0:
626607
try:
627-
subarr = _try_cast(data, dtype, copy, raise_cast_failure)
608+
subarr = _try_cast(data, dtype, copy)
628609
except ValueError:
629610
if is_integer_dtype(dtype):
630611
casted = np.array(data, copy=False)
@@ -636,7 +617,6 @@ def sanitize_array(
636617
index,
637618
dtype,
638619
copy=False,
639-
raise_cast_failure=raise_cast_failure,
640620
allow_2d=allow_2d,
641621
)
642622
else:
@@ -750,7 +730,6 @@ def _try_cast(
750730
arr: list | np.ndarray,
751731
dtype: DtypeObj | None,
752732
copy: bool,
753-
raise_cast_failure: bool,
754733
) -> ArrayLike:
755734
"""
756735
Convert input to numpy ndarray and optionally cast to a given dtype.
@@ -762,9 +741,6 @@ def _try_cast(
762741
dtype : np.dtype, ExtensionDtype or None
763742
copy : bool
764743
If False, don't copy the data if not needed.
765-
raise_cast_failure : bool
766-
If True, and if a dtype is specified, raise errors during casting.
767-
Otherwise an object array is returned.
768744
769745
Returns
770746
-------
@@ -823,35 +799,15 @@ def _try_cast(
823799
elif dtype.kind in ["m", "M"]:
824800
return maybe_cast_to_datetime(arr, dtype)
825801

826-
try:
827-
# GH#15832: Check if we are requesting a numeric dtype and
828-
# that we can convert the data to the requested dtype.
829-
if is_integer_dtype(dtype):
830-
# this will raise if we have e.g. floats
802+
# GH#15832: Check if we are requesting a numeric dtype and
803+
# that we can convert the data to the requested dtype.
804+
elif is_integer_dtype(dtype):
805+
# this will raise if we have e.g. floats
806+
807+
subarr = maybe_cast_to_integer_array(arr, dtype)
808+
else:
809+
subarr = np.array(arr, dtype=dtype, copy=copy)
831810

832-
subarr = maybe_cast_to_integer_array(arr, dtype)
833-
else:
834-
# 4 tests fail if we move this to a try/except/else; see
835-
# test_constructor_compound_dtypes, test_constructor_cast_failure
836-
# test_constructor_dict_cast2, test_loc_setitem_dtype
837-
subarr = np.array(arr, dtype=dtype, copy=copy)
838-
839-
except (ValueError, TypeError):
840-
if raise_cast_failure:
841-
raise
842-
else:
843-
# we only get here with raise_cast_failure False, which means
844-
# called via the DataFrame constructor
845-
# GH#24435
846-
warnings.warn(
847-
f"Could not cast to {dtype}, falling back to object. This "
848-
"behavior is deprecated. In a future version, when a dtype is "
849-
"passed to 'DataFrame', either all columns will be cast to that "
850-
"dtype, or a TypeError will be raised.",
851-
FutureWarning,
852-
stacklevel=find_stack_level(),
853-
)
854-
subarr = np.array(arr, dtype=object, copy=copy)
855811
return subarr
856812

857813

pandas/core/internals/construction.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -331,14 +331,11 @@ def ndarray_to_mgr(
331331

332332
if dtype is not None and not is_dtype_equal(values.dtype, dtype):
333333
# GH#40110 see similar check inside sanitize_array
334-
rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")
335-
336334
values = sanitize_array(
337335
values,
338336
None,
339337
dtype=dtype,
340338
copy=copy_on_sanitize,
341-
raise_cast_failure=rcf,
342339
allow_2d=True,
343340
)
344341

@@ -615,9 +612,7 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
615612
val = dict(val)
616613
val = lib.fast_multiget(val, oindex._values, default=np.nan)
617614

618-
val = sanitize_array(
619-
val, index, dtype=dtype, copy=False, raise_cast_failure=False
620-
)
615+
val = sanitize_array(val, index, dtype=dtype, copy=False)
621616
com.require_length_match(val, index)
622617

623618
homogenized.append(val)

pandas/tests/frame/test_block_internals.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -259,11 +259,10 @@ def f(dtype):
259259
with pytest.raises(NotImplementedError, match=msg):
260260
f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")])
261261

262-
# these work (though results may be unexpected)
263-
depr_msg = "either all columns will be cast to that dtype, or a TypeError will"
264-
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
262+
# pre-2.0 these used to work (though results may be unexpected)
263+
with pytest.raises(TypeError, match="argument must be"):
265264
f("int64")
266-
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
265+
with pytest.raises(TypeError, match="argument must be"):
267266
f("float64")
268267

269268
# 10822

pandas/tests/frame/test_constructors.py

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -245,10 +245,11 @@ def test_constructor_mixed(self, float_string_frame):
245245
assert float_string_frame["foo"].dtype == np.object_
246246

247247
def test_constructor_cast_failure(self):
248-
msg = "either all columns will be cast to that dtype, or a TypeError will"
249-
with tm.assert_produces_warning(FutureWarning, match=msg):
250-
foo = DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64)
251-
assert foo["a"].dtype == object
248+
# as of 2.0, we raise if we can't respect "dtype"; previously we
249+
# silently ignored
250+
msg = "could not convert string to float"
251+
with pytest.raises(ValueError, match=msg):
252+
DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64)
252253

253254
# GH 3010, constructing with odd arrays
254255
df = DataFrame(np.ones((4, 2)))
@@ -753,13 +754,8 @@ def test_constructor_dict_cast2(self):
753754
"A": dict(zip(range(20), tm.makeStringIndex(20))),
754755
"B": dict(zip(range(15), np.random.randn(15))),
755756
}
756-
msg = "either all columns will be cast to that dtype, or a TypeError will"
757-
with tm.assert_produces_warning(FutureWarning, match=msg):
758-
frame = DataFrame(test_data, dtype=float)
759-
760-
assert len(frame) == 20
761-
assert frame["A"].dtype == np.object_
762-
assert frame["B"].dtype == np.float64
757+
with pytest.raises(ValueError, match="could not convert string"):
758+
DataFrame(test_data, dtype=float)
763759

764760
def test_constructor_dict_dont_upcast(self):
765761
d = {"Col1": {"Row1": "A String", "Row2": np.nan}}
@@ -2788,13 +2784,14 @@ def test_floating_values_integer_dtype(self):
27882784

27892785
arr = np.random.randn(10, 5)
27902786

2791-
msg = "if they cannot be cast losslessly"
2792-
with tm.assert_produces_warning(FutureWarning, match=msg):
2793-
DataFrame(arr, dtype="i8")
2787+
# as of 2.0, we match Series behavior by retaining float dtype instead
2788+
# of doing a lossy conversion here. Below we _do_ do the conversion
2789+
# since it is lossless.
2790+
df = DataFrame(arr, dtype="i8")
2791+
assert (df.dtypes == "f8").all()
27942792

2795-
with tm.assert_produces_warning(None):
2796-
# if they can be cast losslessly, no warning
2797-
DataFrame(arr.round(), dtype="i8")
2793+
df = DataFrame(arr.round(), dtype="i8")
2794+
assert (df.dtypes == "i8").all()
27982795

27992796
# with NaNs, we go through a different path with a different warning
28002797
arr[0, 0] = np.nan

pandas/tests/series/test_constructors.py

Lines changed: 8 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -782,25 +782,16 @@ def test_constructor_floating_data_int_dtype(self, frame_or_series):
782782
# GH#40110
783783
arr = np.random.randn(2)
784784

785-
if frame_or_series is Series:
786-
# Long-standing behavior has been to ignore the dtype on these;
787-
# not clear if this is what we want long-term
788-
expected = frame_or_series(arr)
789-
790-
res = frame_or_series(arr, dtype="i8")
791-
tm.assert_equal(res, expected)
785+
# Long-standing behavior (for Series, new in 2.0 for DataFrame)
786+
# has been to ignore the dtype on these;
787+
# not clear if this is what we want long-term
788+
expected = frame_or_series(arr)
792789

793-
res = frame_or_series(list(arr), dtype="i8")
794-
tm.assert_equal(res, expected)
790+
res = frame_or_series(arr, dtype="i8")
791+
tm.assert_equal(res, expected)
795792

796-
else:
797-
msg = "passing float-dtype values and an integer dtype"
798-
with tm.assert_produces_warning(FutureWarning, match=msg):
799-
# DataFrame will behave like Series
800-
frame_or_series(arr, dtype="i8")
801-
with tm.assert_produces_warning(FutureWarning, match=msg):
802-
# DataFrame will behave like Series
803-
frame_or_series(list(arr), dtype="i8")
793+
res = frame_or_series(list(arr), dtype="i8")
794+
tm.assert_equal(res, expected)
804795

805796
# When we have NaNs, we silently ignore the integer dtype
806797
arr[0] = np.nan

0 commit comments

Comments
 (0)