DEPR: DataFrame dtype keyword match Series behavior (#49313)

jbrockmendel · web-flow · commit 2c775676b7f9 · 2022-10-26T13:34:00.000-07:00
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -252,6 +252,8 @@ Removal of prior version deprecations/changes
 - Removed the ``display.column_space`` option in favor of ``df.to_string(col_space=...)`` (:issue:`47280`)
 - Removed the deprecated method ``mad`` from pandas classes (:issue:`11787`)
 - Removed the deprecated method ``tshift`` from pandas classes (:issue:`11631`)
+- Changed behavior of :class:`DataFrame` constructor given floating-point ``data`` and an integer ``dtype``: when the data cannot be cast losslessly, the floating-point dtype is retained, matching :class:`Series` behavior (:issue:`41170`)
+- Changed behavior of :class:`DataFrame` constructor when passed a ``dtype`` (other than int) that the data cannot be cast to; it now raises instead of silently ignoring the dtype (:issue:`41733`)
 - Changed the behavior of :class:`Series` constructor, it will no longer infer a datetime64 or timedelta64 dtype from string entries (:issue:`41731`)
 - Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`)
 
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -500,7 +500,6 @@ def sanitize_array(
     index: Index | None,
     dtype: DtypeObj | None = None,
     copy: bool = False,
-    raise_cast_failure: bool = True,
     *,
     allow_2d: bool = False,
 ) -> ArrayLike:
@@ -514,19 +513,12 @@ def sanitize_array(
     index : Index or None, default None
     dtype : np.dtype, ExtensionDtype, or None, default None
     copy : bool, default False
-    raise_cast_failure : bool, default True
     allow_2d : bool, default False
         If False, raise if we have a 2D Arraylike.
 
     Returns
     -------
     np.ndarray or ExtensionArray
-
-    Notes
-    -----
-    raise_cast_failure=False is only intended to be True when called from the
-    DataFrame constructor, as the dtype keyword there may be interpreted as only
-    applying to a subset of columns, see GH#24435.
     """
     if isinstance(data, ma.MaskedArray):
         data = sanitize_masked_array(data)
@@ -564,7 +556,7 @@ def sanitize_array(
                 # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                 # casting aligning with IntCastingNaNError below
                 with np.errstate(invalid="ignore"):
-                    subarr = _try_cast(data, dtype, copy, True)
+                    subarr = _try_cast(data, dtype, copy)
             except IntCastingNaNError:
                 warnings.warn(
                     "In a future version, passing float-dtype values containing NaN "
@@ -577,29 +569,18 @@ def sanitize_array(
                 )
                 subarr = np.array(data, copy=copy)
             except ValueError:
-                if not raise_cast_failure:
-                    # i.e. called via DataFrame constructor
-                    warnings.warn(
-                        "In a future version, passing float-dtype values and an "
-                        "integer dtype to DataFrame will retain floating dtype "
-                        "if they cannot be cast losslessly (matching Series behavior). "
-                        "To retain the old behavior, use DataFrame(data).astype(dtype)",
-                        FutureWarning,
-                        stacklevel=find_stack_level(),
-                    )
-                    # GH#40110 until the deprecation is enforced, we _dont_
-                    #  ignore the dtype for DataFrame, and _do_ cast even though
-                    #  it is lossy.
-                    dtype = cast(np.dtype, dtype)
-                    return np.array(data, dtype=dtype, copy=copy)
+                # Pre-2.0, we would have different behavior for Series vs DataFrame.
+                #  DataFrame would call np.array(data, dtype=dtype, copy=copy),
+                #  which would cast to the integer dtype even if the cast is lossy.
+                #  See GH#40110.
 
                 # We ignore the dtype arg and return floating values,
                 #  e.g. test_constructor_floating_data_int_dtype
                 # TODO: where is the discussion that documents the reason for this?
                 subarr = np.array(data, copy=copy)
         else:
             # we will try to copy by-definition here
-            subarr = _try_cast(data, dtype, copy, raise_cast_failure)
+            subarr = _try_cast(data, dtype, copy)
 
     elif isinstance(data, ABCExtensionArray):
         # it is already ensured above this is not a PandasArray
@@ -624,7 +605,7 @@ def sanitize_array(
 
         if dtype is not None or len(data) == 0:
             try:
-                subarr = _try_cast(data, dtype, copy, raise_cast_failure)
+                subarr = _try_cast(data, dtype, copy)
             except ValueError:
                 if is_integer_dtype(dtype):
                     casted = np.array(data, copy=False)
@@ -636,7 +617,6 @@ def sanitize_array(
                             index,
                             dtype,
                             copy=False,
-                            raise_cast_failure=raise_cast_failure,
                             allow_2d=allow_2d,
                         )
                     else:
@@ -750,7 +730,6 @@ def _try_cast(
     arr: list | np.ndarray,
     dtype: DtypeObj | None,
     copy: bool,
-    raise_cast_failure: bool,
 ) -> ArrayLike:
     """
     Convert input to numpy ndarray and optionally cast to a given dtype.
@@ -762,9 +741,6 @@ def _try_cast(
     dtype : np.dtype, ExtensionDtype or None
     copy : bool
         If False, don't copy the data if not needed.
-    raise_cast_failure : bool
-        If True, and if a dtype is specified, raise errors during casting.
-        Otherwise an object array is returned.
 
     Returns
     -------
@@ -823,35 +799,15 @@ def _try_cast(
     elif dtype.kind in ["m", "M"]:
         return maybe_cast_to_datetime(arr, dtype)
 
-    try:
-        # GH#15832: Check if we are requesting a numeric dtype and
-        # that we can convert the data to the requested dtype.
-        if is_integer_dtype(dtype):
-            # this will raise if we have e.g. floats
+    # GH#15832: Check if we are requesting a numeric dtype and
+    # that we can convert the data to the requested dtype.
+    elif is_integer_dtype(dtype):
+        # this will raise if we have e.g. floats
+
+        subarr = maybe_cast_to_integer_array(arr, dtype)
+    else:
+        subarr = np.array(arr, dtype=dtype, copy=copy)
 
-            subarr = maybe_cast_to_integer_array(arr, dtype)
-        else:
-            # 4 tests fail if we move this to a try/except/else; see
-            #  test_constructor_compound_dtypes, test_constructor_cast_failure
-            #  test_constructor_dict_cast2, test_loc_setitem_dtype
-            subarr = np.array(arr, dtype=dtype, copy=copy)
-
-    except (ValueError, TypeError):
-        if raise_cast_failure:
-            raise
-        else:
-            # we only get here with raise_cast_failure False, which means
-            #  called via the DataFrame constructor
-            # GH#24435
-            warnings.warn(
-                f"Could not cast to {dtype}, falling back to object. This "
-                "behavior is deprecated. In a future version, when a dtype is "
-                "passed to 'DataFrame', either all columns will be cast to that "
-                "dtype, or a TypeError will be raised.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-            subarr = np.array(arr, dtype=object, copy=copy)
     return subarr
 
 
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -331,14 +331,11 @@ def ndarray_to_mgr(
 
     if dtype is not None and not is_dtype_equal(values.dtype, dtype):
         # GH#40110 see similar check inside sanitize_array
-        rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")
-
         values = sanitize_array(
             values,
             None,
             dtype=dtype,
             copy=copy_on_sanitize,
-            raise_cast_failure=rcf,
             allow_2d=True,
         )
 
@@ -615,9 +612,7 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
                     val = dict(val)
                 val = lib.fast_multiget(val, oindex._values, default=np.nan)
 
-            val = sanitize_array(
-                val, index, dtype=dtype, copy=False, raise_cast_failure=False
-            )
+            val = sanitize_array(val, index, dtype=dtype, copy=False)
             com.require_length_match(val, index)
 
         homogenized.append(val)
diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py
@@ -259,11 +259,10 @@ def f(dtype):
         with pytest.raises(NotImplementedError, match=msg):
             f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")])
 
-        # these work (though results may be unexpected)
-        depr_msg = "either all columns will be cast to that dtype, or a TypeError will"
-        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
+        # pre-2.0 these used to work (though results may be unexpected)
+        with pytest.raises(TypeError, match="argument must be"):
             f("int64")
-        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
+        with pytest.raises(TypeError, match="argument must be"):
             f("float64")
 
         # 10822
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -245,10 +245,11 @@ def test_constructor_mixed(self, float_string_frame):
         assert float_string_frame["foo"].dtype == np.object_
 
     def test_constructor_cast_failure(self):
-        msg = "either all columns will be cast to that dtype, or a TypeError will"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            foo = DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64)
-        assert foo["a"].dtype == object
+        # as of 2.0, we raise if we can't respect "dtype", previously we
+        #  silently ignored
+        msg = "could not convert string to float"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64)
 
         # GH 3010, constructing with odd arrays
         df = DataFrame(np.ones((4, 2)))
@@ -753,13 +754,8 @@ def test_constructor_dict_cast2(self):
             "A": dict(zip(range(20), tm.makeStringIndex(20))),
             "B": dict(zip(range(15), np.random.randn(15))),
         }
-        msg = "either all columns will be cast to that dtype, or a TypeError will"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            frame = DataFrame(test_data, dtype=float)
-
-        assert len(frame) == 20
-        assert frame["A"].dtype == np.object_
-        assert frame["B"].dtype == np.float64
+        with pytest.raises(ValueError, match="could not convert string"):
+            DataFrame(test_data, dtype=float)
 
     def test_constructor_dict_dont_upcast(self):
         d = {"Col1": {"Row1": "A String", "Row2": np.nan}}
@@ -2788,13 +2784,14 @@ def test_floating_values_integer_dtype(self):
 
         arr = np.random.randn(10, 5)
 
-        msg = "if they cannot be cast losslessly"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            DataFrame(arr, dtype="i8")
+        # as of 2.0, we match Series behavior by retaining float dtype instead
+        #  of doing a lossy conversion here. Below we _do_ do the conversion
+        #  since it is lossless.
+        df = DataFrame(arr, dtype="i8")
+        assert (df.dtypes == "f8").all()
 
-        with tm.assert_produces_warning(None):
-            # if they can be cast losslessly, no warning
-            DataFrame(arr.round(), dtype="i8")
+        df = DataFrame(arr.round(), dtype="i8")
+        assert (df.dtypes == "i8").all()
 
         # with NaNs, we go through a different path with a different warning
         arr[0, 0] = np.nan
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -782,25 +782,16 @@ def test_constructor_floating_data_int_dtype(self, frame_or_series):
         # GH#40110
         arr = np.random.randn(2)
 
-        if frame_or_series is Series:
-            # Long-standing behavior has been to ignore the dtype on these;
-            #  not clear if this is what we want long-term
-            expected = frame_or_series(arr)
-
-            res = frame_or_series(arr, dtype="i8")
-            tm.assert_equal(res, expected)
+        # Long-standing behavior (for Series, new in 2.0 for DataFrame)
+        #  has been to ignore the dtype on these;
+        #  not clear if this is what we want long-term
+        expected = frame_or_series(arr)
 
-            res = frame_or_series(list(arr), dtype="i8")
-            tm.assert_equal(res, expected)
+        res = frame_or_series(arr, dtype="i8")
+        tm.assert_equal(res, expected)
 
-        else:
-            msg = "passing float-dtype values and an integer dtype"
-            with tm.assert_produces_warning(FutureWarning, match=msg):
-                # DataFrame will behave like Series
-                frame_or_series(arr, dtype="i8")
-            with tm.assert_produces_warning(FutureWarning, match=msg):
-                # DataFrame will behave like Series
-                frame_or_series(list(arr), dtype="i8")
+        res = frame_or_series(list(arr), dtype="i8")
+        tm.assert_equal(res, expected)
 
         # When we have NaNs, we silently ignore the integer dtype
         arr[0] = np.nan