ENH: include conversion to nullable float in convert_dtypes() (#38117)

jorisvandenbossche · web-flow · commit 4a35f2d6ecd1 · 2020-11-29T14:12:54.000-05:00
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -1196,6 +1196,7 @@ def convert_dtypes(
     convert_string: bool = True,
     convert_integer: bool = True,
     convert_boolean: bool = True,
+    convert_floating: bool = True,
 ) -> Dtype:
     """
     Convert objects to best possible type, and optionally,
@@ -1210,14 +1211,20 @@ def convert_dtypes(
         Whether, if possible, conversion can be done to integer extension types.
     convert_boolean : bool, defaults True
         Whether object dtypes should be converted to ``BooleanDtypes()``.
+    convert_floating : bool, defaults True
+        Whether, if possible, conversion can be done to floating extension types.
+        If `convert_integer` is also True, preference will be given to integer
+        dtypes if the floats can be faithfully cast to integers.
 
     Returns
     -------
     dtype
         new dtype
     """
     is_extension = is_extension_array_dtype(input_array.dtype)
-    if (convert_string or convert_integer or convert_boolean) and not is_extension:
+    if (
+        convert_string or convert_integer or convert_boolean or convert_floating
+    ) and not is_extension:
         try:
             inferred_dtype = lib.infer_dtype(input_array)
         except ValueError:
@@ -1245,6 +1252,29 @@ def convert_dtypes(
             if is_integer_dtype(inferred_dtype):
                 inferred_dtype = input_array.dtype
 
+        if convert_floating:
+            if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
+                input_array.dtype
+            ):
+                from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE
+
+                inferred_float_dtype = FLOAT_STR_TO_DTYPE.get(
+                    input_array.dtype.name, "Float64"
+                )
+                # if we could also convert to integer, check if all floats
+                # are actually integers
+                if convert_integer:
+                    arr = input_array[notna(input_array)]
+                    if (arr.astype(int) == arr).all():
+                        inferred_dtype = "Int64"
+                    else:
+                        inferred_dtype = inferred_float_dtype
+                else:
+                    inferred_dtype = inferred_float_dtype
+        else:
+            if is_float_dtype(inferred_dtype):
+                inferred_dtype = input_array.dtype
+
         if convert_boolean:
             if is_bool_dtype(input_array.dtype):
                 inferred_dtype = "boolean"
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -6088,6 +6088,7 @@ def convert_dtypes(
         convert_string: bool_t = True,
         convert_integer: bool_t = True,
         convert_boolean: bool_t = True,
+        convert_floating: bool_t = True,
     ) -> FrameOrSeries:
         """
         Convert columns to best possible dtypes using dtypes supporting ``pd.NA``.
@@ -6104,6 +6105,12 @@ def convert_dtypes(
             Whether, if possible, conversion can be done to integer extension types.
         convert_boolean : bool, defaults True
             Whether object dtypes should be converted to ``BooleanDtypes()``.
+        convert_floating : bool, defaults True
+            Whether, if possible, conversion can be done to floating extension types.
+            If `convert_integer` is also True, preference will be given to integer
+            dtypes if the floats can be faithfully cast to integers.
+
+            .. versionadded:: 1.2.0
 
         Returns
         -------
@@ -6121,19 +6128,25 @@ def convert_dtypes(
         -----
         By default, ``convert_dtypes`` will attempt to convert a Series (or each
         Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
-        ``convert_string``, ``convert_integer``, and ``convert_boolean``, it is
-        possible to turn off individual conversions to ``StringDtype``, the integer
-        extension types or ``BooleanDtype``, respectively.
+        ``convert_string``, ``convert_integer``, ``convert_boolean`` and
+        ``convert_floating``, it is possible to turn off individual conversions
+        to ``StringDtype``, the integer extension types, ``BooleanDtype``
+        or floating extension types, respectively.
 
         For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
         rules as during normal Series/DataFrame construction.  Then, if possible,
-        convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension
-        type, otherwise leave as ``object``.
+        convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer
+        or floating extension type, otherwise leave as ``object``.
 
         If the dtype is integer, convert to an appropriate integer extension type.
 
         If the dtype is numeric, and consists of all integers, convert to an
-        appropriate integer extension type.
+        appropriate integer extension type. Otherwise, convert to an
+        appropriate floating extension type.
+
+        .. versionchanged:: 1.2
+            Starting with pandas 1.2, this method also converts float columns
+            to the nullable floating extension type.
 
         In the future, as new dtypes are added that support ``pd.NA``, the results
         of this method will change to support those new dtypes.
@@ -6173,7 +6186,7 @@ def convert_dtypes(
         >>> dfn = df.convert_dtypes()
         >>> dfn
            a  b      c     d     e      f
-        0  1  x   True     h    10    NaN
+        0  1  x   True     h    10   <NA>
         1  2  y  False     i  <NA>  100.5
         2  3  z   <NA>  <NA>    20  200.0
 
@@ -6183,7 +6196,7 @@ def convert_dtypes(
         c    boolean
         d     string
         e      Int64
-        f    float64
+        f    Float64
         dtype: object
 
         Start with a Series of strings and missing data represented by ``np.nan``.
@@ -6205,12 +6218,20 @@ def convert_dtypes(
         """
         if self.ndim == 1:
             return self._convert_dtypes(
-                infer_objects, convert_string, convert_integer, convert_boolean
+                infer_objects,
+                convert_string,
+                convert_integer,
+                convert_boolean,
+                convert_floating,
             )
         else:
             results = [
                 col._convert_dtypes(
-                    infer_objects, convert_string, convert_integer, convert_boolean
+                    infer_objects,
+                    convert_string,
+                    convert_integer,
+                    convert_boolean,
+                    convert_floating,
                 )
                 for col_name, col in self.items()
             ]
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -4706,16 +4706,21 @@ def _convert_dtypes(
         convert_string: bool = True,
         convert_integer: bool = True,
         convert_boolean: bool = True,
+        convert_floating: bool = True,
     ) -> "Series":
         input_series = self
         if infer_objects:
             input_series = input_series.infer_objects()
             if is_object_dtype(input_series):
                 input_series = input_series.copy()
 
-        if convert_string or convert_integer or convert_boolean:
+        if convert_string or convert_integer or convert_boolean or convert_floating:
             inferred_dtype = convert_dtypes(
-                input_series._values, convert_string, convert_integer, convert_boolean
+                input_series._values,
+                convert_string,
+                convert_integer,
+                convert_boolean,
+                convert_floating,
             )
             try:
                 result = input_series.astype(inferred_dtype)
diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py
@@ -58,9 +58,17 @@
         [10, np.nan, 20],
         np.dtype("float"),
         "Int64",
-        {("convert_integer", False): np.dtype("float")},
+        {
+            ("convert_integer", False, "convert_floating", True): "Float64",
+            ("convert_integer", False, "convert_floating", False): np.dtype("float"),
+        },
+    ),
+    (
+        [np.nan, 100.5, 200],
+        np.dtype("float"),
+        "Float64",
+        {("convert_floating", False): np.dtype("float")},
     ),
-    ([np.nan, 100.5, 200], np.dtype("float"), np.dtype("float"), {}),
     (
         [3, 4, 5],
         "Int8",
@@ -85,20 +93,30 @@
         "Int8",
         {("convert_integer", False): np.dtype("i1")},
     ),
+    (
+        [1.2, 1.3],
+        np.dtype("float32"),
+        "Float32",
+        {("convert_floating", False): np.dtype("float32")},
+    ),
     (
         [1, 2.0],
         object,
         "Int64",
         {
-            ("convert_integer", False): np.dtype("float"),
+            ("convert_integer", False): "Float64",
+            ("convert_integer", False, "convert_floating", False): np.dtype("float"),
             ("infer_objects", False): np.dtype("object"),
         },
     ),
     (
         [1, 2.5],
         object,
-        np.dtype("float"),
-        {("infer_objects", False): np.dtype("object")},
+        "Float64",
+        {
+            ("convert_floating", False): np.dtype("float"),
+            ("infer_objects", False): np.dtype("object"),
+        },
     ),
     (["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}),
     (
@@ -134,7 +152,7 @@ class TestSeriesConvertDtypes:
         "data, maindtype, expected_default, expected_other",
         test_cases,
     )
-    @pytest.mark.parametrize("params", product(*[(True, False)] * 4))
+    @pytest.mark.parametrize("params", product(*[(True, False)] * 5))
     def test_convert_dtypes(
         self, data, maindtype, params, expected_default, expected_other
     ):
@@ -150,12 +168,13 @@ def test_convert_dtypes(
             "convert_string",
             "convert_integer",
             "convert_boolean",
+            "convert_floating",
         ]
         params_dict = dict(zip(param_names, params))
 
         expected_dtype = expected_default
-        for (key, val), dtype in expected_other.items():
-            if params_dict[key] is val:
+        for spec, dtype in expected_other.items():
+            if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])):
                 expected_dtype = dtype
 
         expected = pd.Series(data, dtype=expected_dtype)