Skip to content

Commit 4a35f2d

Browse files
ENH: include conversion to nullable float in convert_dtypes() (#38117)
1 parent 47d0da6 commit 4a35f2d

File tree

4 files changed

+96
-21
lines changed

4 files changed

+96
-21
lines changed

pandas/core/dtypes/cast.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1196,6 +1196,7 @@ def convert_dtypes(
11961196
convert_string: bool = True,
11971197
convert_integer: bool = True,
11981198
convert_boolean: bool = True,
1199+
convert_floating: bool = True,
11991200
) -> Dtype:
12001201
"""
12011202
Convert objects to best possible type, and optionally,
@@ -1210,14 +1211,20 @@ def convert_dtypes(
12101211
Whether, if possible, conversion can be done to integer extension types.
12111212
convert_boolean : bool, defaults True
12121213
Whether object dtypes should be converted to ``BooleanDtype()``.
1214+
convert_floating : bool, defaults True
1215+
Whether, if possible, conversion can be done to floating extension types.
1216+
If `convert_integer` is also True, preference will be given to integer
1217+
dtypes if the floats can be faithfully cast to integers.
12131218
12141219
Returns
12151220
-------
12161221
dtype
12171222
new dtype
12181223
"""
12191224
is_extension = is_extension_array_dtype(input_array.dtype)
1220-
if (convert_string or convert_integer or convert_boolean) and not is_extension:
1225+
if (
1226+
convert_string or convert_integer or convert_boolean or convert_floating
1227+
) and not is_extension:
12211228
try:
12221229
inferred_dtype = lib.infer_dtype(input_array)
12231230
except ValueError:
@@ -1245,6 +1252,29 @@ def convert_dtypes(
12451252
if is_integer_dtype(inferred_dtype):
12461253
inferred_dtype = input_array.dtype
12471254

1255+
if convert_floating:
1256+
if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
1257+
input_array.dtype
1258+
):
1259+
from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE
1260+
1261+
inferred_float_dtype = FLOAT_STR_TO_DTYPE.get(
1262+
input_array.dtype.name, "Float64"
1263+
)
1264+
# if we could also convert to integer, check if all floats
1265+
# are actually integers
1266+
if convert_integer:
1267+
arr = input_array[notna(input_array)]
1268+
if (arr.astype(int) == arr).all():
1269+
inferred_dtype = "Int64"
1270+
else:
1271+
inferred_dtype = inferred_float_dtype
1272+
else:
1273+
inferred_dtype = inferred_float_dtype
1274+
else:
1275+
if is_float_dtype(inferred_dtype):
1276+
inferred_dtype = input_array.dtype
1277+
12481278
if convert_boolean:
12491279
if is_bool_dtype(input_array.dtype):
12501280
inferred_dtype = "boolean"

pandas/core/generic.py

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6088,6 +6088,7 @@ def convert_dtypes(
60886088
convert_string: bool_t = True,
60896089
convert_integer: bool_t = True,
60906090
convert_boolean: bool_t = True,
6091+
convert_floating: bool_t = True,
60916092
) -> FrameOrSeries:
60926093
"""
60936094
Convert columns to best possible dtypes using dtypes supporting ``pd.NA``.
@@ -6104,6 +6105,12 @@ def convert_dtypes(
61046105
Whether, if possible, conversion can be done to integer extension types.
61056106
convert_boolean : bool, defaults True
61066107
Whether object dtypes should be converted to ``BooleanDtype()``.
6108+
convert_floating : bool, defaults True
6109+
Whether, if possible, conversion can be done to floating extension types.
6110+
If `convert_integer` is also True, preference will be given to integer
6111+
dtypes if the floats can be faithfully cast to integers.
6112+
6113+
.. versionadded:: 1.2.0
61076114
61086115
Returns
61096116
-------
@@ -6121,19 +6128,25 @@ def convert_dtypes(
61216128
-----
61226129
By default, ``convert_dtypes`` will attempt to convert a Series (or each
61236130
Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
6124-
``convert_string``, ``convert_integer``, and ``convert_boolean``, it is
6125-
possible to turn off individual conversions to ``StringDtype``, the integer
6126-
extension types or ``BooleanDtype``, respectively.
6131+
``convert_string``, ``convert_integer``, ``convert_boolean`` and
6132+
``convert_floating``, it is possible to turn off individual conversions
6133+
to ``StringDtype``, the integer extension types, ``BooleanDtype``
6134+
or floating extension types, respectively.
61276135
61286136
For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
61296137
rules as during normal Series/DataFrame construction. Then, if possible,
6130-
convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension
6131-
type, otherwise leave as ``object``.
6138+
convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer
6139+
or floating extension type, otherwise leave as ``object``.
61326140
61336141
If the dtype is integer, convert to an appropriate integer extension type.
61346142
61356143
If the dtype is numeric, and consists of all integers, convert to an
6136-
appropriate integer extension type.
6144+
appropriate integer extension type. Otherwise, convert to an
6145+
appropriate floating extension type.
6146+
6147+
.. versionchanged:: 1.2
6148+
Starting with pandas 1.2, this method also converts float columns
6149+
to the nullable floating extension type.
61376150
61386151
In the future, as new dtypes are added that support ``pd.NA``, the results
61396152
of this method will change to support those new dtypes.
@@ -6173,7 +6186,7 @@ def convert_dtypes(
61736186
>>> dfn = df.convert_dtypes()
61746187
>>> dfn
61756188
a b c d e f
6176-
0 1 x True h 10 NaN
6189+
0 1 x True h 10 <NA>
61776190
1 2 y False i <NA> 100.5
61786191
2 3 z <NA> <NA> 20 200.0
61796192
@@ -6183,7 +6196,7 @@ def convert_dtypes(
61836196
c boolean
61846197
d string
61856198
e Int64
6186-
f float64
6199+
f Float64
61876200
dtype: object
61886201
61896202
Start with a Series of strings and missing data represented by ``np.nan``.
@@ -6205,12 +6218,20 @@ def convert_dtypes(
62056218
"""
62066219
if self.ndim == 1:
62076220
return self._convert_dtypes(
6208-
infer_objects, convert_string, convert_integer, convert_boolean
6221+
infer_objects,
6222+
convert_string,
6223+
convert_integer,
6224+
convert_boolean,
6225+
convert_floating,
62096226
)
62106227
else:
62116228
results = [
62126229
col._convert_dtypes(
6213-
infer_objects, convert_string, convert_integer, convert_boolean
6230+
infer_objects,
6231+
convert_string,
6232+
convert_integer,
6233+
convert_boolean,
6234+
convert_floating,
62146235
)
62156236
for col_name, col in self.items()
62166237
]

pandas/core/series.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4706,16 +4706,21 @@ def _convert_dtypes(
47064706
convert_string: bool = True,
47074707
convert_integer: bool = True,
47084708
convert_boolean: bool = True,
4709+
convert_floating: bool = True,
47094710
) -> "Series":
47104711
input_series = self
47114712
if infer_objects:
47124713
input_series = input_series.infer_objects()
47134714
if is_object_dtype(input_series):
47144715
input_series = input_series.copy()
47154716

4716-
if convert_string or convert_integer or convert_boolean:
4717+
if convert_string or convert_integer or convert_boolean or convert_floating:
47174718
inferred_dtype = convert_dtypes(
4718-
input_series._values, convert_string, convert_integer, convert_boolean
4719+
input_series._values,
4720+
convert_string,
4721+
convert_integer,
4722+
convert_boolean,
4723+
convert_floating,
47194724
)
47204725
try:
47214726
result = input_series.astype(inferred_dtype)

pandas/tests/series/methods/test_convert_dtypes.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,17 @@
5858
[10, np.nan, 20],
5959
np.dtype("float"),
6060
"Int64",
61-
{("convert_integer", False): np.dtype("float")},
61+
{
62+
("convert_integer", False, "convert_floating", True): "Float64",
63+
("convert_integer", False, "convert_floating", False): np.dtype("float"),
64+
},
65+
),
66+
(
67+
[np.nan, 100.5, 200],
68+
np.dtype("float"),
69+
"Float64",
70+
{("convert_floating", False): np.dtype("float")},
6271
),
63-
([np.nan, 100.5, 200], np.dtype("float"), np.dtype("float"), {}),
6472
(
6573
[3, 4, 5],
6674
"Int8",
@@ -85,20 +93,30 @@
8593
"Int8",
8694
{("convert_integer", False): np.dtype("i1")},
8795
),
96+
(
97+
[1.2, 1.3],
98+
np.dtype("float32"),
99+
"Float32",
100+
{("convert_floating", False): np.dtype("float32")},
101+
),
88102
(
89103
[1, 2.0],
90104
object,
91105
"Int64",
92106
{
93-
("convert_integer", False): np.dtype("float"),
107+
("convert_integer", False): "Float64",
108+
("convert_integer", False, "convert_floating", False): np.dtype("float"),
94109
("infer_objects", False): np.dtype("object"),
95110
},
96111
),
97112
(
98113
[1, 2.5],
99114
object,
100-
np.dtype("float"),
101-
{("infer_objects", False): np.dtype("object")},
115+
"Float64",
116+
{
117+
("convert_floating", False): np.dtype("float"),
118+
("infer_objects", False): np.dtype("object"),
119+
},
102120
),
103121
(["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}),
104122
(
@@ -134,7 +152,7 @@ class TestSeriesConvertDtypes:
134152
"data, maindtype, expected_default, expected_other",
135153
test_cases,
136154
)
137-
@pytest.mark.parametrize("params", product(*[(True, False)] * 4))
155+
@pytest.mark.parametrize("params", product(*[(True, False)] * 5))
138156
def test_convert_dtypes(
139157
self, data, maindtype, params, expected_default, expected_other
140158
):
@@ -150,12 +168,13 @@ def test_convert_dtypes(
150168
"convert_string",
151169
"convert_integer",
152170
"convert_boolean",
171+
"convert_floating",
153172
]
154173
params_dict = dict(zip(param_names, params))
155174

156175
expected_dtype = expected_default
157-
for (key, val), dtype in expected_other.items():
158-
if params_dict[key] is val:
176+
for spec, dtype in expected_other.items():
177+
if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])):
159178
expected_dtype = dtype
160179

161180
expected = pd.Series(data, dtype=expected_dtype)

0 commit comments

Comments
 (0)