Skip to content

Commit e785998

Browse files
authored
BUG: to_numeric converting StringArray to object or float64 (#52174)
* BUG: to_numeric converting StringArray to object or float64
* BUG: to_numeric converting StringArray to object or float64
* Update comment
1 parent ab76365 commit e785998

File tree

3 files changed

+43
-12
lines changed

3 files changed

+43
-12
lines changed

pandas/_libs/lib.pyx

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2326,10 +2326,14 @@ def maybe_convert_numeric(
23262326
if not seen.coerce_numeric:
23272327
raise type(err)(f"{err} at position {i}")
23282328

2329-
seen.saw_null()
2330-
floats[i] = NaN
23312329
mask[i] = 1
23322330

2331+
if allow_null_in_int:
2332+
seen.null_ = True
2333+
else:
2334+
seen.saw_null()
2335+
floats[i] = NaN
2336+
23332337
if seen.check_uint64_conflict():
23342338
return (values, None)
23352339

pandas/core/tools/numeric.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
is_integer_dtype,
2020
is_number,
2121
is_numeric_dtype,
22-
is_object_dtype,
2322
is_scalar,
23+
is_string_dtype,
2424
needs_i8_conversion,
2525
)
2626
from pandas.core.dtypes.generic import (
@@ -30,6 +30,7 @@
3030

3131
from pandas.core.arrays import BaseMaskedArray
3232
from pandas.core.arrays.arrow import ArrowDtype
33+
from pandas.core.arrays.string_ import StringDtype
3334

3435
if TYPE_CHECKING:
3536
from pandas._typing import (
@@ -196,6 +197,8 @@ def to_numeric(
196197
else:
197198
values = arg
198199

200+
orig_values = values
201+
199202
# GH33013: for IntegerArray & FloatingArray extract non-null values for casting
200203
# save mask to reconstruct the full array after casting
201204
mask: npt.NDArray[np.bool_] | None = None
@@ -220,17 +223,23 @@ def to_numeric(
220223
values,
221224
set(),
222225
coerce_numeric=coerce_numeric,
223-
convert_to_masked_nullable=dtype_backend is not lib.no_default,
226+
convert_to_masked_nullable=dtype_backend is not lib.no_default
227+
or isinstance(values_dtype, StringDtype),
224228
)
225229
except (ValueError, TypeError):
226230
if errors == "raise":
227231
raise
232+
values = orig_values
228233

229234
if new_mask is not None:
230235
# Remove unnecessary values, is expected later anyway and enables
231236
# downcasting
232237
values = values[~new_mask]
233-
elif dtype_backend is not lib.no_default and new_mask is None:
238+
elif (
239+
dtype_backend is not lib.no_default
240+
and new_mask is None
241+
or isinstance(values_dtype, StringDtype)
242+
):
234243
new_mask = np.zeros(values.shape, dtype=np.bool_)
235244

236245
# attempt downcast only if the data has been successfully converted
@@ -265,7 +274,7 @@ def to_numeric(
265274

266275
# GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
267276
# masked array
268-
if (mask is not None or new_mask is not None) and not is_object_dtype(values.dtype):
277+
if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype):
269278
if mask is None:
270279
mask = new_mask
271280
else:

pandas/tests/tools/test_to_numeric.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -723,12 +723,12 @@ def test_precision_float_conversion(strrep):
723723
@pytest.mark.parametrize(
724724
"values, expected",
725725
[
726-
(["1", "2", None], Series([1, 2, np.nan])),
727-
(["1", "2", "3"], Series([1, 2, 3])),
728-
(["1", "2", 3], Series([1, 2, 3])),
729-
(["1", "2", 3.5], Series([1, 2, 3.5])),
730-
(["1", None, 3.5], Series([1, np.nan, 3.5])),
731-
(["1", "2", "3.5"], Series([1, 2, 3.5])),
726+
(["1", "2", None], Series([1, 2, np.nan], dtype="Int64")),
727+
(["1", "2", "3"], Series([1, 2, 3], dtype="Int64")),
728+
(["1", "2", 3], Series([1, 2, 3], dtype="Int64")),
729+
(["1", "2", 3.5], Series([1, 2, 3.5], dtype="Float64")),
730+
(["1", None, 3.5], Series([1, np.nan, 3.5], dtype="Float64")),
731+
(["1", "2", "3.5"], Series([1, 2, 3.5], dtype="Float64")),
732732
],
733733
)
734734
def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected):
@@ -738,6 +738,24 @@ def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected
738738
tm.assert_series_equal(result, expected)
739739

740740

741+
def test_to_numeric_from_nullable_string_coerce(nullable_string_dtype):
742+
# GH#52146
743+
values = ["a", "1"]
744+
ser = Series(values, dtype=nullable_string_dtype)
745+
result = to_numeric(ser, errors="coerce")
746+
expected = Series([pd.NA, 1], dtype="Int64")
747+
tm.assert_series_equal(result, expected)
748+
749+
750+
def test_to_numeric_from_nullable_string_ignore(nullable_string_dtype):
751+
# GH#52146
752+
values = ["a", "1"]
753+
ser = Series(values, dtype=nullable_string_dtype)
754+
expected = ser.copy()
755+
result = to_numeric(ser, errors="ignore")
756+
tm.assert_series_equal(result, expected)
757+
758+
741759
@pytest.mark.parametrize(
742760
"data, input_dtype, downcast, expected_dtype",
743761
(

0 commit comments

Comments
 (0)