Skip to content

BUG: to_numeric converting StringArray to object or float64 #52174

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2325,10 +2325,14 @@ def maybe_convert_numeric(
if not seen.coerce_numeric:
raise type(err)(f"{err} at position {i}")

seen.saw_null()
floats[i] = NaN
mask[i] = 1

if allow_null_in_int:
seen.null_ = True
else:
seen.saw_null()
floats[i] = NaN

if seen.check_uint64_conflict():
return (values, None)

Expand Down
17 changes: 13 additions & 4 deletions pandas/core/tools/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
is_integer_dtype,
is_number,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
needs_i8_conversion,
)
from pandas.core.dtypes.generic import (
Expand All @@ -30,6 +30,7 @@

from pandas.core.arrays import BaseMaskedArray
from pandas.core.arrays.arrow import ArrowDtype
from pandas.core.arrays.string_ import StringDtype

if TYPE_CHECKING:
from pandas._typing import (
Expand Down Expand Up @@ -196,6 +197,8 @@ def to_numeric(
else:
values = arg

orig_values = values

# GH33013: for IntegerArray & FloatingArray extract non-null values for casting
# save mask to reconstruct the full array after casting
mask: npt.NDArray[np.bool_] | None = None
Expand All @@ -220,17 +223,23 @@ def to_numeric(
values,
set(),
coerce_numeric=coerce_numeric,
convert_to_masked_nullable=dtype_backend is not lib.no_default,
convert_to_masked_nullable=dtype_backend is not lib.no_default
or isinstance(values_dtype, StringDtype),
)
except (ValueError, TypeError):
if errors == "raise":
raise
values = orig_values

if new_mask is not None:
# Remove unnecessary values, is expected later anyway and enables
# downcasting
values = values[~new_mask]
elif dtype_backend is not lib.no_default and new_mask is None:
elif (
dtype_backend is not lib.no_default
and new_mask is None
or isinstance(values_dtype, StringDtype)
):
new_mask = np.zeros(values.shape, dtype=np.bool_)

# attempt downcast only if the data has been successfully converted
Expand Down Expand Up @@ -265,7 +274,7 @@ def to_numeric(

# GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
# masked array
if (mask is not None or new_mask is not None) and not is_object_dtype(values.dtype):
if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype):
if mask is None:
mask = new_mask
else:
Expand Down
30 changes: 24 additions & 6 deletions pandas/tests/tools/test_to_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,12 +723,12 @@ def test_precision_float_conversion(strrep):
@pytest.mark.parametrize(
"values, expected",
[
(["1", "2", None], Series([1, 2, np.nan])),
(["1", "2", "3"], Series([1, 2, 3])),
(["1", "2", 3], Series([1, 2, 3])),
(["1", "2", 3.5], Series([1, 2, 3.5])),
(["1", None, 3.5], Series([1, np.nan, 3.5])),
(["1", "2", "3.5"], Series([1, 2, 3.5])),
(["1", "2", None], Series([1, 2, np.nan], dtype="Int64")),
(["1", "2", "3"], Series([1, 2, 3], dtype="Int64")),
(["1", "2", 3], Series([1, 2, 3], dtype="Int64")),
(["1", "2", 3.5], Series([1, 2, 3.5], dtype="Float64")),
(["1", None, 3.5], Series([1, np.nan, 3.5], dtype="Float64")),
(["1", "2", "3.5"], Series([1, 2, 3.5], dtype="Float64")),
],
)
def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected):
Expand All @@ -738,6 +738,24 @@ def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected
tm.assert_series_equal(result, expected)


def test_to_numeric_from_nullable_string_coerce(nullable_string_dtype):
    """Check ``errors="coerce"`` on a nullable-string Series.

    The unparseable entry ``"a"`` is expected to come back as ``pd.NA`` and
    the result is expected to use the masked ``Int64`` dtype.
    """
    # GH#52146
    raw = Series(["a", "1"], dtype=nullable_string_dtype)
    converted = to_numeric(raw, errors="coerce")
    tm.assert_series_equal(converted, Series([pd.NA, 1], dtype="Int64"))


def test_to_numeric_from_nullable_string_ignore(nullable_string_dtype):
    """Check ``errors="ignore"`` on a nullable-string Series.

    When one entry cannot be parsed, the result is expected to equal an
    untouched copy of the input Series.
    """
    # GH#52146
    original = Series(["a", "1"], dtype=nullable_string_dtype)
    # Snapshot the input before conversion so the comparison target cannot be
    # affected by the call under test.
    snapshot = original.copy()
    tm.assert_series_equal(to_numeric(original, errors="ignore"), snapshot)


@pytest.mark.parametrize(
"data, input_dtype, downcast, expected_dtype",
(
Expand Down