Skip to content

Backport PR #52174 on branch 2.0.x (BUG: to_numeric converting StringArray to object or float64) #52193

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2321,10 +2321,14 @@ def maybe_convert_numeric(
if not seen.coerce_numeric:
raise type(err)(f"{err} at position {i}")

seen.saw_null()
floats[i] = NaN
mask[i] = 1

if allow_null_in_int:
seen.null_ = True
else:
seen.saw_null()
floats[i] = NaN

if seen.check_uint64_conflict():
return (values, None)

Expand Down
17 changes: 13 additions & 4 deletions pandas/core/tools/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
is_integer_dtype,
is_number,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
needs_i8_conversion,
)
from pandas.core.dtypes.generic import (
Expand All @@ -32,6 +32,7 @@

import pandas as pd
from pandas.core.arrays import BaseMaskedArray
from pandas.core.arrays.string_ import StringDtype


def to_numeric(
Expand Down Expand Up @@ -191,6 +192,8 @@ def to_numeric(
else:
values = arg

orig_values = values

# GH33013: for IntegerArray & FloatingArray extract non-null values for casting
# save mask to reconstruct the full array after casting
mask: npt.NDArray[np.bool_] | None = None
Expand All @@ -215,17 +218,23 @@ def to_numeric(
values,
set(),
coerce_numeric=coerce_numeric,
convert_to_masked_nullable=dtype_backend is not lib.no_default,
convert_to_masked_nullable=dtype_backend is not lib.no_default
or isinstance(values_dtype, StringDtype),
)
except (ValueError, TypeError):
if errors == "raise":
raise
values = orig_values

if new_mask is not None:
# Remove the unnecessary (masked) values; this is expected later anyway and
# enables downcasting
values = values[~new_mask]
elif dtype_backend is not lib.no_default and new_mask is None:
elif (
dtype_backend is not lib.no_default
and new_mask is None
or isinstance(values_dtype, StringDtype)
):
new_mask = np.zeros(values.shape, dtype=np.bool_)

# attempt downcast only if the data has been successfully converted
Expand Down Expand Up @@ -260,7 +269,7 @@ def to_numeric(

# GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
# masked array
if (mask is not None or new_mask is not None) and not is_object_dtype(values.dtype):
if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype):
if mask is None:
mask = new_mask
else:
Expand Down
30 changes: 24 additions & 6 deletions pandas/tests/tools/test_to_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,12 +723,12 @@ def test_precision_float_conversion(strrep):
@pytest.mark.parametrize(
"values, expected",
[
(["1", "2", None], Series([1, 2, np.nan])),
(["1", "2", "3"], Series([1, 2, 3])),
(["1", "2", 3], Series([1, 2, 3])),
(["1", "2", 3.5], Series([1, 2, 3.5])),
(["1", None, 3.5], Series([1, np.nan, 3.5])),
(["1", "2", "3.5"], Series([1, 2, 3.5])),
(["1", "2", None], Series([1, 2, np.nan], dtype="Int64")),
(["1", "2", "3"], Series([1, 2, 3], dtype="Int64")),
(["1", "2", 3], Series([1, 2, 3], dtype="Int64")),
(["1", "2", 3.5], Series([1, 2, 3.5], dtype="Float64")),
(["1", None, 3.5], Series([1, np.nan, 3.5], dtype="Float64")),
(["1", "2", "3.5"], Series([1, 2, 3.5], dtype="Float64")),
],
)
def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected):
Expand All @@ -738,6 +738,24 @@ def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected
tm.assert_series_equal(result, expected)


def test_to_numeric_from_nullable_string_coerce(nullable_string_dtype):
    """Check ``errors="coerce"`` on a nullable-string Series.

    The unparseable entry ``"a"`` must become ``pd.NA`` and the result must
    come back with the masked ``Int64`` dtype.
    """
    # GH#52146
    strings = Series(["a", "1"], dtype=nullable_string_dtype)
    expected = Series([pd.NA, 1], dtype="Int64")

    converted = to_numeric(strings, errors="coerce")

    tm.assert_series_equal(converted, expected)


def test_to_numeric_from_nullable_string_ignore(nullable_string_dtype):
    """Check ``errors="ignore"`` on a nullable-string Series.

    When an entry cannot be parsed, the input must be handed back unchanged,
    so the result is compared against a copy taken before the call.
    """
    # GH#52146
    strings = Series(["a", "1"], dtype=nullable_string_dtype)
    # Snapshot the input first so the comparison is against pre-call state.
    untouched = strings.copy()

    converted = to_numeric(strings, errors="ignore")

    tm.assert_series_equal(converted, untouched)


@pytest.mark.parametrize(
"data, input_dtype, downcast, expected_dtype",
(
Expand Down