Skip to content

Commit 94a8b58

Browse files
committed
String.astype() uses _from_sequence_of_strings
1 parent: 3777066 · commit: 94a8b58

File tree

6 files changed: +50 / -29 lines changed

6 files changed: +50 / -29 lines changed

pandas/core/arrays/categorical.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -478,6 +478,12 @@ def _constructor(self) -> Type[Categorical]:
478478
def _from_sequence(cls, scalars, *, dtype: Optional[Dtype] = None, copy=False):
479479
return Categorical(scalars, dtype=dtype, copy=copy)
480480

481+
@classmethod
482+
def _from_sequence_of_strings(
483+
cls, strings, *, dtype: Optional[Dtype] = None, copy=False
484+
):
485+
return cls._from_sequence(scalars=strings, dtype=dtype, copy=copy)
486+
481487
def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike:
482488
"""
483489
Coerce this type to another dtype

pandas/core/arrays/datetimes.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
to_offset,
3939
tzconversion,
4040
)
41+
from pandas._typing import Dtype
4142
from pandas.errors import PerformanceWarning
4243

4344
from pandas.core.dtypes.cast import astype_dt64_to_dt64tz
@@ -65,6 +66,7 @@
6566
from pandas.core.dtypes.missing import isna
6667

6768
from pandas.core.algorithms import checked_add_with_arr
69+
from pandas.core.api import NA
6870
from pandas.core.arrays import (
6971
ExtensionArray,
7072
datetimelike as dtl,
@@ -334,6 +336,13 @@ def _simple_new(
334336
def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
335337
return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy)
336338

339+
@classmethod
340+
def _from_sequence_of_strings(
341+
cls, strings, *, dtype: Optional[Dtype] = None, copy=False
342+
):
343+
scalars = [NaT if s is NA else s for s in strings]
344+
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
345+
337346
@classmethod
338347
def _from_sequence_not_strict(
339348
cls,

pandas/core/arrays/period.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@
7373
)
7474

7575
import pandas.core.algorithms as algos
76+
from pandas.core.api import NA
7677
from pandas.core.arrays import datetimelike as dtl
7778
import pandas.core.common as com
7879

@@ -252,7 +253,8 @@ def _from_sequence(
252253
def _from_sequence_of_strings(
253254
cls, strings, *, dtype: Optional[Dtype] = None, copy=False
254255
) -> PeriodArray:
255-
return cls._from_sequence(strings, dtype=dtype, copy=copy)
256+
scalars = [NaT if s is NA else s for s in strings]
257+
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
256258

257259
@classmethod
258260
def _from_datetime64(cls, data, freq, tz=None) -> PeriodArray:

pandas/core/arrays/string_.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,23 +26,29 @@
2626
from pandas.core.dtypes.common import (
2727
is_array_like,
2828
is_bool_dtype,
29+
is_datetime64_any_dtype,
2930
is_dtype_equal,
3031
is_extension_array_dtype,
3132
is_integer_dtype,
3233
is_object_dtype,
3334
is_string_dtype,
35+
is_timedelta64_dtype,
3436
pandas_dtype,
3537
)
3638

3739
from pandas.core import ops
3840
from pandas.core.array_algos import masked_reductions
39-
from pandas.core.arrays import (
41+
from pandas.core.arrays import PandasArray
42+
from pandas.core.arrays.datetimes import DatetimeArray
43+
from pandas.core.arrays.floating import (
4044
FloatingArray,
45+
FloatingDtype,
46+
)
47+
from pandas.core.arrays.integer import (
4148
IntegerArray,
42-
PandasArray,
49+
_IntegerDtype,
4350
)
44-
from pandas.core.arrays.floating import FloatingDtype
45-
from pandas.core.arrays.integer import _IntegerDtype
51+
from pandas.core.arrays.timedeltas import TimedeltaArray
4652
from pandas.core.construction import extract_array
4753
from pandas.core.indexers import check_array_indexer
4854
from pandas.core.missing import isna
@@ -329,7 +335,13 @@ def astype(self, dtype, copy=True):
329335
return FloatingArray(values, mask, copy=False)
330336
elif is_extension_array_dtype(dtype):
331337
cls = dtype.construct_array_type()
332-
return cls._from_sequence(self, dtype=dtype, copy=copy)
338+
return cls._from_sequence_of_strings(self, dtype=dtype, copy=copy)
339+
elif is_datetime64_any_dtype(dtype):
340+
return DatetimeArray._from_sequence_of_strings(self, dtype=dtype, copy=copy)
341+
elif is_timedelta64_dtype(dtype):
342+
return TimedeltaArray._from_sequence_of_strings(
343+
self, dtype=dtype, copy=copy
344+
)
333345
elif np.issubdtype(dtype, np.floating):
334346
arr = self._ndarray.copy()
335347
mask = self.isna()

pandas/core/arrays/timedeltas.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,10 @@
3535
ints_to_pytimedelta,
3636
parse_timedelta_unit,
3737
)
38-
from pandas._typing import NpDtype
38+
from pandas._typing import (
39+
Dtype,
40+
NpDtype,
41+
)
3942
from pandas.compat.numpy import function as nv
4043

4144
from pandas.core.dtypes.cast import astype_td64_unit_conversion
@@ -60,6 +63,7 @@
6063

6164
from pandas.core import nanops
6265
from pandas.core.algorithms import checked_add_with_arr
66+
from pandas.core.api import NA
6367
from pandas.core.arrays import (
6468
ExtensionArray,
6569
IntegerArray,
@@ -254,6 +258,13 @@ def _from_sequence(
254258

255259
return cls._simple_new(data, freq=freq)
256260

261+
@classmethod
262+
def _from_sequence_of_strings(
263+
cls, strings, *, dtype: Optional[Dtype] = None, copy=False
264+
):
265+
scalars = [NaT if s is NA else s for s in strings]
266+
return cls._from_sequence(scalars, dtype=dtype, copy=copy)
267+
257268
@classmethod
258269
def _from_sequence_not_strict(
259270
cls,

pandas/tests/series/methods/test_astype.py

Lines changed: 3 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,8 @@ class TestAstypeString:
354354
@pytest.mark.parametrize(
355355
"data, dtype",
356356
[
357+
([True, False, NA], "boolean"),
358+
# GH-40351
357359
(["A", NA], "category"),
358360
(["2020-10-10", "2020-10-10"], "datetime64[ns]"),
359361
(["2020-10-10", "2020-10-10", NaT], "datetime64[ns]"),
@@ -365,19 +367,10 @@ class TestAstypeString:
365367
(["1/1/2021", "2/1/2021"], "period[M]"),
366368
(["1/1/2021", "2/1/2021", NaT], "period[M]"),
367369
(["1 Day", "59 Days", NaT], "timedelta64[ns]"),
368-
# currently no way to parse BooleanArray, IntervalArray from a
369-
# list of strings
370+
# currently no way to parse IntervalArray from strings
370371
],
371372
)
372373
def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request):
373-
if dtype in ("timedelta64[ns]"):
374-
mark = pytest.mark.xfail(reason="TODO fix is_extension_array_dtype GH40478")
375-
request.node.add_marker(mark)
376-
if NaT in data and dtype in ("period[M]", "datetime64[ns]"):
377-
mark = pytest.mark.xfail(
378-
reason="TODO StringArray.astype() None to dtype.na_value conversion"
379-
)
380-
request.node.add_marker(mark)
381374
# GH-40351
382375
s = Series(data, dtype=dtype)
383376
tm.assert_series_equal(s, s.astype("string").astype(dtype))
@@ -503,18 +496,6 @@ def test_astype_categories_raises(self):
503496
with pytest.raises(TypeError, match="got an unexpected"):
504497
s.astype("category", categories=["a", "b"], ordered=True)
505498

506-
def test_astype_str_to_extension_dtype(self):
507-
# GH-40351
508-
s = Series(["A", np.NaN], dtype="string")
509-
result = s.astype("category")
510-
expected = Series(["A", np.NaN], dtype="category")
511-
tm.assert_series_equal(result, expected)
512-
513-
s = Series(["1/1/2021", "2/1/2021"], dtype="string")
514-
result = s.astype("period[M]")
515-
expected = Series(["1/1/2021", "2/1/2021"], dtype="period[M]")
516-
tm.assert_series_equal(result, expected)
517-
518499
@pytest.mark.parametrize("items", [["a", "b", "c", "a"], [1, 2, 3, 1]])
519500
def test_astype_from_categorical(self, items):
520501
ser = Series(items)

0 commit comments

Comments
 (0)