added pyarrow/numpy dtype literals and allowed str | DtypeObj as input for Series.astype (#756)

randolf-scholz · web-flow · commit 490914f32ee0 · 2023-07-25T11:49:58.000-04:00
* added pyarrow/numpy dtype literals & allowed str as astype input

* removed accidental double float

* added ObjectDtypeArg and lots of unit tests for literals

* removed str overload

* re-enabled s.astype(s.dtype) test

* refactored astype-tests to use pytest.mark.parametrize

* added VoidDtype, fixed some test issues

* attempted fix for float96/complex192

* added code for testing that all types are tested

* small edit

* removed float96, complex192 and fixed integer tests

* reverted accidental Series renames

* removed windows check for test_astype_int

* reordered literals
diff --git a/pandas-stubs/_typing.pyi b/pandas-stubs/_typing.pyi
@@ -78,6 +78,17 @@ NpDtype: TypeAlias = str | np.dtype[np.generic] | type[str | complex | bool | ob
 Dtype: TypeAlias = ExtensionDtype | NpDtype
 DtypeArg: TypeAlias = Dtype | Mapping[Any, Dtype]
 DtypeBackend: TypeAlias = Literal["pyarrow", "numpy_nullable"]
+
+# NOTE: we want to catch all the possible dtypes from np.sctypeDict
+# timedelta64
+# M
+# m8
+# M8
+# object_
+# object0
+# m
+# datetime64
+
 BooleanDtypeArg: TypeAlias = (
     # Builtin bool type and its string alias
     type[bool]  # noqa: Y030
@@ -86,7 +97,11 @@ BooleanDtypeArg: TypeAlias = (
     | pd.BooleanDtype
     | Literal["boolean"]
     # Numpy bool type
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_
     | type[np.bool_]
+    | Literal["?", "b1", "bool8", "bool_"]
+    # PyArrow boolean type and its string alias
+    | Literal["bool[pyarrow]", "boolean[pyarrow]"]
 )
 IntDtypeArg: TypeAlias = (
     # Builtin integer type and its string alias
@@ -99,31 +114,56 @@ IntDtypeArg: TypeAlias = (
     | pd.Int64Dtype
     | Literal["Int8", "Int16", "Int32", "Int64"]
     # Numpy signed integer types and their string aliases
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.byte
     | type[np.byte]
-    | type[np.int8]
-    | type[np.int16]
-    | type[np.int32]
-    | type[np.int64]
-    | type[np.intp]
-    | Literal["byte", "int8", "int16", "int32", "int64", "intp"]
+    | Literal["b", "i1", "int8", "byte"]
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.short
+    | type[np.short]
+    | Literal["h", "i2", "int16", "short"]
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.intc
+    | type[np.intc]
+    | Literal["i", "i4", "int32", "intc"]
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.int_
+    | type[np.int_]
+    | Literal["l", "i8", "int64", "int_", "long"]
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.longlong
+    | type[np.longlong]
+    | Literal["q", "longlong"]  # NOTE: int128 not assigned
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.intp
+    | type[np.intp]  # signed pointer (=`intptr_t`, platform dependent)
+    | Literal["p", "intp", "int0"]
+    # PyArrow integer types and their string aliases
+    | Literal["int8[pyarrow]", "int16[pyarrow]", "int32[pyarrow]", "int64[pyarrow]"]
+)
+UIntDtypeArg: TypeAlias = (
+    # Pandas nullable unsigned integer types and their string aliases
+    pd.UInt8Dtype  # noqa: Y030
+    | pd.UInt16Dtype
+    | pd.UInt32Dtype
+    | pd.UInt64Dtype
+    | Literal["UInt8", "UInt16", "UInt32", "UInt64"]
     # Numpy unsigned integer types and their string aliases
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ubyte
     | type[np.ubyte]
-    | type[np.uint8]
-    | type[np.uint16]
-    | type[np.uint32]
-    | type[np.uint64]
-    | type[np.uintp]
-    | Literal["ubyte", "uint8", "uint16", "uint32", "uint64", "uintp"]
+    | Literal["B", "u1", "uint8", "ubyte"]
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ushort
+    | type[np.ushort]
+    | Literal["H", "u2", "uint16", "ushort"]
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.uintc
+    | type[np.uintc]
+    | Literal["I", "u4", "uint32", "uintc"]
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.uint
+    | type[np.uint]
+    | Literal["L", "u8", "uint", "ulong", "uint64"]
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ulonglong
+    | type[np.ulonglong]
+    | Literal["Q", "ulonglong"]  # NOTE: uint128 not assigned
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.uintp
+    | type[np.uintp]  # unsigned pointer (=`uintptr_t`, platform dependent)
+    | Literal["P", "uintp", "uint0"]
+    # PyArrow unsigned integer types and their string aliases
+    | Literal["uint8[pyarrow]", "uint16[pyarrow]", "uint32[pyarrow]", "uint64[pyarrow]"]
 )
-StrDtypeArg: TypeAlias = (
-    # Builtin str type and its string alias
-    type[str]  # noqa: Y030
-    | Literal["str"]
-    # Pandas nullable string type and its string alias
-    | pd.StringDtype
-    | Literal["string"]
-)
-BytesDtypeArg: TypeAlias = type[bytes]
 FloatDtypeArg: TypeAlias = (
     # Builtin float type and its string alias
     type[float]  # noqa: Y030
@@ -133,19 +173,50 @@ FloatDtypeArg: TypeAlias = (
     | pd.Float64Dtype
     | Literal["Float32", "Float64"]
     # Numpy float types and their string aliases
-    | type[np.float16]
-    | type[np.float32]
-    | type[np.float64]
-    | Literal["float16", "float32", "float64"]
+    # NOTE: Alias np.float16 only on Linux x86_64, use np.half instead
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.half
+    | type[np.half]
+    | Literal["e", "f2", "<f2", "float16", "half"]
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.single
+    | type[np.single]
+    | Literal["f", "f4", "float32", "single"]
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.double
+    | type[np.double]
+    | Literal["d", "f8", "float64", "double", "float_"]
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.longdouble
+    | type[np.longdouble]
+    | Literal["g", "f16", "float128", "longdouble", "longfloat"]
+    # PyArrow floating point types and their string aliases
+    | Literal[
+        "float[pyarrow]",
+        "double[pyarrow]",
+        "float16[pyarrow]",
+        "float32[pyarrow]",
+        "float64[pyarrow]",
+    ]
 )
 ComplexDtypeArg: TypeAlias = (
     # Builtin complex type and its string alias
     type[complex]  # noqa: Y030
     | Literal["complex"]
     # Numpy complex types and their aliases
-    | type[np.complex64]
-    | type[np.complex128]
-    | Literal["complex64", "complex128"]
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.csingle
+    | type[np.csingle]
+    | Literal["F", "c8", "complex64", "csingle", "singlecomplex"]
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.cdouble
+    | type[np.cdouble]
+    | Literal["D", "c16", "complex128", "cdouble", "cfloat", "complex_"]
+    #  https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.clongdouble
+    # NOTE: Alias np.complex256 only on Linux x86_64, use np.clongdouble instead
+    | type[np.clongdouble]
+    | Literal[
+        "G",
+        "c32",
+        "complex256",
+        "clongdouble",
+        "clongfloat",
+        "longcomplex",
+    ]
 )
 # Refer to https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units
 TimedeltaDtypeArg: TypeAlias = Literal[
@@ -163,6 +234,41 @@ TimedeltaDtypeArg: TypeAlias = Literal[
     "timedelta64[ps]",
     "timedelta64[fs]",
     "timedelta64[as]",
+    # numpy type codes
+    "m8[Y]",
+    "m8[M]",
+    "m8[W]",
+    "m8[D]",
+    "m8[h]",
+    "m8[m]",
+    "m8[s]",
+    "m8[ms]",
+    "m8[us]",
+    "m8[μs]",
+    "m8[ns]",
+    "m8[ps]",
+    "m8[fs]",
+    "m8[as]",
+    # little endian
+    "<m8[Y]",
+    "<m8[M]",
+    "<m8[W]",
+    "<m8[D]",
+    "<m8[h]",
+    "<m8[m]",
+    "<m8[s]",
+    "<m8[ms]",
+    "<m8[us]",
+    "<m8[μs]",
+    "<m8[ns]",
+    "<m8[ps]",
+    "<m8[fs]",
+    "<m8[as]",
+    # PyArrow duration type and its string alias
+    "duration[s][pyarrow]",
+    "duration[ms][pyarrow]",
+    "duration[us][pyarrow]",
+    "duration[ns][pyarrow]",
 ]
 TimestampDtypeArg: TypeAlias = Literal[
     "datetime64[Y]",
@@ -179,24 +285,107 @@ TimestampDtypeArg: TypeAlias = Literal[
     "datetime64[ps]",
     "datetime64[fs]",
     "datetime64[as]",
+    # numpy type codes
+    "M8[Y]",
+    "M8[M]",
+    "M8[W]",
+    "M8[D]",
+    "M8[h]",
+    "M8[m]",
+    "M8[s]",
+    "M8[ms]",
+    "M8[us]",
+    "M8[μs]",
+    "M8[ns]",
+    "M8[ps]",
+    "M8[fs]",
+    "M8[as]",
+    # little endian
+    "<M8[Y]",
+    "<M8[M]",
+    "<M8[W]",
+    "<M8[D]",
+    "<M8[h]",
+    "<M8[m]",
+    "<M8[s]",
+    "<M8[ms]",
+    "<M8[us]",
+    "<M8[μs]",
+    "<M8[ns]",
+    "<M8[ps]",
+    "<M8[fs]",
+    "<M8[as]",
+    # PyArrow timestamp type and its string alias
+    "date32[pyarrow]",
+    "date64[pyarrow]",
+    "timestamp[s][pyarrow]",
+    "timestamp[ms][pyarrow]",
+    "timestamp[us][pyarrow]",
+    "timestamp[ns][pyarrow]",
 ]
+
+StrDtypeArg: TypeAlias = (
+    # Builtin str type and its string alias
+    type[str]  # noqa: Y030
+    | Literal["str"]
+    # Pandas nullable string type and its string alias
+    | pd.StringDtype
+    | Literal["string"]
+    # Numpy string type and its string alias
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.str_
+    | type[np.str_]
+    | Literal["U", "str_", "str0", "unicode", "unicode_"]
+    # PyArrow string type and its string alias
+    | Literal["string[pyarrow]"]
+)
+BytesDtypeArg: TypeAlias = (
+    # Builtin bytes type and its string alias
+    type[bytes]  # noqa: Y030
+    | Literal["bytes"]
+    # Numpy bytes type and its string alias
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bytes_
+    | type[np.bytes_]
+    | Literal["S", "a", "bytes_", "bytes0", "string_"]
+    # PyArrow binary type and its string alias
+    | Literal["binary[pyarrow]"]
+)
 CategoryDtypeArg: TypeAlias = CategoricalDtype | Literal["category"]
 
+ObjectDtypeArg: TypeAlias = (
+    # Builtin object type and its string alias
+    type[object]  # noqa: Y030
+    | Literal["object"]
+    # Numpy object type and its string alias
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.object_
+    | type[np.object_]
+    | Literal["O"]  # NOTE: "object_" not assigned
+)
+
+VoidDtypeArg: TypeAlias = (
+    # Numpy void type and its string alias
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.void
+    type[np.void]
+    | Literal["V", "void", "void0"]
+)
+
+# DtypeArg specifies all allowable dtypes in a functions its dtype argument
+DtypeObj: TypeAlias = np.dtype[np.generic] | ExtensionDtype
+
 AstypeArg: TypeAlias = (
     BooleanDtypeArg
     | IntDtypeArg
+    | UIntDtypeArg
     | StrDtypeArg
     | BytesDtypeArg
     | FloatDtypeArg
     | ComplexDtypeArg
     | TimedeltaDtypeArg
     | TimestampDtypeArg
     | CategoryDtypeArg
-    | ExtensionDtype
-    | type[object]
+    | ObjectDtypeArg
+    | VoidDtypeArg
+    | DtypeObj
 )
-# DtypeArg specifies all allowable dtypes in a functions its dtype argument
-DtypeObj: TypeAlias = np.dtype[np.generic] | ExtensionDtype
 
 # filenames and file-like-objects
 AnyStr_cov = TypeVar("AnyStr_cov", str, bytes, covariant=True)
diff --git a/pandas-stubs/core/series.pyi b/pandas-stubs/core/series.pyi
@@ -127,6 +127,7 @@ from pandas._typing import (
     ListLikeU,
     MaskType,
     NaPosition,
+    ObjectDtypeArg,
     QuantileInterpolation,
     RandomState,
     Renamer,
@@ -138,6 +139,8 @@ from pandas._typing import (
     TimedeltaDtypeArg,
     TimestampConvention,
     TimestampDtypeArg,
+    UIntDtypeArg,
+    VoidDtypeArg,
     WriteBuffer,
     np_ndarray_anyint,
     np_ndarray_bool,
@@ -329,7 +332,7 @@ class Series(IndexOpsMixin, NDFrame, Generic[S1]):
         is_copy: _bool | None = ...,
         **kwargs,
     ) -> Series[S1]: ...
-    def __getattr__(self, name: str) -> S1: ...
+    def __getattr__(self, name: _str) -> S1: ...
     @overload
     def __getitem__(
         self,
@@ -1152,7 +1155,7 @@ class Series(IndexOpsMixin, NDFrame, Generic[S1]):
     @overload
     def astype(
         self,
-        dtype: IntDtypeArg,
+        dtype: IntDtypeArg | UIntDtypeArg,
         copy: _bool = ...,
         errors: IgnoreRaise = ...,
     ) -> Series[int]: ...
@@ -1208,7 +1211,7 @@ class Series(IndexOpsMixin, NDFrame, Generic[S1]):
     @overload
     def astype(
         self,
-        dtype: type[object] | ExtensionDtype,
+        dtype: ObjectDtypeArg | VoidDtypeArg | ExtensionDtype | DtypeObj,
         copy: _bool = ...,
         errors: IgnoreRaise = ...,
     ) -> Series: ...
diff --git a/tests/test_series.py b/tests/test_series.py