ENH: Incorporate ArrowDtype into ArrowExtensionArray #47034

Merged
merged 48 commits on Jun 9, 2022

Changes from 13 commits

Commits (48)
9053263
Add other dtype attributes
mroeschke May 16, 2022
088f72e
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 16, 2022
aee3dc8
add pa_type in the constructor and modify methods as needed
mroeschke May 16, 2022
aa13af8
Have ArrowExtensionArray support ArrowDtype
mroeschke May 16, 2022
d521264
Fix tests
mroeschke May 16, 2022
ce05407
add ImportError raise
mroeschke May 17, 2022
bf0365b
Just partial match
mroeschke May 17, 2022
01e4a4b
Address typing
mroeschke May 17, 2022
cc1c687
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 17, 2022
97967a5
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 17, 2022
f2d872d
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 19, 2022
a77ea6b
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 19, 2022
26e8998
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 19, 2022
a157e51
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 20, 2022
baeae04
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 23, 2022
c33c345
Complete more methods of ArrowExtensionArray
mroeschke May 24, 2022
901e9b0
Add types and first test
mroeschke May 24, 2022
b3f6d93
Fix getitem type thing
mroeschke May 24, 2022
80059d5
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 24, 2022
5c873d5
Try import or skip:
mroeschke May 24, 2022
1160bff
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 24, 2022
68bb030
Fix typo
mroeschke May 24, 2022
9fd9161
Fix data size, coercion of pa.NA in lists
mroeschke May 24, 2022
939e751
change pa_dtype to pyarrow dtype
mroeschke May 25, 2022
1a5d3ff
Address more tests
mroeschke May 25, 2022
01ca1c7
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 25, 2022
f2dda8c
Add register_extension_dtype
mroeschke May 25, 2022
26b2f1c
Address Joris' comments
mroeschke May 25, 2022
95bd38f
Revert to self.name, xfail the dtype test due to conflict
mroeschke May 25, 2022
a455b50
Add getitem tests
mroeschke May 25, 2022
8d6ebb5
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 25, 2022
b6972a5
Add conditions when fails for other pyarrow versions
mroeschke May 25, 2022
0024d9e
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 26, 2022
a18fd6f
Fix docstring validate
mroeschke May 26, 2022
f6b779d
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 26, 2022
9edb6a4
Fix typing errors
mroeschke May 26, 2022
d074188
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 27, 2022
f8983ad
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 30, 2022
1b6fe93
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 31, 2022
eedffc2
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke May 31, 2022
c69d70e
Remove incorrectly implemented _from_factorized
mroeschke May 31, 2022
245fbe6
Add notimplementederror for construct_from_string with parameters
mroeschke May 31, 2022
91aaaab
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke Jun 1, 2022
1a44a6d
Address review
mroeschke Jun 1, 2022
86e178c
Add pyarrow_dtype to _metadata
mroeschke Jun 1, 2022
4129e37
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke Jun 1, 2022
c5d029f
Address typing and fix data fixture
mroeschke Jun 1, 2022
4743781
Merge remote-tracking branch 'upstream/main' into enh/arrowdtype_support
mroeschke Jun 5, 2022
20 changes: 16 additions & 4 deletions pandas/core/arrays/arrow/array.py
@@ -39,6 +39,7 @@
import pyarrow.compute as pc

from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
from pandas.core.arrays.arrow.dtype import ArrowDtype

if TYPE_CHECKING:
from pandas import Series
@@ -48,16 +49,27 @@

class ArrowExtensionArray(ExtensionArray):
"""
Base class for ExtensionArray backed by Arrow array.
Base class for ExtensionArray backed by Arrow ChunkedArray.
"""

_data: pa.ChunkedArray

def __init__(self, values: pa.ChunkedArray) -> None:
self._data = values
def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
if pa_version_under1p01:
msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray."
raise ImportError(msg)
if isinstance(values, pa.Array):
self._data = pa.chunked_array([values])
elif isinstance(values, pa.ChunkedArray):
self._data = values
else:
raise ValueError(
f"Unsupported type '{type(values)}' for ArrowExtensionArray"
)
self._dtype = ArrowDtype(self._data.type)

def __arrow_array__(self, type=None):
"""Convert myself to a pyarrow Array or ChunkedArray."""
"""Convert myself to a pyarrow ChunkedArray."""
return self._data

def equals(self, other) -> bool:
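For illustration (not part of the diff), a minimal sketch of what the reworked constructor does, assuming pyarrow is installed; the private _data attribute is only shown here to mirror the code above:

>>> import pyarrow as pa
>>> from pandas.core.arrays.arrow.array import ArrowExtensionArray
>>> arr = ArrowExtensionArray(pa.array([1, 2, None], type=pa.int64()))
>>> arr._data.num_chunks  # a plain pa.Array is wrapped into a one-chunk ChunkedArray
1
>>> arr._data.type        # the ArrowDtype is derived from this stored pyarrow type
DataType(int64)
>>> ArrowExtensionArray([1, 2, None])
ValueError: Unsupported type '<class 'list'>' for ArrowExtensionArray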
75 changes: 50 additions & 25 deletions pandas/core/arrays/arrow/dtype.py
@@ -8,23 +8,34 @@

from pandas.core.dtypes.base import StorageExtensionDtype

from pandas.core.arrays.arrow import ArrowExtensionArray


class ArrowDtype(StorageExtensionDtype):
"""
Base class for dtypes for BaseArrowArray subclasses.
Base class for dtypes for ArrowExtensionArray.
Modeled after BaseMaskedDtype
"""

name: str
base = None
type: pa.DataType

na_value = pa.NA
Member:

This wasn't discussed in #46774, but I think this should be pd.NA instead of pa.NA. This is the actual object that users get when accessing a scalar missing value (not how it is stored under the hood), and so pa.NA has no useful / proper behaviour to serve as a user-facing object.

For example, StringDtype("pyarrow") also uses pd.NA.

Member Author:

Hmm, so if users actually want to access the underlying data, they need to convert the array to arrow first?

And this also means we need to ensure we are never building a new pa.array/chunked_array with pd.NA?

Member:

> Hmm, so if users actually want to access the underlying data, they need to convert the array to arrow first?

If we want users to be able to access the underlying data, then we should probably make the _data attribute public, or add a to_pyarrow() method in analogy with to_numpy() (users can actually also do pa.array(ext_arr), and that will also give them the underlying arrow array).

> And this also means we need to ensure we are never building a new pa.array/chunked_array with pd.NA?

pyarrow actually handles pd.NA well (although this might require a more recent version than 1.0, didn't check), and not pa.NA (that sounds strange, I know ..):

>>> pa.array([1, pd.NA], from_pandas=True)
<pyarrow.lib.Int64Array object at 0x7fccccb81d00>
[
  1,
  null
]

>>> pa.array([1, pa.NA], from_pandas=True)
...
ArrowInvalid: Could not convert None with type pyarrow.lib.NullScalar: did not recognize Python value type when inferring an Arrow data type

So this might need a from_pandas=True to be added in _from_sequence.

Member:

One additional comment on this:

> Hmm, so if users actually want to access the underlying data, they need to convert the array to arrow first?

This is also the case for non-null values. When you access a scalar value (arr[i]), we convert that to a "python" scalar (the as_py() call in __getitem__), so here too you don't have access to the underlying pyarrow data / scalar.

Member:

Does pa.NA have the same semantics as pd.NA?

Member:

No, and that's what I meant by "pa.NA has no useful / proper behaviour to serve as a user-facing object". Basically, pa.NA has no semantics whatsoever, apart from being a python object (and having an as_py() method that returns None, and being equal to itself).

Note this is the case in general for the pyarrow scalar objects. They are all light wrappers around the C++ data, exposing a method to convert it to a proper python scalar, and otherwise have no functionality to be used as a scalar object in a general context (e.g. the int scalar also can't be used as a number, for example for adding or comparing values).

Member Author:

This extension array defines __arrow_array__, so if users want the underlying data they can call pa.array(...) (which seems reasonable to me)?

Gotcha, I see that __getitem__ already coerces scalars with as_py, so given that I agree it makes more sense to return pd.NA instead of pa.NA.

Member:

Thanks for clarifying, Joris. I don't have an opinion on this per se, just advice that pd.NA should only be used here if you very specifically want pd.NA semantics. Otherwise None or np.nan or just punting should be considered.

Member:

I do think we specifically want pd.NA, as the behaviour of pd.NA generally matches the behaviour of nulls in Arrow arrays (e.g. propagating in comparisons, instead of returning False).
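For illustration, a small sketch of the pd.NA semantics referred to here (missing values propagate, with three-valued logic for booleans), which is what makes it a better user-facing missing value than pa.NA:

>>> import pandas as pd
>>> pd.NA == 1    # comparisons propagate the missing value instead of returning False
<NA>
>>> pd.NA | True  # Kleene ("three-valued") logic for boolean operations
True
>>> pd.NA | False
<NA>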


def __init__(self, storage="pyarrow") -> None:
super().__init__(storage)
def __init__(self, pa_dtype: pa.DataType) -> None:
super().__init__("pyarrow")
if not isinstance(pa_dtype, pa.DataType):
raise ValueError("pa_dtype must be an instance of a pyarrow.DataType")
self.pa_dtype = pa_dtype

@property
def type(self):
"""
The scalar type for the array, e.g. ``int``
"""
return self.pa_dtype

@property
def name(self) -> str: # type: ignore[override]
"""
A string identifying the data type.
"""
return str(self.pa_dtype)

@cache_readonly
def numpy_dtype(self) -> np.dtype:
@@ -49,6 +60,8 @@ def construct_array_type(cls):
-------
type
"""
from pandas.core.arrays.arrow import ArrowExtensionArray

return ArrowExtensionArray

@classmethod
@@ -59,29 +72,44 @@ def construct_from_string(cls, string: str):
Parameters
----------
string : str
string should follow the format f"{pyarrow_type}[pyarrow]"
e.g. int64[pyarrow]
"""
if not isinstance(string, str):
raise TypeError(
f"'construct_from_string' expects a string, got {type(string)}"
)
if string == f"{cls.name}[pyarrow]":
return cls(storage="pyarrow")
raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
if not string.endswith("[pyarrow]"):
raise TypeError(f"string {string} must end with '[pyarrow]'")
base_type = string.split("[pyarrow]")[0]
pa_dtype = getattr(pa, base_type, None)
if pa_dtype is None:
raise TypeError(f"'{base_type}' is not a valid pyarrow data type.")
return cls(pa_dtype())
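As an illustrative sketch (not taken from this diff's tests), the accepted string form is the pyarrow type name followed by "[pyarrow]", and the lookup is a plain getattr on the pyarrow module:

>>> import pyarrow as pa
>>> from pandas.core.arrays.arrow.dtype import ArrowDtype
>>> dt = ArrowDtype.construct_from_string("int64[pyarrow]")    # equivalent to ArrowDtype(pa.int64())
>>> dt = ArrowDtype.construct_from_string("float64[pyarrow]")  # equivalent to ArrowDtype(pa.float64())
>>> ArrowDtype.construct_from_string("int64")                  # missing the suffix
TypeError: string int64 must end with '[pyarrow]'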

@property
def _is_numeric(self) -> bool:
"""
Whether columns with this dtype should be considered numeric.
"""
# TODO: pa.types.is_boolean?
return (
pa.types.is_integer(self.pa_dtype)
or pa.types.is_floating(self.pa_dtype)
or pa.types.is_decimal(self.pa_dtype)
)

@classmethod
def from_numpy_dtype(cls, dtype: np.dtype) -> ArrowDtype:
@property
def _is_boolean(self) -> bool:
"""
Construct the ArrowDtype corresponding to the given numpy dtype.
Whether this dtype should be considered boolean.
"""
# TODO: This may be incomplete
pa_dtype = pa.from_numpy_dtype(dtype)
if pa_dtype is cls.type:
return cls()
raise NotImplementedError(dtype)
return pa.types.is_boolean(self.pa_dtype)

def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
# We unwrap any masked dtypes, find the common dtype we would use
# for that, then re-mask the result.
# Mirrors BaseMaskedDtype
from pandas.core.dtypes.cast import find_common_type

new_dtype = find_common_type(
@@ -91,12 +119,9 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
]
Member:

Something to note somewhere (rather for a follow-up): the above will only work for pyarrow types that have a matching numpy dtype, which is not the case for all types (e.g. decimal, dictionary, date, nested types, etc.).

Member Author:

I defaulted the numpy_dtype to np.dtype(object) for those pyarrow types without a corresponding numpy type.

https://github.com/pandas-dev/pandas/pull/47034/files#diff-251a2430af30e9f685a07dbee6d5c024e832fc7e10569c7d43ee46bc115b422dR56

Member:

I am not sure that will help (but without having tests that cover this it is hard to say). If .numpy_dtype returns object dtype, then the common dtype here will also be object dtype, and then pa.from_numpy_dtype below will fail (so this function will return None). That means that there can never be a proper ArrowDtype common dtype (that is not object dtype) for such arrow types.

For example, if you have one array of decimal and one of float, the common dtype could be float (not sure we want to do this, but let's assume for the example). With the current implementation, the numpy_dtype for those extension dtypes will be np.dtype(object) and np.dtype(float). The common dtype for that will always be np.dtype(object), which means that concatting such columns will result in a cast to object dtype, instead of casting to/preserving the float dtype.

So at some point, we should probably include pyarrow-specific logic in here that doesn't rely on converting to a numpy dtype and numpy's notion of a common type.

Member Author:

> So at some point, we should probably include pyarrow-specific logic in here that doesn't rely on converting to a numpy dtype and numpy's notion of a common type.

Agreed, and makes sense that this shouldn't be object dtype in the long term. Would be great if this eventually follows pyarrow's type coercion rules if there is one :)

)
if not isinstance(new_dtype, np.dtype):
# If we ever support e.g. Masked[DatetimeArray] then this will change
return None
try:
return type(self).from_numpy_dtype(new_dtype)
except (KeyError, NotImplementedError):
return None
pa_dtype = pa.from_numpy_dtype(new_dtype)
return type(self)(pa_dtype)
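To make the concern from the discussion above concrete, a small sketch (an assumption about the fallback behaviour, not a test from this PR): once one operand's numpy_dtype falls back to object, numpy's notion of a common type is object as well, and pa.from_numpy_dtype cannot turn that back into a pyarrow type, so no non-object common ArrowDtype is recovered:

>>> import numpy as np
>>> from pandas.core.dtypes.cast import find_common_type
>>> find_common_type([np.dtype("float64"), np.dtype(object)])  # e.g. float vs a decimal-backed ArrowDtype
dtype('O')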

def __from_arrow__(self, array: pa.Array | pa.ChunkedArray):
"""
10 changes: 3 additions & 7 deletions pandas/core/arrays/string_arrow.py
@@ -76,7 +76,7 @@

def _chk_pyarrow_available() -> None:
if pa_version_under1p01:
msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray."
msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray."
raise ImportError(msg)


@@ -132,13 +132,9 @@ class ArrowStringArray(
"""

def __init__(self, values) -> None:
super().__init__(values)
# TODO: Migrate to ArrowDtype instead
self._dtype = StringDtype(storage="pyarrow")
if isinstance(values, pa.Array):
self._data = pa.chunked_array([values])
elif isinstance(values, pa.ChunkedArray):
self._data = values
else:
raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray")

if not pa.types.is_string(self._data.type):
raise ValueError(
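A hedged sketch of the resulting behaviour (assuming pyarrow is installed): the type check now lives in the shared ArrowExtensionArray.__init__, which is why the expected error message in the tests below changes:

>>> import numpy as np
>>> from pandas.core.arrays.string_arrow import ArrowStringArray
>>> ArrowStringArray(np.array(["a", "b"], dtype="S1"))
ValueError: Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray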
2 changes: 1 addition & 1 deletion pandas/tests/arrays/string_/test_string.py
@@ -260,7 +260,7 @@ def test_constructor_raises(cls):
if cls is pd.arrays.StringArray:
msg = "StringArray requires a sequence of strings or pandas.NA"
else:
msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowStringArray"
msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray"

with pytest.raises(ValueError, match=msg):
cls(np.array(["a", "b"], dtype="S1"))
4 changes: 2 additions & 2 deletions pandas/tests/arrays/string_/test_string_arrow.py
@@ -59,7 +59,7 @@ def test_constructor_not_string_type_raises(array, chunked):
pytest.skip("chunked not applicable to numpy array")
arr = pa.chunked_array(arr)
if array is np:
msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowStringArray"
msg = "Unsupported type '<class 'numpy.ndarray'>' for ArrowExtensionArray"
else:
msg = re.escape(
"ArrowStringArray requires a PyArrow (chunked) array of string type"
@@ -122,7 +122,7 @@ def test_pyarrow_not_installed_raises():
reason="pyarrow is installed",
)
def test_pyarrow_not_installed_raises():
msg = re.escape("pyarrow>=1.0.0 is required for PyArrow backed StringArray")
msg = re.escape("pyarrow>=1.0.0 is required for PyArrow backed")

with pytest.raises(ImportError, match=msg):
StringDtype(storage="pyarrow")
4 changes: 2 additions & 2 deletions pandas/tests/extension/arrow/arrays.py
@@ -185,7 +185,7 @@ def __init__(self, values) -> None:

assert values.type == pa.bool_()
self._data = values
self._dtype = ArrowBoolDtype()
self._dtype = ArrowBoolDtype() # type: ignore[assignment]


class ArrowStringArray(ArrowExtensionArray):
@@ -195,4 +195,4 @@ def __init__(self, values) -> None:

assert values.type == pa.string()
self._data = values
self._dtype = ArrowStringDtype()
self._dtype = ArrowStringDtype() # type: ignore[assignment]
2 changes: 1 addition & 1 deletion pandas/tests/extension/arrow/test_timestamp.py
@@ -46,7 +46,7 @@ def __init__(self, values) -> None:

assert values.type == pa.timestamp("us")
self._data = values
self._dtype = ArrowTimestampUSDtype()
self._dtype = ArrowTimestampUSDtype() # type: ignore[assignment]


def test_constructor_extensionblock():