pandas-dev · phofl · Feb 16, 2023 · Feb 10, 2023 · Feb 10, 2023 · Feb 10, 2023
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -1112,7 +1112,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
 - Performance improvement for :meth:`Series.replace` with categorical dtype (:issue:`49404`)
 - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
-- Performance improvement for indexing operations with nullable dtypes (:issue:`49420`)
+- Performance improvement for indexing operations with nullable and pyarrow-backed dtypes (:issue:`49420`, :issue:`51316`)
 - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`)
 - Performance improvement for :func:`api.types.infer_dtype` (:issue:`51054`)
 - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -1141,12 +1141,24 @@ cdef class ExtensionEngine(SharedEngine):
 
 cdef class MaskedIndexEngine(IndexEngine):
     def __init__(self, object values):
-        super().__init__(values._data)
-        self.mask = values._mask
+        super().__init__(self._get_data(values))
+        self.mask = self._get_mask(values)
+
+    def _get_data(self, object values) -> np.ndarray:
+        if hasattr(values, "_mask"):
+            return values._data
+        # No _mask attribute: treated as an ArrowExtensionArray; NA slots are filled with 1
+        return values.to_numpy(na_value=1, dtype=values.dtype.numpy_dtype)
+
+    def _get_mask(self, object values) -> np.ndarray:
+        if hasattr(values, "_mask"):
+            return values._mask
+        # No _mask attribute: treated as an ArrowExtensionArray; mask comes from isna()
+        return values.isna()
 
     def get_indexer(self, object values) -> np.ndarray:
         self._ensure_mapping_populated()
-        return self.mapping.lookup(values._data, values._mask)
+        return self.mapping.lookup(self._get_data(values), self._get_mask(values))
 
     def get_indexer_non_unique(self, object targets):
         """
@@ -1171,8 +1183,8 @@ cdef class MaskedIndexEngine(IndexEngine):
             Py_ssize_t count = 0, count_missing = 0
             Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx
 
-        target_vals = targets._data
-        target_mask = targets._mask
+        target_vals = self._get_data(targets)
+        target_mask = self._get_mask(targets)
 
         values = self.values
         assert not values.dtype == object  # go through object path instead

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -8,6 +8,7 @@
     TypeVar,
     cast,
 )
+import warnings
 
 import numpy as np
 
@@ -871,7 +872,10 @@ def to_numpy(
         ):
             result = np.array(list(self), dtype=dtype)
         else:
-            result = np.asarray(self._data, dtype=dtype)
+            with warnings.catch_warnings():
+                # converting to an int dtype with NA present emits a RuntimeWarning; silence it
+                warnings.filterwarnings("ignore", category=RuntimeWarning)
+                result = np.asarray(self._data, dtype=dtype)
             if copy or self._hasna:
                 result = result.copy()
         if self._hasna:

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -145,6 +145,7 @@
     validate_putmask,
 )
 from pandas.core.arrays import (
+    ArrowExtensionArray,
     BaseMaskedArray,
     Categorical,
     ExtensionArray,
@@ -221,6 +222,19 @@
     "Int16": libindex.MaskedInt16Engine,
     "Int8": libindex.MaskedInt8Engine,
     "boolean": libindex.MaskedBoolEngine,
+    "double[pyarrow]": libindex.MaskedFloat64Engine,
+    "float64[pyarrow]": libindex.MaskedFloat64Engine,
+    "float32[pyarrow]": libindex.MaskedFloat32Engine,
+    "float[pyarrow]": libindex.MaskedFloat32Engine,
+    "uint64[pyarrow]": libindex.MaskedUInt64Engine,
+    "uint32[pyarrow]": libindex.MaskedUInt32Engine,
+    "uint16[pyarrow]": libindex.MaskedUInt16Engine,
+    "uint8[pyarrow]": libindex.MaskedUInt8Engine,
+    "int64[pyarrow]": libindex.MaskedInt64Engine,
+    "int32[pyarrow]": libindex.MaskedInt32Engine,
+    "int16[pyarrow]": libindex.MaskedInt16Engine,
+    "int8[pyarrow]": libindex.MaskedInt8Engine,
+    "bool[pyarrow]": libindex.MaskedBoolEngine,
 }
 
 
@@ -795,7 +809,7 @@ def _engine(
         # For base class (object dtype) we get ObjectEngine
         target_values = self._get_engine_target()
         if isinstance(target_values, ExtensionArray):
-            if isinstance(target_values, BaseMaskedArray):
+            if isinstance(target_values, (BaseMaskedArray, ArrowExtensionArray)):
                 return _masked_engines[target_values.dtype.name](target_values)
             elif self._engine_type is libindex.ObjectEngine:
                 return libindex.ExtensionEngine(target_values)
@@ -4924,6 +4938,10 @@ def _get_engine_target(self) -> ArrayLike:
             type(self) is Index
             and isinstance(self._values, ExtensionArray)
             and not isinstance(self._values, BaseMaskedArray)
+            and not (
+                isinstance(self._values, ArrowExtensionArray)
+                and is_numeric_dtype(self.dtype)
+            )
         ):
             # TODO(ExtensionIndex): remove special-case, just use self._values
             return self._values.astype(object)

diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py
@@ -317,26 +317,26 @@ def test_get_indexer_uint64(self, index_large):
         tm.assert_numpy_array_equal(indexer, expected)
 
     @pytest.mark.parametrize("val, val2", [(4, 5), (4, 4), (4, NA), (NA, NA)])
-    def test_get_loc_masked(self, val, val2, any_numeric_ea_dtype):
+    def test_get_loc_masked(self, val, val2, any_numeric_ea_and_arrow_dtype):
         # GH#39133
-        idx = Index([1, 2, 3, val, val2], dtype=any_numeric_ea_dtype)
+        idx = Index([1, 2, 3, val, val2], dtype=any_numeric_ea_and_arrow_dtype)
         result = idx.get_loc(2)
         assert result == 1
 
         with pytest.raises(KeyError, match="9"):
             idx.get_loc(9)
 
-    def test_get_loc_masked_na(self, any_numeric_ea_dtype):
+    def test_get_loc_masked_na(self, any_numeric_ea_and_arrow_dtype):
         # GH#39133
-        idx = Index([1, 2, NA], dtype=any_numeric_ea_dtype)
+        idx = Index([1, 2, NA], dtype=any_numeric_ea_and_arrow_dtype)
         result = idx.get_loc(NA)
         assert result == 2
 
-        idx = Index([1, 2, NA, NA], dtype=any_numeric_ea_dtype)
+        idx = Index([1, 2, NA, NA], dtype=any_numeric_ea_and_arrow_dtype)
         result = idx.get_loc(NA)
         tm.assert_numpy_array_equal(result, np.array([False, False, True, True]))
 
-        idx = Index([1, 2, 3], dtype=any_numeric_ea_dtype)
+        idx = Index([1, 2, 3], dtype=any_numeric_ea_and_arrow_dtype)
         with pytest.raises(KeyError, match="NA"):
             idx.get_loc(NA)
 
@@ -371,16 +371,19 @@ def test_get_loc_masked_na_and_nan(self):
             idx.get_loc(NA)
 
     @pytest.mark.parametrize("val", [4, 2])
-    def test_get_indexer_masked_na(self, any_numeric_ea_dtype, val):
+    def test_get_indexer_masked_na(self, any_numeric_ea_and_arrow_dtype, val):
         # GH#39133
-        idx = Index([1, 2, NA, 3, val], dtype=any_numeric_ea_dtype)
+        idx = Index([1, 2, NA, 3, val], dtype=any_numeric_ea_and_arrow_dtype)
         result = idx.get_indexer_for([1, NA, 5])
         expected = np.array([0, 2, -1])
         tm.assert_numpy_array_equal(result, expected, check_dtype=False)
 
-    def test_get_indexer_masked_na_boolean(self):
+    @pytest.mark.parametrize("dtype", ["boolean", "bool[pyarrow]"])
+    def test_get_indexer_masked_na_boolean(self, dtype):
         # GH#39133
-        idx = Index([True, False, NA], dtype="boolean")
+        if dtype == "bool[pyarrow]":
+            pytest.importorskip("pyarrow")
+        idx = Index([True, False, NA], dtype=dtype)
         result = idx.get_loc(False)
         assert result == 1
         result = idx.get_loc(NA)