Skip to content

ENH: Use MaskedEngine for numeric pyarrow dtypes #51316

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Feb 16, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1112,7 +1112,7 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
- Performance improvement for :meth:`Series.replace` with categorical dtype (:issue:`49404`)
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
- Performance improvement for indexing operations with nullable dtypes (:issue:`49420`)
- Performance improvement for indexing operations with nullable and pyarrow-backed dtypes (:issue:`49420`, :issue:`51316`)
- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`)
- Performance improvement for :func:`api.types.infer_dtype` (:issue:`51054`)
- Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
Expand Down
22 changes: 17 additions & 5 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1141,12 +1141,24 @@ cdef class ExtensionEngine(SharedEngine):

cdef class MaskedIndexEngine(IndexEngine):
def __init__(self, object values):
super().__init__(values._data)
self.mask = values._mask
super().__init__(self._get_data(values))
self.mask = self._get_mask(values)

# Return the plain value array for ``values``. The result is handed to the
# parent constructor in ``__init__`` and to ``self.mapping.lookup`` in
# ``get_indexer``.
def _get_data(self, object values) -> np.ndarray:
# Arrays that expose a ``_mask`` attribute also expose their raw values as ``_data``.
if hasattr(values, "_mask"):
return values._data
# We are an ArrowExtensionArray
# Missing entries are replaced by the placeholder 1 while converting to the
# dtype's ``numpy_dtype``. NOTE(review): presumably the placeholder is never
# read as a real value because ``_get_mask`` flags those positions -- confirm.
return values.to_numpy(na_value=1, dtype=values.dtype.numpy_dtype)

# Return the missing-value mask for ``values``; counterpart of ``_get_data``.
def _get_mask(self, object values) -> np.ndarray:
# Arrays that expose a ``_mask`` attribute already carry the mask; return it as-is.
if hasattr(values, "_mask"):
return values._mask
# We are an ArrowExtensionArray
# No ``_mask`` attribute here, so the mask is obtained from ``isna()``.
return values.isna()

def get_indexer(self, object values) -> np.ndarray:
self._ensure_mapping_populated()
return self.mapping.lookup(values._data, values._mask)
return self.mapping.lookup(self._get_data(values), self._get_mask(values))

def get_indexer_non_unique(self, object targets):
"""
Expand All @@ -1171,8 +1183,8 @@ cdef class MaskedIndexEngine(IndexEngine):
Py_ssize_t count = 0, count_missing = 0
Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx

target_vals = targets._data
target_mask = targets._mask
target_vals = self._get_data(targets)
target_mask = self._get_mask(targets)

values = self.values
assert not values.dtype == object # go through object path instead
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
TypeVar,
cast,
)
import warnings

import numpy as np

Expand Down Expand Up @@ -871,7 +872,10 @@ def to_numpy(
):
result = np.array(list(self), dtype=dtype)
else:
result = np.asarray(self._data, dtype=dtype)
with warnings.catch_warnings():
# int dtype with NA raises Warning
warnings.filterwarnings("ignore", category=RuntimeWarning)
result = np.asarray(self._data, dtype=dtype)
if copy or self._hasna:
result = result.copy()
if self._hasna:
Expand Down
20 changes: 19 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@
validate_putmask,
)
from pandas.core.arrays import (
ArrowExtensionArray,
BaseMaskedArray,
Categorical,
ExtensionArray,
Expand Down Expand Up @@ -221,6 +222,19 @@
"Int16": libindex.MaskedInt16Engine,
"Int8": libindex.MaskedInt8Engine,
"boolean": libindex.MaskedBoolEngine,
"double[pyarrow]": libindex.MaskedFloat64Engine,
"float64[pyarrow]": libindex.MaskedFloat64Engine,
"float32[pyarrow]": libindex.MaskedFloat32Engine,
"float[pyarrow]": libindex.MaskedFloat32Engine,
"uint64[pyarrow]": libindex.MaskedUInt64Engine,
"uint32[pyarrow]": libindex.MaskedUInt32Engine,
"uint16[pyarrow]": libindex.MaskedUInt16Engine,
"uint8[pyarrow]": libindex.MaskedUInt8Engine,
"int64[pyarrow]": libindex.MaskedInt64Engine,
"int32[pyarrow]": libindex.MaskedInt32Engine,
"int16[pyarrow]": libindex.MaskedInt16Engine,
"int8[pyarrow]": libindex.MaskedInt8Engine,
"bool[pyarrow]": libindex.MaskedBoolEngine,
}


Expand Down Expand Up @@ -795,7 +809,7 @@ def _engine(
# For base class (object dtype) we get ObjectEngine
target_values = self._get_engine_target()
if isinstance(target_values, ExtensionArray):
if isinstance(target_values, BaseMaskedArray):
if isinstance(target_values, (BaseMaskedArray, ArrowExtensionArray)):
return _masked_engines[target_values.dtype.name](target_values)
elif self._engine_type is libindex.ObjectEngine:
return libindex.ExtensionEngine(target_values)
Expand Down Expand Up @@ -4924,6 +4938,10 @@ def _get_engine_target(self) -> ArrayLike:
type(self) is Index
and isinstance(self._values, ExtensionArray)
and not isinstance(self._values, BaseMaskedArray)
and not (
isinstance(self._values, ArrowExtensionArray)
and is_numeric_dtype(self.dtype)
)
):
# TODO(ExtensionIndex): remove special-case, just use self._values
return self._values.astype(object)
Expand Down
23 changes: 13 additions & 10 deletions pandas/tests/indexes/numeric/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,26 +317,26 @@ def test_get_indexer_uint64(self, index_large):
tm.assert_numpy_array_equal(indexer, expected)

@pytest.mark.parametrize("val, val2", [(4, 5), (4, 4), (4, NA), (NA, NA)])
def test_get_loc_masked(self, val, val2, any_numeric_ea_dtype):
def test_get_loc_masked(self, val, val2, any_numeric_ea_and_arrow_dtype):
# GH#39133
idx = Index([1, 2, 3, val, val2], dtype=any_numeric_ea_dtype)
idx = Index([1, 2, 3, val, val2], dtype=any_numeric_ea_and_arrow_dtype)
result = idx.get_loc(2)
assert result == 1

with pytest.raises(KeyError, match="9"):
idx.get_loc(9)

def test_get_loc_masked_na(self, any_numeric_ea_dtype):
def test_get_loc_masked_na(self, any_numeric_ea_and_arrow_dtype):
# GH#39133
idx = Index([1, 2, NA], dtype=any_numeric_ea_dtype)
idx = Index([1, 2, NA], dtype=any_numeric_ea_and_arrow_dtype)
result = idx.get_loc(NA)
assert result == 2

idx = Index([1, 2, NA, NA], dtype=any_numeric_ea_dtype)
idx = Index([1, 2, NA, NA], dtype=any_numeric_ea_and_arrow_dtype)
result = idx.get_loc(NA)
tm.assert_numpy_array_equal(result, np.array([False, False, True, True]))

idx = Index([1, 2, 3], dtype=any_numeric_ea_dtype)
idx = Index([1, 2, 3], dtype=any_numeric_ea_and_arrow_dtype)
with pytest.raises(KeyError, match="NA"):
idx.get_loc(NA)

Expand Down Expand Up @@ -371,16 +371,19 @@ def test_get_loc_masked_na_and_nan(self):
idx.get_loc(NA)

@pytest.mark.parametrize("val", [4, 2])
def test_get_indexer_masked_na(self, any_numeric_ea_dtype, val):
def test_get_indexer_masked_na(self, any_numeric_ea_and_arrow_dtype, val):
# GH#39133
idx = Index([1, 2, NA, 3, val], dtype=any_numeric_ea_dtype)
idx = Index([1, 2, NA, 3, val], dtype=any_numeric_ea_and_arrow_dtype)
result = idx.get_indexer_for([1, NA, 5])
expected = np.array([0, 2, -1])
tm.assert_numpy_array_equal(result, expected, check_dtype=False)

def test_get_indexer_masked_na_boolean(self):
@pytest.mark.parametrize("dtype", ["boolean", "bool[pyarrow]"])
def test_get_indexer_masked_na_boolean(self, dtype):
# GH#39133
idx = Index([True, False, NA], dtype="boolean")
if dtype == "bool[pyarrow]":
pytest.importorskip("pyarrow")
idx = Index([True, False, NA], dtype=dtype)
result = idx.get_loc(False)
assert result == 1
result = idx.get_loc(NA)
Expand Down