Skip to content

fix series.isin slow issue with Dtype IntegerArray #38379

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 39 commits into from
Jan 20, 2021
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
109c0e7
fix series.isin slow issue with Dtype IntegerArray
tushushu Dec 9, 2020
e9f96ea
Move isinstance(comps, IntegerArray) to algo.isin
tushushu Dec 9, 2020
a6be9c8
cannot import IntegerArray due to circular import
tushushu Dec 9, 2020
415b590
fix bug in pandas (Linux py38_np_dev)
tushushu Dec 9, 2020
f3e5afb
fix pre commit issue.
tushushu Dec 9, 2020
14579fc
fix the code style issue.
tushushu Dec 9, 2020
562c918
move the logic to elif block.
tushushu Dec 11, 2020
1449d3c
remove blank line.
tushushu Dec 11, 2020
3ccc917
copy codes from #38422
tushushu Dec 27, 2020
98a0683
make `isin` correct for pd.NA
tushushu Dec 27, 2020
6e2917e
sort imports
tushushu Dec 27, 2020
a4b6503
Avoiding import pandas as pd.
tushushu Dec 31, 2020
f95fde9
fix cannot import NA issue.
tushushu Dec 31, 2020
2348a60
Merge pull request #1 from pandas-dev/master
tushushu Jan 2, 2021
c963183
Merge remote-tracking branch 'upstream/master' into ENH-implement-fas…
tushushu Jan 7, 2021
c151102
Merge pull request #2 from pandas-dev/master
tushushu Jan 7, 2021
ef99e86
Merge branch 'ENH-implement-fast-isin' of github.com:tushushu/pandas …
tushushu Jan 7, 2021
f23ba94
Merge remote-tracking branch 'origin/master' into ENH-implement-fast-…
tushushu Jan 9, 2021
13dc64f
Adding Int64 and Float64 for benchmarks.
tushushu Jan 9, 2021
cc38088
Adding isin benchmarks for Boolean array
tushushu Jan 9, 2021
94846cc
Adding what's new note.
tushushu Jan 9, 2021
f4cb5ce
fix IsInFloat64 benchmarks
tushushu Jan 9, 2021
2ee8b05
always return false for null values.
tushushu Jan 9, 2021
7763b18
fix flake8 error.
tushushu Jan 9, 2021
87dfff9
remove unused lines.
tushushu Jan 10, 2021
d2d32d1
refactors for series benchmarks.
tushushu Jan 10, 2021
90ef57a
fix flake8 errors.
tushushu Jan 10, 2021
a48e00b
Change back to see if can pass the tests.
tushushu Jan 10, 2021
2279519
Merge remote-tracking branch 'upstream/master' into ENH-implement-fas…
tushushu Jan 14, 2021
c726d4a
makes NA isin [NA] return True.
tushushu Jan 14, 2021
1134ad6
remove redundant codes.
tushushu Jan 14, 2021
bce0e3e
makes performance better.
tushushu Jan 14, 2021
bf788e5
fix flake8 errors.
tushushu Jan 14, 2021
40950be
polish codes
tushushu Jan 16, 2021
570d640
not import NA
tushushu Jan 16, 2021
0f89578
fix code style
tushushu Jan 16, 2021
199c11c
fix black error.
tushushu Jan 16, 2021
9f35b5b
fix CI
tushushu Jan 17, 2021
2238dc5
Merge remote-tracking branch 'upstream/master' into ENH-implement-fas…
tushushu Jan 19, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/reference/extensions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ objects.
api.extensions.ExtensionArray.equals
api.extensions.ExtensionArray.factorize
api.extensions.ExtensionArray.fillna
api.extensions.ExtensionArray.isin
api.extensions.ExtensionArray.isna
api.extensions.ExtensionArray.ravel
api.extensions.ExtensionArray.repeat
Expand Down
10 changes: 3 additions & 7 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,10 +449,8 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:

comps = _ensure_arraylike(comps)
comps = extract_array(comps, extract_numpy=True)
if is_categorical_dtype(comps.dtype):
# TODO(extension)
# handle categoricals
return cast("Categorical", comps).isin(values)
if is_extension_array_dtype(comps.dtype):
return comps.isin(values)

if needs_i8_conversion(comps.dtype):
# Dispatch to DatetimeLikeArrayMixin.isin
Expand All @@ -464,9 +462,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
elif needs_i8_conversion(values.dtype):
return isin(comps, values.astype(object))

elif is_extension_array_dtype(comps.dtype) or is_extension_array_dtype(
values.dtype
):
elif is_extension_array_dtype(values.dtype):
return isin(np.asarray(comps), np.asarray(values))

# GH16012
Expand Down
19 changes: 18 additions & 1 deletion pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
from pandas.core.dtypes.missing import isna

from pandas.core import ops
from pandas.core.algorithms import factorize_array, unique
from pandas.core.algorithms import factorize_array, isin, unique
from pandas.core.missing import get_fill_func
from pandas.core.sorting import nargminmax, nargsort

Expand Down Expand Up @@ -78,6 +78,7 @@ class ExtensionArray:
factorize
fillna
equals
isin
isna
ravel
repeat
Expand Down Expand Up @@ -833,6 +834,22 @@ def equals(self, other: object) -> bool:
equal_na = self.isna() & other.isna()
return bool((equal_values | equal_na).all())

def isin(self, values) -> np.ndarray:
    """
    Test each element of this array for membership in `values`.

    Roughly equivalent to `np.array([x in values for x in self])`

    Parameters
    ----------
    values : Sequence
        Collection of candidates to look each element up in.

    Returns
    -------
    np.ndarray[bool]
    """
    # Generic fallback: view this array as a plain ndarray and let the
    # module-level ``isin`` (imported from pandas.core.algorithms) do the
    # lookup.
    as_ndarray = np.asarray(self)
    return isin(as_ndarray, values)

def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
"""
Return an array and missing value suitable for factorization.
Expand Down
14 changes: 13 additions & 1 deletion pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,17 @@
)
from pandas.core.dtypes.missing import isna, notna

import pandas as pd
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you avoid this import

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Of course~

from pandas.core import nanops
from pandas.core.algorithms import factorize_array, take
from pandas.core.algorithms import factorize_array, isin, take
from pandas.core.array_algos import masked_reductions
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import ExtensionArray
from pandas.core.indexers import check_array_indexer

if TYPE_CHECKING:
from pandas import Series
from pandas.core.arrays import BooleanArray


BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray")
Expand Down Expand Up @@ -299,6 +301,16 @@ def take(

return type(self)(result, mask, copy=False)

def isin(self, values) -> "BooleanArray":
    """
    Test each element of this masked array for membership in `values`.

    Positions flagged in ``self._mask`` are reported as ``True`` only when
    ``pd.NA`` itself is one of `values`; otherwise they are ``False``.
    Unmasked positions get whatever the module-level ``isin`` reports for
    the corresponding entry of ``self._data``.

    Parameters
    ----------
    values : Sequence
        Collection of candidates to look each element up in.

    Returns
    -------
    BooleanArray
        Membership flags; the mask of the returned array is all ``False``.
    """
    # Imported locally: at module level BooleanArray is only imported under
    # TYPE_CHECKING.
    from pandas.core.arrays import BooleanArray

    result = isin(self._data, values)

    # Entries of ``_data`` behind the mask must not leak into the answer.
    # The element-wise scan of ``values`` for NA is only needed when at
    # least one position is actually masked.
    if self._mask.any():
        if any(x is pd.NA for x in values):
            result |= self._mask
        else:
            result &= ~self._mask

    # Take the shape from ``_data`` instead of passing ``self`` to
    # ``np.zeros_like``, so NumPy never has to treat this extension array
    # as an array-like just to learn its shape.
    mask = np.zeros(self._data.shape, dtype=bool)
    return BooleanArray(result, mask, copy=False)

def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
data, mask = self._data, self._mask
data = data.copy()
Expand Down