Skip to content

Commit bd00055

Browse files
lukemanley authored and mroeschke committed
PERF: skip libjoin fastpath for MultiIndex (pandas-dev#54765)
* PERF: skip libjoin fastpath for MultiIndex
* fix levels sort
1 parent 54da73d commit bd00055

File tree

1 file changed

+15
-27
lines changed

1 file changed

+15
-27
lines changed

pandas/core/indexes/base.py

Lines changed: 15 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@
124124
from pandas.core.dtypes.generic import (
125125
ABCDataFrame,
126126
ABCDatetimeIndex,
127+
ABCIntervalIndex,
127128
ABCMultiIndex,
128129
ABCPeriodIndex,
129130
ABCSeries,
@@ -3491,8 +3492,6 @@ def _intersection(self, other: Index, sort: bool = False):
34913492
and other.is_monotonic_increasing
34923493
and self._can_use_libjoin
34933494
and other._can_use_libjoin
3494-
and not isinstance(self, ABCMultiIndex)
3495-
and not isinstance(other, ABCMultiIndex)
34963495
):
34973496
try:
34983497
res_indexer, indexer, _ = self._inner_indexer(other)
@@ -4631,28 +4630,13 @@ def join(
46314630

46324631
_validate_join_method(how)
46334632

4634-
if not self.is_unique and not other.is_unique:
4635-
return self._join_non_unique(other, how=how, sort=sort)
4636-
elif not self.is_unique or not other.is_unique:
4637-
if self.is_monotonic_increasing and other.is_monotonic_increasing:
4638-
# Note: 2023-08-15 we *do* have tests that get here with
4639-
# Categorical, string[python] (can use libjoin)
4640-
# and Interval (cannot)
4641-
if self._can_use_libjoin and other._can_use_libjoin:
4642-
# otherwise we will fall through to _join_via_get_indexer
4643-
# GH#39133
4644-
# go through object dtype for ea till engine is supported properly
4645-
return self._join_monotonic(other, how=how)
4646-
else:
4647-
return self._join_non_unique(other, how=how, sort=sort)
4648-
elif (
4649-
# GH48504: exclude MultiIndex to avoid going through MultiIndex._values
4650-
self.is_monotonic_increasing
4633+
if (
4634+
not isinstance(self.dtype, CategoricalDtype)
4635+
and self.is_monotonic_increasing
46514636
and other.is_monotonic_increasing
46524637
and self._can_use_libjoin
46534638
and other._can_use_libjoin
4654-
and not isinstance(self, ABCMultiIndex)
4655-
and not isinstance(self.dtype, CategoricalDtype)
4639+
and (self.is_unique or other.is_unique)
46564640
):
46574641
# Categorical is monotonic if data are ordered as categories, but join can
46584642
# not handle this in case of not lexicographically monotonic GH#38502
@@ -4661,6 +4645,8 @@ def join(
46614645
except TypeError:
46624646
# object dtype; non-comparable objects
46634647
pass
4648+
elif not self.is_unique or not other.is_unique:
4649+
return self._join_non_unique(other, how=how, sort=sort)
46644650

46654651
return self._join_via_get_indexer(other, how, sort)
46664652

@@ -4796,6 +4782,9 @@ def _join_non_unique(
47964782
join_idx = self.take(left_idx)
47974783
right = other.take(right_idx)
47984784
join_index = join_idx.putmask(mask, right)
4785+
if isinstance(join_index, ABCMultiIndex) and how == "outer":
4786+
# test_join_index_levels
4787+
join_index = join_index._sort_levels_monotonic()
47994788
return join_index, left_idx, right_idx
48004789

48014790
@final
@@ -5041,10 +5030,10 @@ def _can_use_libjoin(self) -> bool:
50415030
or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray))
50425031
or self.dtype == "string[python]"
50435032
)
5044-
# For IntervalIndex, the conversion to numpy converts
5045-
# to object dtype, which negates the performance benefit of libjoin
5046-
# TODO: exclude RangeIndex and MultiIndex as these also make copies?
5047-
return not isinstance(self.dtype, IntervalDtype)
5033+
# Exclude index types where the conversion to numpy converts to object dtype,
5034+
# which negates the performance benefit of libjoin
5035+
# TODO: exclude RangeIndex? Seems to break test_concat_datetime_timezone
5036+
return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex))
50485037

50495038
# --------------------------------------------------------------------
50505039
# Uncategorized Methods
@@ -5179,8 +5168,7 @@ def _get_join_target(self) -> np.ndarray:
51795168
# present
51805169
return self._values.to_numpy()
51815170

5182-
# TODO: exclude ABCRangeIndex, ABCMultiIndex cases here as those create
5183-
# copies.
5171+
# TODO: exclude ABCRangeIndex case here as it copies
51845172
target = self._get_engine_target()
51855173
if not isinstance(target, np.ndarray):
51865174
raise ValueError("_can_use_libjoin should return False.")

0 commit comments

Comments
 (0)