Skip to content

PERF: Only clear cached .levels when setting MultiIndex.names #59578

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,7 @@ Performance improvements
- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`)
- :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`)
- Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)
- Performance improvement in :class:`MultiIndex`: setting :attr:`MultiIndex.names` no longer invalidates all cached operations (:issue:`59578`)
- Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`)
- Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
- Performance improvement in :meth:`DataFrame.join` with ``how="left"`` or ``how="right"`` and ``sort=True`` (:issue:`56919`)
Expand Down
24 changes: 11 additions & 13 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,7 +799,7 @@ def dtypes(self) -> Series:
"""
from pandas import Series

names = com.fill_missing_names([level.name for level in self.levels])
names = com.fill_missing_names(self.names)
return Series([level.dtype for level in self.levels], index=Index(names))

def __len__(self) -> int:
Expand Down Expand Up @@ -1572,7 +1572,7 @@ def _format_multi(
def _get_names(self) -> FrozenList:
    """
    Return the names held in ``self._names`` wrapped in a new ``FrozenList``.
    """
    return FrozenList(self._names)

def _set_names(self, names, *, level=None, validate: bool = True) -> None:
def _set_names(self, names, *, level=None) -> None:
"""
Set new names on index. Each name has to be a hashable type.

Expand All @@ -1583,8 +1583,6 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
level : int, level name, or sequence of int/level names (default None)
If the index is a MultiIndex (hierarchical), level(s) to set (None
for all levels). Otherwise level must be None
validate : bool, default True
validate that the names match level lengths

Raises
------
Expand All @@ -1603,13 +1601,12 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
raise ValueError("Names should be list-like for a MultiIndex")
names = list(names)

if validate:
if level is not None and len(names) != len(level):
raise ValueError("Length of names must match length of level.")
if level is None and len(names) != self.nlevels:
raise ValueError(
"Length of names must match number of levels in MultiIndex."
)
if level is not None and len(names) != len(level):
raise ValueError("Length of names must match length of level.")
if level is None and len(names) != self.nlevels:
raise ValueError(
"Length of names must match number of levels in MultiIndex."
)

if level is None:
level = range(self.nlevels)
Expand All @@ -1627,8 +1624,9 @@ def _set_names(self, names, *, level=None, validate: bool = True) -> None:
)
self._names[lev] = name

# If .levels has been accessed, the names in our cache will be stale.
self._reset_cache()
# If .levels has been accessed, the .name of each level in our cache
# will be stale.
self._reset_cache("levels")

names = property(
fset=_set_names,
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/indexing/multiindex/test_chaining_and_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pandas import (
DataFrame,
MultiIndex,
RangeIndex,
Series,
)
import pandas._testing as tm
Expand Down Expand Up @@ -68,3 +69,19 @@ def test_indexer_caching(monkeypatch):
s[s == 0] = 1
expected = Series(np.ones(size_cutoff), index=index)
tm.assert_series_equal(s, expected)


def test_set_names_only_clears_level_cache():
    # GH 59578
    # Assigning to ``MultiIndex.names`` should drop only the cached ``levels``
    # entry and leave the other cached entries in place.
    mi = MultiIndex.from_arrays([range(4), range(4)], names=["a", "b"])

    # Access each attribute once so that it lands in ``mi._cache``; the
    # assertion below confirms that all four entries are present.
    mi.dtypes
    mi.is_monotonic_increasing
    mi._engine
    mi.levels
    old_cache_keys = sorted(mi._cache.keys())
    assert old_cache_keys == ["_engine", "dtypes", "is_monotonic_increasing", "levels"]

    # Renaming must evict "levels" only.
    mi.names = ["A", "B"]
    new_cache_keys = sorted(mi._cache.keys())
    assert new_cache_keys == ["_engine", "dtypes", "is_monotonic_increasing"]

    # Re-reading ``levels`` after the rename must reflect the new names.
    new_levels = mi.levels
    tm.assert_index_equal(new_levels[0], RangeIndex(4, name="A"))
    tm.assert_index_equal(new_levels[1], RangeIndex(4, name="B"))