Skip to content

Backport PR #55518 on branch 2.1.x (CoW: Use exponential backoff when clearing dead references) #55625

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Fixed regressions
- Fixed regression in :meth:`DataFrame.join` where result has missing values and dtype is arrow backed string (:issue:`55348`)
- Fixed regression in :meth:`DataFrame.resample` which was extrapolating back to ``origin`` when ``origin`` was outside its bounds (:issue:`55064`)
- Fixed regression in :meth:`DataFrame.sort_index` which was not sorting correctly when the index was a sliced :class:`MultiIndex` (:issue:`55379`)
- Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`)

.. ---------------------------------------------------------------------------
.. _whatsnew_212.bug_fixes:
Expand Down
24 changes: 18 additions & 6 deletions pandas/_libs/internals.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -944,17 +944,29 @@ cdef class BlockValuesRefs:
"""
cdef:
public list referenced_blocks
public int clear_counter

def __cinit__(self, blk: SharedBlock | None = None) -> None:
    # Track the initial block (if any) through a weakref so we can count
    # how many blocks still share these values without keeping them alive.
    if blk is not None:
        self.referenced_blocks = [weakref.ref(blk)]
    else:
        self.referenced_blocks = []
    # Sweep threshold for _clear_dead_references; start reasonably high so
    # that cheap insertions don't each trigger a linear sweep (exponential
    # backoff adjusts this as the reference list grows/shrinks, GH#55245).
    self.clear_counter = 500

def _clear_dead_references(self, force=False) -> None:
    # Exponential backoff: skip the (linear) sweep of dead weakrefs unless
    # forced or the list has outgrown the current threshold. Sweeping on
    # every insertion causes slowdowns when all referencing objects stay
    # alive, e.g. df.items() for wide DataFrames (GH#55245, GH#55008).
    if not force and len(self.referenced_blocks) <= self.clear_counter:
        return
    alive = []
    for ref in self.referenced_blocks:
        if ref() is not None:
            alive.append(ref)
    self.referenced_blocks = alive
    remaining = len(alive)
    # Shrink the threshold (never below 500) when most references turned
    # out to be dead; grow it when the survivors alone already exceed it.
    if remaining < self.clear_counter // 2:
        self.clear_counter = max(self.clear_counter // 2, 500)
    elif remaining > self.clear_counter:
        self.clear_counter = max(self.clear_counter * 2, remaining)

def add_reference(self, blk: SharedBlock) -> None:
"""Adds a new reference to our reference collection.
Expand Down Expand Up @@ -988,6 +1000,6 @@ cdef class BlockValuesRefs:
-------
bool
"""
self._clear_dead_references()
self._clear_dead_references(force=True)
# Checking for more references than block pointing to itself
return len(self.referenced_blocks) > 1
30 changes: 30 additions & 0 deletions pandas/tests/copy_view/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,33 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype):
else:
for col in df.columns:
assert not np.shares_memory(get_array(df, col), get_array(df2, col))


def test_exponential_backoff():
# GH#55518
df = DataFrame({"a": [1, 2, 3]})
for i in range(490):
df.copy(deep=False)

assert len(df._mgr.blocks[0].refs.referenced_blocks) == 491

df = DataFrame({"a": [1, 2, 3]})
dfs = [df.copy(deep=False) for i in range(510)]

for i in range(20):
df.copy(deep=False)
assert len(df._mgr.blocks[0].refs.referenced_blocks) == 531
assert df._mgr.blocks[0].refs.clear_counter == 1000

for i in range(500):
df.copy(deep=False)

# Don't reduce since we still have over 500 objects alive
assert df._mgr.blocks[0].refs.clear_counter == 1000

dfs = dfs[:300]
for i in range(500):
df.copy(deep=False)

# Reduce since there are less than 500 objects alive
assert df._mgr.blocks[0].refs.clear_counter == 500