Skip to content

ENH: CoW Replace #50618

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5530,7 +5530,7 @@ def _replace_columnwise(
DataFrame or None
"""
# Operate column-wise
res = self if inplace else self.copy()
res = self if inplace else self.copy(deep=None)
ax = self.columns

for i, ax_value in enumerate(ax):
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -7046,6 +7046,7 @@ def replace(
to_replace = [to_replace]

if isinstance(to_replace, (tuple, list)):
# TODO: Consider copy-on-write for non-replaced columns's here
if isinstance(self, ABCDataFrame):
from pandas import Series

Expand Down Expand Up @@ -7105,7 +7106,7 @@ def replace(
if not self.size:
if inplace:
return None
return self.copy()
return self.copy(deep=None)

if is_dict_like(to_replace):
if is_dict_like(value): # {'A' : NA} -> {'A' : 0}
Expand Down
77 changes: 75 additions & 2 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1220,8 +1220,81 @@ def value_getitem(placement):
if inplace and blk.should_store(value):
# Updating inplace -> check if we need to do Copy-on-Write
if using_copy_on_write() and not self._has_no_reference_block(blkno_l):
blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True)
self._clear_reference_block(blkno_l)
leftover_blocks = tuple(blk.delete(blk_locs))
vals = np.broadcast_to(
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If new_block_2d doesn't copy vals, will need to make a copy since output of np.broadcast_to isn't writable.

value_getitem(val_locs), blk.values[blk_locs].shape
)

# Fill before the first leftover block
nbs = []
leftover_start_i = leftover_blocks[0].mgr_locs.as_slice.start

if leftover_start_i != 0:
nbs.append(
new_block_2d(
vals[:leftover_start_i],
placement=BlockPlacement(slice(0, leftover_start_i)),
)
)

# Every hole in between leftover blocks is where we need to insert
# a new block
curr_idx = leftover_start_i
for i in range(len(leftover_blocks) - 1):
curr_block = leftover_blocks[i]
next_block = leftover_blocks[i + 1]

curr_end = curr_block.mgr_locs.as_slice.stop
next_start = next_block.mgr_locs.as_slice.start

num_to_fill = next_start - curr_end
nb = new_block_2d(
vals[curr_idx : curr_idx + num_to_fill],
placement=BlockPlacement(
slice(curr_end, curr_end + num_to_fill)
),
)
nbs.append(nb)
curr_idx += num_to_fill

# Fill after the last leftover block
last_del_loc = blk_locs[-1] + 1
last_leftover_loc = leftover_blocks[-1].mgr_locs.as_slice.stop
if last_del_loc > last_leftover_loc:
diff = last_del_loc - last_leftover_loc
nbs.append(
new_block_2d(
vals[curr_idx : curr_idx + diff],
placement=BlockPlacement(
slice(last_leftover_loc, last_del_loc)
),
)
)
# Add new block where old block was and remaining blocks at
# the end to avoid updating all block numbers
old_blocks = self.blocks
nbs = tuple(nbs)
nb = nbs[0]
new_blocks = (
old_blocks[:blkno_l]
+ (nb,)
+ old_blocks[blkno_l + 1 :]
+ leftover_blocks
+ tuple(nbs[1:])
)
if self.refs is not None:
prev_refs = self.refs[blkno_l]
extra_refs = [prev_refs] * len(leftover_blocks)
self.refs[blkno_l] = None
self.refs += extra_refs
if len(blk_locs) > 1:
# Add the refs for the other deleted blocks
self.refs += [None] * (len(blk_locs) - 1)
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
for i, nb in enumerate(leftover_blocks + nbs[1:]):
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
self._blknos[nb.mgr_locs.indexer] = i + len(old_blocks)
self.blocks = new_blocks
else:
blk.set_inplace(blk_locs, value_getitem(val_locs))
else:
Expand Down
45 changes: 45 additions & 0 deletions pandas/tests/copy_view/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array


Expand Down Expand Up @@ -93,3 +94,47 @@ def test_switch_options():
subset.iloc[0, 0] = 0
# df updated with CoW disabled
assert df.iloc[0, 0] == 0


@td.skip_array_manager_invalid_test
@pytest.mark.parametrize(
"locs, arr",
[
([0], np.array([-1, -2, -3])),
([1], np.array([-1, -2, -3])),
([5], np.array([-1, -2, -3])),
([0, 1], np.array([-1, -2, -3])),
([0, 2], np.array([-1, -2, -3])),
([0, 1, 2], np.array([-1, -2, -3])),
([1, 2], np.array([-1, -2, -3])),
([1, 3], np.array([-1, -2, -3])),
([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
],
)
def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr):
# Nothing currently calls iset with
# more than 1 loc with inplace=True (only happens with inplace=False)
# but ensure that it works
df = DataFrame(
{
"a": [1, 2, 3],
"b": [4, 5, 6],
"c": [7, 8, 9],
"d": [10, 11, 12],
"e": [13, 14, 15],
"f": ["foo", "bar", "baz"],
}
)
df_orig = df.copy()
df2 = df.copy(deep=None) # Trigger a CoW (if enabled, otherwise makes copy)
df2._mgr.iset(locs, arr, inplace=True)

tm.assert_frame_equal(df, df_orig)

if using_copy_on_write:
for i, col in enumerate(df.columns):
if i not in locs:
assert np.shares_memory(get_array(df, col), get_array(df2, col))
else:
for col in df.columns:
assert not np.shares_memory(get_array(df, col), get_array(df2, col))
34 changes: 34 additions & 0 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,40 @@ def test_squeeze(using_copy_on_write):
assert df.loc[0, "a"] == 0


@pytest.mark.parametrize(
"replace_kwargs",
[
{"to_replace": {"a": 1, "b": 4}, "value": -1},
# Test CoW splits blocks to avoid copying unchanged columns
{"to_replace": {"a": 1}, "value": -1},
{"to_replace": {"b": 4}, "value": -1},
{"to_replace": {"b": {4: 1}}},
# TODO: Add these in a further optimization
# We would need to see which columns got replaced in the mask
# which could be expensive
# {"to_replace": {"b": 1}},
# 1
],
)
def test_replace(using_copy_on_write, replace_kwargs):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
df_orig = df.copy()

df_replaced = df.replace(**replace_kwargs)

if using_copy_on_write:
if (df_replaced["b"] == df["b"]).all():
assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))

# mutating squeezed df triggers a copy-on-write for that column/block
df_replaced.loc[0, "c"] = -1
if using_copy_on_write:
assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "a"))

tm.assert_frame_equal(df, df_orig)


def test_putmask(using_copy_on_write):
df = DataFrame({"a": [1, 2], "b": 1, "c": 2})
view = df[:]
Expand Down