Skip to content

ENH: Optimize replace to avoid copying when not necessary #50918

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Feb 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 30 additions & 6 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ def replace(
inplace: bool = False,
# mask may be pre-computed if we're called from replace_list
mask: npt.NDArray[np.bool_] | None = None,
using_cow: bool = False,
) -> list[Block]:
"""
replace the to_replace value with value, possible to create new
Expand All @@ -566,7 +567,12 @@ def replace(
if isinstance(values, Categorical):
# TODO: avoid special-casing
# GH49404
blk = self if inplace else self.copy()
if using_cow and (self.refs.has_reference() or not inplace):
blk = self.copy()
elif using_cow:
blk = self.copy(deep=False)
else:
blk = self if inplace else self.copy()
values = cast(Categorical, blk.values)
values._replace(to_replace=to_replace, value=value, inplace=True)
return [blk]
Expand All @@ -576,22 +582,36 @@ def replace(
# replacing it is a no-op.
# Note: If to_replace were a list, NDFrame.replace would call
# replace_list instead of replace.
return [self] if inplace else [self.copy()]
if using_cow:
return [self.copy(deep=False)]
else:
return [self] if inplace else [self.copy()]

if mask is None:
mask = missing.mask_missing(values, to_replace)
if not mask.any():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we pass an axis (I think axis=0 should work, though we need to be careful with 1-D values/mask), then we should have enough info to split the block.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, agreed, but right now this is not that high on my list of priorities. With the ref tracking as it is right now, this only defers the copy a little bit.

# Note: we get here with test_replace_extension_other incorrectly
# bc _can_hold_element is incorrect.
return [self] if inplace else [self.copy()]
if using_cow:
return [self.copy(deep=False)]
else:
return [self] if inplace else [self.copy()]

elif self._can_hold_element(value):
blk = self if inplace else self.copy()
# TODO(CoW): Maybe split here as well into columns where mask has True
# and rest?
if using_cow:
if inplace:
blk = self.copy(deep=self.refs.has_reference())
else:
blk = self.copy()
else:
blk = self if inplace else self.copy()
putmask_inplace(blk.values, mask, value)
if not (self.is_object and value is None):
# if the user *explicitly* gave None, we keep None, otherwise
# may downcast to NaN
blocks = blk.convert(copy=False)
blocks = blk.convert(copy=False, using_cow=using_cow)
else:
blocks = [blk]
return blocks
Expand Down Expand Up @@ -619,6 +639,7 @@ def replace(
value=value,
inplace=True,
mask=mask[i : i + 1],
using_cow=using_cow,
)
)
return blocks
Expand Down Expand Up @@ -797,7 +818,10 @@ def _replace_coerce(
return [nb]
return [self] if inplace else [self.copy()]
return self.replace(
to_replace=to_replace, value=value, inplace=inplace, mask=mask
to_replace=to_replace,
value=value,
inplace=inplace,
mask=mask,
)

# ---------------------------------------------------------------------
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,11 @@ def replace(self: T, to_replace, value, inplace: bool) -> T:
assert not is_list_like(to_replace)
assert not is_list_like(value)
return self.apply(
"replace", to_replace=to_replace, value=value, inplace=inplace
"replace",
to_replace=to_replace,
value=value,
inplace=inplace,
using_cow=using_copy_on_write(),
)

def replace_regex(self, **kwargs):
Expand Down
38 changes: 0 additions & 38 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -1210,44 +1210,6 @@ def test_items(using_copy_on_write):
assert df.loc[0, name] == 0


@pytest.mark.parametrize(
    "replace_kwargs",
    [
        {"to_replace": {"a": 1, "b": 4}, "value": -1},
        # Test CoW splits blocks to avoid copying unchanged columns
        {"to_replace": {"a": 1}, "value": -1},
        {"to_replace": {"b": 4}, "value": -1},
        {"to_replace": {"b": {4: 1}}},
        # TODO: Add these in a further optimization
        # We would need to see which columns got replaced in the mask
        # which could be expensive
        # {"to_replace": {"b": 1}},
        # 1
    ],
)
def test_replace(using_copy_on_write, replace_kwargs):
    # Column-wise replace: under CoW, columns that were not changed are
    # expected to share memory with the parent; the parent is never modified.
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
    parent_before = parent.copy()

    result = parent.replace(**replace_kwargs)

    def shares(col):
        # Re-fetch both arrays on every call so the current state is compared.
        return np.shares_memory(get_array(result, col), get_array(parent, col))

    if using_copy_on_write:
        b_untouched = (result["b"] == parent["b"]).all()
        if b_untouched:
            assert shares("b")
        assert shares("c")

    # Writing to the replaced frame triggers a copy-on-write for that
    # column/block
    result.loc[0, "c"] = -1
    if using_copy_on_write:
        assert not shares("c")

    if "a" in replace_kwargs["to_replace"]:
        # "a" was already replaced, so a further write must not copy it again.
        a_before_write = get_array(result, "a")
        result.loc[0, "a"] = 100
        assert np.shares_memory(get_array(result, "a"), a_before_write)
    tm.assert_frame_equal(parent, parent_before)


@pytest.mark.parametrize("dtype", ["int64", "Int64"])
def test_putmask(using_copy_on_write, dtype):
df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype)
Expand Down
180 changes: 170 additions & 10 deletions pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,194 @@
from pandas.tests.copy_view.util import get_array


def test_replace_categorical_inplace_reference(using_copy_on_write):
df = DataFrame({"a": Categorical([1, 2, 3])})
@pytest.mark.parametrize(
    "replace_kwargs",
    [
        {"to_replace": {"a": 1, "b": 4}, "value": -1},
        # Test CoW splits blocks to avoid copying unchanged columns
        {"to_replace": {"a": 1}, "value": -1},
        {"to_replace": {"b": 4}, "value": -1},
        {"to_replace": {"b": {4: 1}}},
        # TODO: Add these in a further optimization
        # We would need to see which columns got replaced in the mask
        # which could be expensive
        # {"to_replace": {"b": 1}},
        # 1
    ],
)
def test_replace(using_copy_on_write, replace_kwargs):
    # Column-wise replace: under CoW, columns that were not changed are
    # expected to share memory with the parent; the parent is never modified.
    parent = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
    parent_before = parent.copy()

    result = parent.replace(**replace_kwargs)

    def shares(col):
        # Re-fetch both arrays on every call so the current state is compared.
        return np.shares_memory(get_array(result, col), get_array(parent, col))

    if using_copy_on_write:
        b_untouched = (result["b"] == parent["b"]).all()
        if b_untouched:
            assert shares("b")
        assert shares("c")

    # Writing to the replaced frame triggers a copy-on-write for that
    # column/block
    result.loc[0, "c"] = -1
    if using_copy_on_write:
        assert not shares("c")

    if "a" in replace_kwargs["to_replace"]:
        # "a" was already replaced, so a further write must not copy it again.
        a_before_write = get_array(result, "a")
        result.loc[0, "a"] = 100
        assert np.shares_memory(get_array(result, "a"), a_before_write)
    tm.assert_frame_equal(parent, parent_before)


def test_replace_mask_all_false_second_block(using_copy_on_write):
    # 1.5 only occurs in column "a"; columns "c" and "d" contain no match.
    parent = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2})
    parent_before = parent.copy()

    result = parent.replace(to_replace=1.5, value=55.5)

    def shares(col):
        return np.shares_memory(get_array(parent, col), get_array(result, col))

    if using_copy_on_write:
        # TODO: Block splitting would allow us to avoid copying b
        assert shares("c")
        assert not shares("a")
    else:
        assert not shares("c")
        assert not shares("a")

    result.loc[0, "c"] = 1
    tm.assert_frame_equal(parent, parent_before)  # Original is unchanged

    if using_copy_on_write:
        assert not shares("c")
        # TODO: This should split and not copy the whole block
        # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d"))


def test_replace_coerce_single_column(using_copy_on_write, using_array_manager):
    # Replacing a float with a string: only column "a" contains the value.
    parent = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
    parent_before = parent.copy()

    result = parent.replace(to_replace=1.5, value="a")

    def shares(col):
        return np.shares_memory(get_array(parent, col), get_array(result, col))

    # The same expectations hold under CoW and, without CoW, whenever the
    # array manager is not in use.
    if using_copy_on_write or not using_array_manager:
        assert shares("b")
        assert not shares("a")

    if using_copy_on_write:
        result.loc[0, "b"] = 0.5
        tm.assert_frame_equal(parent, parent_before)  # Original is unchanged
        assert not shares("b")


def test_replace_to_replace_wrong_dtype(using_copy_on_write):
    # to_replace is a string while both columns are float.
    parent = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
    parent_before = parent.copy()

    result = parent.replace(to_replace="xxx", value=1.5)

    def shares(col):
        return np.shares_memory(get_array(parent, col), get_array(result, col))

    # Under CoW both columns are expected to share memory with the parent;
    # without CoW neither is.
    for col in ("b", "a"):
        if using_copy_on_write:
            assert shares(col)
        else:
            assert not shares(col)

    result.loc[0, "b"] = 0.5
    tm.assert_frame_equal(parent, parent_before)  # Original is unchanged

    if using_copy_on_write:
        assert not shares("b")


def test_replace_inplace(using_copy_on_write):
    # No other object references the data, so the in-place replace is
    # expected to write into the existing array.
    frame = DataFrame({"a": [1.5, 2, 3]})
    original_values = get_array(frame, "a")

    frame.replace(to_replace=1.5, value=15.5, inplace=True)

    current_values = get_array(frame, "a")
    assert np.shares_memory(current_values, original_values)
    if using_copy_on_write:
        assert frame._mgr._has_no_reference(0)


@pytest.mark.parametrize("to_replace", [1.5, [1.5]])
def test_replace_inplace_reference(using_copy_on_write, to_replace):
df = DataFrame({"a": [1.5, 2, 3]})
arr_a = get_array(df, "a")
view = df[:]
df.replace(to_replace=[1], value=2, inplace=True)
df.replace(to_replace=to_replace, value=15.5, inplace=True)

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
assert not np.shares_memory(get_array(df, "a"), arr_a)
assert df._mgr._has_no_reference(0)
assert view._mgr._has_no_reference(0)
tm.assert_frame_equal(view, df_orig)
else:
assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
assert np.shares_memory(get_array(df, "a"), arr_a)


def test_replace_inplace_reference(using_copy_on_write):
@pytest.mark.parametrize("to_replace", ["a", 100.5])
def test_replace_inplace_reference_no_op(using_copy_on_write, to_replace):
df = DataFrame({"a": [1.5, 2, 3]})
arr_a = get_array(df, "a")
view = df[:]
df.replace(to_replace=[1.5], value=15.5, inplace=True)
df.replace(to_replace=to_replace, value=15.5, inplace=True)

assert np.shares_memory(get_array(df, "a"), arr_a)
if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a"), arr_a)
assert not df._mgr._has_no_reference(0)
assert not view._mgr._has_no_reference(0)


@pytest.mark.parametrize("to_replace", [1, [1]])
@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_replace):
df = DataFrame({"a": Categorical([1, 2, 3])})
df_orig = df.copy()
arr_a = get_array(df, "a")
view = df[:]
df.replace(to_replace=to_replace, value=val, inplace=True)

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
assert df._mgr._has_no_reference(0)
assert view._mgr._has_no_reference(0)
tm.assert_frame_equal(view, df_orig)
else:
assert np.shares_memory(get_array(df, "a"), arr_a)
assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)


@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical_inplace(using_copy_on_write, val):
    # In-place replace on a categorical column with no other references:
    # the codes are expected to be reused rather than copied.
    frame = DataFrame({"a": Categorical([1, 2, 3])})
    original_cat = get_array(frame, "a")

    frame.replace(to_replace=1, value=val, inplace=True)

    current_codes = get_array(frame, "a").codes
    assert np.shares_memory(current_codes, original_cat.codes)
    if using_copy_on_write:
        assert frame._mgr._has_no_reference(0)

    expected = DataFrame({"a": Categorical([val, 2, 3])})
    tm.assert_frame_equal(frame, expected)


@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical(using_copy_on_write, val):
    # Non-inplace replace on a categorical column: the result is expected to
    # hold its own codes and the parent to stay unchanged.
    parent = DataFrame({"a": Categorical([1, 2, 3])})
    parent_before = parent.copy()

    result = parent.replace(to_replace=1, value=val)

    if using_copy_on_write:
        assert parent._mgr._has_no_reference(0)
        assert result._mgr._has_no_reference(0)
    parent_codes = get_array(parent, "a").codes
    result_codes = get_array(result, "a").codes
    assert not np.shares_memory(parent_codes, result_codes)
    tm.assert_frame_equal(parent, parent_before)

    # A later write to the result is expected to reuse the same codes buffer.
    codes_before_write = get_array(result, "a").codes
    result.iloc[0, 0] = 2.0
    assert np.shares_memory(get_array(result, "a").codes, codes_before_write)


@pytest.mark.parametrize("method", ["where", "mask"])
Expand Down