pandas-dev · lithomas1 · Feb 26, 2023 · Jan 20, 2023 · Jan 20, 2023 · Jan 20, 2023
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -552,6 +552,7 @@ def replace(
         inplace: bool = False,
         # mask may be pre-computed if we're called from replace_list
         mask: npt.NDArray[np.bool_] | None = None,
+        using_cow: bool = False,
     ) -> list[Block]:
         """
         replace the to_replace value with value, possible to create new
@@ -566,7 +567,12 @@ def replace(
         if isinstance(values, Categorical):
             # TODO: avoid special-casing
             # GH49404
-            blk = self if inplace else self.copy()
+            if using_cow and (self.refs.has_reference() or not inplace):
+                blk = self.copy()
+            elif using_cow:
+                blk = self.copy(deep=False)
+            else:
+                blk = self if inplace else self.copy()
             values = cast(Categorical, blk.values)
             values._replace(to_replace=to_replace, value=value, inplace=True)
             return [blk]
@@ -576,22 +582,36 @@ def replace(
             #  replacing it is a no-op.
             # Note: If to_replace were a list, NDFrame.replace would call
             #  replace_list instead of replace.
-            return [self] if inplace else [self.copy()]
+            if using_cow:
+                return [self.copy(deep=False)]
+            else:
+                return [self] if inplace else [self.copy()]
 
         if mask is None:
             mask = missing.mask_missing(values, to_replace)
         if not mask.any():
             # Note: we get here with test_replace_extension_other incorrectly
             #  bc _can_hold_element is incorrect.
-            return [self] if inplace else [self.copy()]
+            if using_cow:
+                return [self.copy(deep=False)]
+            else:
+                return [self] if inplace else [self.copy()]
 
         elif self._can_hold_element(value):
-            blk = self if inplace else self.copy()
+            # TODO(CoW): Maybe split the block here as well, into the columns where
+            #  the mask has any True value and the remaining columns?
+            if using_cow:
+                if inplace:
+                    blk = self.copy(deep=self.refs.has_reference())
+                else:
+                    blk = self.copy()
+            else:
+                blk = self if inplace else self.copy()
             putmask_inplace(blk.values, mask, value)
             if not (self.is_object and value is None):
                 # if the user *explicitly* gave None, we keep None, otherwise
                 #  may downcast to NaN
-                blocks = blk.convert(copy=False)
+                blocks = blk.convert(copy=False, using_cow=using_cow)
             else:
                 blocks = [blk]
             return blocks
@@ -619,6 +639,7 @@ def replace(
                         value=value,
                         inplace=True,
                         mask=mask[i : i + 1],
+                        using_cow=using_cow,
                     )
                 )
             return blocks
@@ -797,7 +818,10 @@ def _replace_coerce(
                     return [nb]
                 return [self] if inplace else [self.copy()]
             return self.replace(
-                to_replace=to_replace, value=value, inplace=inplace, mask=mask
+                to_replace=to_replace,
+                value=value,
+                inplace=inplace,
+                mask=mask,
             )
 
     # ---------------------------------------------------------------------

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -459,7 +459,11 @@ def replace(self: T, to_replace, value, inplace: bool) -> T:
         assert not is_list_like(to_replace)
         assert not is_list_like(value)
         return self.apply(
-            "replace", to_replace=to_replace, value=value, inplace=inplace
+            "replace",
+            to_replace=to_replace,
+            value=value,
+            inplace=inplace,
+            using_cow=using_copy_on_write(),
         )
 
     def replace_regex(self, **kwargs):

diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
@@ -1210,44 +1210,6 @@ def test_items(using_copy_on_write):
                 assert df.loc[0, name] == 0
 
 
-@pytest.mark.parametrize(
-    "replace_kwargs",
-    [
-        {"to_replace": {"a": 1, "b": 4}, "value": -1},
-        # Test CoW splits blocks to avoid copying unchanged columns
-        {"to_replace": {"a": 1}, "value": -1},
-        {"to_replace": {"b": 4}, "value": -1},
-        {"to_replace": {"b": {4: 1}}},
-        # TODO: Add these in a further optimization
-        # We would need to see which columns got replaced in the mask
-        # which could be expensive
-        # {"to_replace": {"b": 1}},
-        # 1
-    ],
-)
-def test_replace(using_copy_on_write, replace_kwargs):
-    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
-    df_orig = df.copy()
-
-    df_replaced = df.replace(**replace_kwargs)
-
-    if using_copy_on_write:
-        if (df_replaced["b"] == df["b"]).all():
-            assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
-        assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
-
-    # mutating squeezed df triggers a copy-on-write for that column/block
-    df_replaced.loc[0, "c"] = -1
-    if using_copy_on_write:
-        assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
-
-    if "a" in replace_kwargs["to_replace"]:
-        arr = get_array(df_replaced, "a")
-        df_replaced.loc[0, "a"] = 100
-        assert np.shares_memory(get_array(df_replaced, "a"), arr)
-    tm.assert_frame_equal(df, df_orig)
-
-
 @pytest.mark.parametrize("dtype", ["int64", "Int64"])
 def test_putmask(using_copy_on_write, dtype):
     df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype)

diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py
@@ -9,34 +9,194 @@
 from pandas.tests.copy_view.util import get_array
 
 
-def test_replace_categorical_inplace_reference(using_copy_on_write):
-    df = DataFrame({"a": Categorical([1, 2, 3])})
+@pytest.mark.parametrize(
+    "replace_kwargs",
+    [
+        {"to_replace": {"a": 1, "b": 4}, "value": -1},
+        # Test CoW splits blocks to avoid copying unchanged columns
+        {"to_replace": {"a": 1}, "value": -1},
+        {"to_replace": {"b": 4}, "value": -1},
+        {"to_replace": {"b": {4: 1}}},
+        # TODO: Add these in a further optimization
+        # We would need to see which columns got replaced in the mask
+        # which could be expensive
+        # {"to_replace": {"b": 1}},
+        # 1
+    ],
+)
+def test_replace(using_copy_on_write, replace_kwargs):
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
+    df_orig = df.copy()
+
+    df_replaced = df.replace(**replace_kwargs)
+
+    if using_copy_on_write:
+        if (df_replaced["b"] == df["b"]).all():
+            assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
+        assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
+
+    # mutating the replaced df triggers a copy-on-write for that column/block
+    df_replaced.loc[0, "c"] = -1
+    if using_copy_on_write:
+        assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
+
+    if "a" in replace_kwargs["to_replace"]:
+        arr = get_array(df_replaced, "a")
+        df_replaced.loc[0, "a"] = 100
+        assert np.shares_memory(get_array(df_replaced, "a"), arr)
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_replace_mask_all_false_second_block(using_copy_on_write):
+    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2})
+    df_orig = df.copy()
+
+    df2 = df.replace(to_replace=1.5, value=55.5)
+
+    if using_copy_on_write:
+        # TODO: Block splitting would allow us to avoid copying b
+        assert np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
+        assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    else:
+        assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
+        assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    df2.loc[0, "c"] = 1
+    tm.assert_frame_equal(df, df_orig)  # Original is unchanged
+
+    if using_copy_on_write:
+        assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
+        # TODO: This should split and not copy the whole block
+        # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d"))
+
+
+def test_replace_coerce_single_column(using_copy_on_write, using_array_manager):
+    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
     df_orig = df.copy()
+
+    df2 = df.replace(to_replace=1.5, value="a")
+
+    if using_copy_on_write:
+        assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+        assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    elif not using_array_manager:
+        assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+        assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    if using_copy_on_write:
+        df2.loc[0, "b"] = 0.5
+        tm.assert_frame_equal(df, df_orig)  # Original is unchanged
+        assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+
+
+def test_replace_to_replace_wrong_dtype(using_copy_on_write):
+    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
+    df_orig = df.copy()
+
+    df2 = df.replace(to_replace="xxx", value=1.5)
+
+    if using_copy_on_write:
+        assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+        assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    else:
+        assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+        assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    df2.loc[0, "b"] = 0.5
+    tm.assert_frame_equal(df, df_orig)  # Original is unchanged
+
+    if using_copy_on_write:
+        assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+
+
+def test_replace_inplace(using_copy_on_write):
+    df = DataFrame({"a": [1.5, 2, 3]})
+    arr_a = get_array(df, "a")
+    df.replace(to_replace=1.5, value=15.5, inplace=True)
+
+    assert np.shares_memory(get_array(df, "a"), arr_a)
+    if using_copy_on_write:
+        assert df._mgr._has_no_reference(0)
+
+
+@pytest.mark.parametrize("to_replace", [1.5, [1.5]])
+def test_replace_inplace_reference(using_copy_on_write, to_replace):
+    df = DataFrame({"a": [1.5, 2, 3]})
     arr_a = get_array(df, "a")
     view = df[:]
-    df.replace(to_replace=[1], value=2, inplace=True)
+    df.replace(to_replace=to_replace, value=15.5, inplace=True)
 
     if using_copy_on_write:
-        assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
+        assert not np.shares_memory(get_array(df, "a"), arr_a)
         assert df._mgr._has_no_reference(0)
         assert view._mgr._has_no_reference(0)
-        tm.assert_frame_equal(view, df_orig)
     else:
-        assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
+        assert np.shares_memory(get_array(df, "a"), arr_a)
 
 
-def test_replace_inplace_reference(using_copy_on_write):
+@pytest.mark.parametrize("to_replace", ["a", 100.5])
+def test_replace_inplace_reference_no_op(using_copy_on_write, to_replace):
     df = DataFrame({"a": [1.5, 2, 3]})
     arr_a = get_array(df, "a")
     view = df[:]
-    df.replace(to_replace=[1.5], value=15.5, inplace=True)
+    df.replace(to_replace=to_replace, value=15.5, inplace=True)
 
+    assert np.shares_memory(get_array(df, "a"), arr_a)
     if using_copy_on_write:
-        assert not np.shares_memory(get_array(df, "a"), arr_a)
+        assert not df._mgr._has_no_reference(0)
+        assert not view._mgr._has_no_reference(0)
+
+
+@pytest.mark.parametrize("to_replace", [1, [1]])
+@pytest.mark.parametrize("val", [1, 1.5])
+def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_replace):
+    df = DataFrame({"a": Categorical([1, 2, 3])})
+    df_orig = df.copy()
+    arr_a = get_array(df, "a")
+    view = df[:]
+    df.replace(to_replace=to_replace, value=val, inplace=True)
+
+    if using_copy_on_write:
+        assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
         assert df._mgr._has_no_reference(0)
         assert view._mgr._has_no_reference(0)
+        tm.assert_frame_equal(view, df_orig)
     else:
-        assert np.shares_memory(get_array(df, "a"), arr_a)
+        assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
+
+
+@pytest.mark.parametrize("val", [1, 1.5])
+def test_replace_categorical_inplace(using_copy_on_write, val):
+    df = DataFrame({"a": Categorical([1, 2, 3])})
+    arr_a = get_array(df, "a")
+    df.replace(to_replace=1, value=val, inplace=True)
+
+    assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
+    if using_copy_on_write:
+        assert df._mgr._has_no_reference(0)
+
+    expected = DataFrame({"a": Categorical([val, 2, 3])})
+    tm.assert_frame_equal(df, expected)
+
+
+@pytest.mark.parametrize("val", [1, 1.5])
+def test_replace_categorical(using_copy_on_write, val):
+    df = DataFrame({"a": Categorical([1, 2, 3])})
+    df_orig = df.copy()
+    df2 = df.replace(to_replace=1, value=val)
+
+    if using_copy_on_write:
+        assert df._mgr._has_no_reference(0)
+        assert df2._mgr._has_no_reference(0)
+    assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes)
+    tm.assert_frame_equal(df, df_orig)
+
+    arr_a = get_array(df2, "a").codes
+    df2.iloc[0, 0] = 2.0
+    assert np.shares_memory(get_array(df2, "a").codes, arr_a)
 
 
 @pytest.mark.parametrize("method", ["where", "mask"])