pandas-dev · jorisvandenbossche · Feb 9, 2023 · Jan 17, 2023 · Jan 17, 2023 · Jan 17, 2023
diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py
@@ -26,6 +26,7 @@
     is_dtype_equal,
     is_integer_dtype,
     is_object_dtype,
+    is_string_dtype,
     is_timedelta64_dtype,
     pandas_dtype,
 )
@@ -246,3 +247,45 @@ def astype_array_safe(
             raise
 
     return new_values
+
+
+def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool:
+    """Check whether astype from ``dtype`` to ``new_dtype`` may return a view.
+
+    The answer is intended for reference tracking, so it errs on the side of
+    "view": reporting a view for a cast that actually copied is harmless,
+    whereas reporting a copy for a cast that shares memory would hide the
+    sharing. Dtype pairs that cannot be classified are reported as views.
+
+    Parameters
+    ----------
+    dtype : Original dtype
+    new_dtype : target dtype
+
+    Returns
+    -------
+    True if the new data might be a view, False if a copy is known to be made
+    """
+    if dtype == new_dtype:
+        return True
+
+    if isinstance(dtype, np.dtype) and isinstance(new_dtype, np.dtype):
+        # Only equal numpy dtypes avoid a copy
+        return False
+
+    if is_string_dtype(dtype) and is_string_dtype(new_dtype):
+        # Potentially a view, e.g. when converting between object and string
+        return True
+
+    if is_object_dtype(dtype) and new_dtype.kind == "O":
+        # When the underlying array has dtype object, we don't have to make a copy
+        return True
+
+    if is_string_dtype(dtype) or is_string_dtype(new_dtype):
+        # Exactly one side is string-like: treated as a copy
+        return False
+
+    if dtype.kind in "mM" and new_dtype.kind in "mM":
+        return True
+
+    # Resolve the numpy dtype backing each side, if it is known.
+    if isinstance(dtype, np.dtype):
+        numpy_dtype = dtype
+    else:
+        numpy_dtype = getattr(dtype, "numpy_dtype", None)
+
+    if isinstance(new_dtype, np.dtype):
+        new_numpy_dtype = new_dtype
+    else:
+        new_numpy_dtype = getattr(new_dtype, "numpy_dtype", None)
+
+    if numpy_dtype is not None and new_numpy_dtype is not None:
+        # If the underlying numpy dtype is the same, no copy is made, e.g.
+        # int64 -> Int64 or int64[pyarrow]; int64 -> Int32 copies
+        return numpy_dtype == new_numpy_dtype
+
+    # Unknown combination: assume a view since we cannot be sure a copy was made
+    return True
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -6154,7 +6154,7 @@ def dtypes(self):
         return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_)
 
     def astype(
-        self: NDFrameT, dtype, copy: bool_t = True, errors: IgnoreRaise = "raise"
+        self: NDFrameT, dtype, copy: bool_t | None = None, errors: IgnoreRaise = "raise"
     ) -> NDFrameT:
         """
         Cast a pandas object to a specified dtype ``dtype``.
@@ -6302,7 +6302,7 @@ def astype(
             for i, (col_name, col) in enumerate(self.items()):
                 cdt = dtype_ser.iat[i]
                 if isna(cdt):
-                    res_col = col.copy() if copy else col
+                    res_col = col.copy(deep=copy)
                 else:
                     try:
                         res_col = col.astype(dtype=cdt, copy=copy, errors=errors)
@@ -6329,7 +6329,7 @@ def astype(
 
         # GH 33113: handle empty frame or series
         if not results:
-            return self.copy()
+            return self.copy(deep=None)
 
         # GH 19920: retain column metadata after concat
         result = concat(results, axis=1, copy=False)

diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
@@ -366,7 +366,10 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
             "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
         )
 
-    def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
+    def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
+        # An unspecified ``copy`` (None) is resolved to an actual copy here.
+        copy = True if copy is None else copy
+
         return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors)
 
     def convert(self: T, copy: bool) -> T:

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -11,9 +11,12 @@
     cast,
     final,
 )
+import weakref
 
 import numpy as np
 
+from pandas._config import using_copy_on_write
+
 from pandas._libs import (
     Timestamp,
     internals as libinternals,
@@ -38,7 +41,10 @@
 from pandas.util._decorators import cache_readonly
 from pandas.util._validators import validate_bool_kwarg
 
-from pandas.core.dtypes.astype import astype_array_safe
+from pandas.core.dtypes.astype import (
+    astype_array_safe,
+    astype_is_view,
+)
 from pandas.core.dtypes.cast import (
     LossySetitemError,
     can_hold_element,
@@ -152,6 +158,7 @@ class Block(PandasObject):
     is_extension = False
     _can_consolidate = True
     _validate_ndim = True
+    _ref = None
 
     @final
     @cache_readonly
@@ -496,6 +503,9 @@ def astype(
                 f"({self.dtype.name} [{self.shape}]) to different shape "
                 f"({newb.dtype.name} [{newb.shape}])"
             )
+        if using_copy_on_write():
+            if astype_is_view(values.dtype, new_values.dtype):
+                newb._ref = weakref.ref(self)
         return newb
 
     @final

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -435,8 +435,26 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
             "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
         )
 
-    def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
-        return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
+    def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
+        # Run the block-wise "astype" through the reference-aware helper, which
+        # also resolves ``copy=None``.
+        cast_kwargs = {"dtype": dtype, "errors": errors}
+        return self._apply_and_handle_refs("astype", copy=copy, **cast_kwargs)
+
+    def _apply_and_handle_refs(self, func: str, copy: bool | None, **kwargs):
+        """Apply ``func`` via ``self.apply`` and carry block references over.
+
+        ``copy=None`` resolves to ``False`` when ``using_copy_on_write()`` is
+        true and to ``True`` otherwise. When Copy-on-Write is on and no copy
+        was requested, the ``_ref`` of every result block is collected; if any
+        of them is set, the list is stored on ``result.refs`` and ``self`` is
+        recorded as ``result.parent``.
+        """
+        cow_enabled = using_copy_on_write()
+        if copy is None:
+            copy = not cow_enabled
+
+        result = self.apply(func, copy=copy, **kwargs)
+        if copy or not cow_enabled:
+            # Nothing to track: either a copy was requested or CoW is off.
+            return result
+
+        block_refs = [blk._ref for blk in result.blocks]
+        if not all(ref is None for ref in block_refs):
+            result.refs = block_refs
+            result.parent = self
+        return result
 
     def convert(self: T, copy: bool) -> T:
         return self.apply(

diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py
@@ -421,3 +421,14 @@ def test_array_to_numpy_na():
     result = arr.to_numpy(na_value=True, dtype=bool)
     expected = np.array([True, True])
     tm.assert_numpy_array_equal(result, expected)
+
+
+def test_array_copy_on_write(using_copy_on_write):
+    # Cast an object column to DecimalDtype, then mutate the parent: with
+    # Copy-on-Write enabled the casted frame must keep the original values.
+    values = [decimal.Decimal(2), decimal.Decimal(3)]
+    df = pd.DataFrame({"a": list(values)}, dtype="object")
+    df2 = df.astype(DecimalDtype())
+    df.iloc[0, 0] = 0
+    if not using_copy_on_write:
+        return
+    expected = pd.DataFrame({"a": list(values)}, dtype=DecimalDtype())
+    tm.assert_equal(df2.values, expected.values)
diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py
@@ -1,18 +1,20 @@
 import numpy as np
+import pytest
 
 from pandas import Series
 
 # -----------------------------------------------------------------------------
 # Copy/view behaviour for Series / DataFrame constructors
 
 
-def test_series_from_series(using_copy_on_write):
+@pytest.mark.parametrize("dtype", [None, "int64"])
+def test_series_from_series(dtype, using_copy_on_write):
     # Case: constructing a Series from another Series object follows CoW rules:
     # a new object is returned and thus mutations are not propagated
     ser = Series([1, 2, 3], name="name")
 
     # default is copy=False -> new Series is a shallow copy / view of original
-    result = Series(ser)
+    result = Series(ser, dtype=dtype)
 
     # the shallow copy still shares memory
     assert np.shares_memory(ser.values, result.values)
@@ -34,7 +36,7 @@ def test_series_from_series(using_copy_on_write):
         assert np.shares_memory(ser.values, result.values)
 
     # the same when modifying the parent
-    result = Series(ser)
+    result = Series(ser, dtype=dtype)
 
     if using_copy_on_write:
         # mutating original doesn't mutate new series

diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas.compat import pa_version_under6p0
+
 from pandas import (
     DataFrame,
     Index,
@@ -525,6 +527,138 @@ def test_to_frame(using_copy_on_write):
         tm.assert_frame_equal(df, expected)
 
 
+def test_astype_single_dtype(using_copy_on_write):
+    # Casting the whole frame to float64: column "c" is built from a float and
+    # is expected to be shared under Copy-on-Write, while the integer column
+    # "a" is always copied.
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5})
+    df_orig = df.copy()
+    df2 = df.astype("float64")
+
+    if using_copy_on_write:
+        assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+        assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    else:
+        assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+        assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    # mutating df2 triggers a copy-on-write for that column/block
+    df2.iloc[0, 2] = 5.5
+    if using_copy_on_write:
+        assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    tm.assert_frame_equal(df, df_orig)
+
+    # mutating parent also doesn't update result
+    df2 = df.astype("float64")
+    df.iloc[0, 2] = 5.5
+    tm.assert_frame_equal(df2, df_orig.astype("float64"))
+
+
+@pytest.mark.parametrize("dtype", ["int64", "Int64"])
+@pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"])
+def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype):
+    # Casts between 64-bit integer dtypes: the result is expected to share
+    # memory with the parent under Copy-on-Write and to be a copy otherwise.
+    needs_pyarrow = new_dtype == "int64[pyarrow]"
+    if needs_pyarrow and pa_version_under6p0:
+        pytest.skip("pyarrow not installed")
+    df = DataFrame({"a": [1, 2, 3]}, dtype=dtype)
+    df_orig = df.copy()
+    df2 = df.astype(new_dtype)
+
+    shares = np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert shares if using_copy_on_write else not shares
+
+    # mutating df2 triggers a copy-on-write for that column/block
+    df2.iloc[0, 0] = 10
+    if using_copy_on_write:
+        assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+    # mutating parent also doesn't update result
+    df2 = df.astype(new_dtype)
+    df.iloc[0, 0] = 100
+    expected = df_orig.astype(new_dtype)
+    tm.assert_frame_equal(df2, expected)
+
+
+@pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"])
+def test_astype_different_target_dtype(using_copy_on_write, dtype):
+    # Casting int64 data to a dtype with a different underlying representation
+    # must copy, so the result shares no memory and tracks no reference.
+    if dtype == "int32[pyarrow]" and pa_version_under6p0:
+        pytest.skip("pyarrow not installed")
+    df = DataFrame({"a": [1, 2, 3]})
+    df_orig = df.copy()
+    df2 = df.astype(dtype)
+
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert df2._mgr._has_no_reference(0)
+
+    df2.iloc[0, 0] = 5
+    tm.assert_frame_equal(df, df_orig)
+
+    # mutating parent also doesn't update result
+    df2 = df.astype(dtype)
+    df.iloc[0, 0] = 100
+    tm.assert_frame_equal(df2, df_orig.astype(dtype))
+
+
+@pytest.mark.parametrize(
+    "dtype, new_dtype", [("object", "string"), ("string", "object")]
+)
+def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype):
+    # object <-> string casts: memory is expected to be shared only under
+    # Copy-on-Write, and writing to the result must not change the parent.
+    df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
+    df_orig = df.copy()
+    df2 = df.astype(new_dtype)
+
+    shares = np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert shares if using_copy_on_write else not shares
+
+    df2.iloc[0, 0] = "x"
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize(
+    "dtype, new_dtype", [("object", "string"), ("string", "object")]
+)
+def test_astype_string_and_object_update_original(
+    using_copy_on_write, dtype, new_dtype
+):
+    # Mirror case of the object <-> string test: here the parent is mutated
+    # and the casted result must stay unchanged.
+    df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
+    df2 = df.astype(new_dtype)
+    df_orig = df2.copy()
+
+    shares = np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert shares if using_copy_on_write else not shares
+
+    df.iloc[0, 0] = "x"
+    tm.assert_frame_equal(df2, df_orig)
+
+
+def test_astype_dict_dtypes(using_copy_on_write):
+    # Per-column cast: "a" (integers) becomes float64, "c" is already float64
+    # and "b" is not part of the mapping.
+    df = DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")}
+    )
+    df_orig = df.copy()
+    df2 = df.astype({"a": "float64", "c": "float64"})
+
+    def shares(col):
+        # True when column ``col`` of result and parent use the same memory.
+        return np.shares_memory(get_array(df2, col), get_array(df, col))
+
+    if using_copy_on_write:
+        assert shares("c")
+        assert shares("b")
+        assert not shares("a")
+    else:
+        assert not shares("c")
+        assert not shares("b")
+        assert not shares("a")
+
+    # mutating df2 triggers a copy-on-write for that column/block
+    df2.iloc[0, 2] = 5.5
+    if using_copy_on_write:
+        assert not shares("c")
+
+    df2.iloc[0, 1] = 10
+    if using_copy_on_write:
+        assert not shares("b")
+    tm.assert_frame_equal(df, df_orig)
+
+
 @pytest.mark.parametrize("ax", ["index", "columns"])
 def test_swapaxes_noop(using_copy_on_write, ax):
     df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -867,13 +867,16 @@ def test_constructor_invalid_coerce_ints_with_float_nan(self, any_int_numpy_dtyp
         with pytest.raises(IntCastingNaNError, match=msg):
             Series(np.array(vals), dtype=any_int_numpy_dtype)
 
-    def test_constructor_dtype_no_cast(self):
+    def test_constructor_dtype_no_cast(self, using_copy_on_write):
         # see gh-1572
         s = Series([1, 2, 3])
         s2 = Series(s, dtype=np.int64)
 
         s2[1] = 5
-        assert s[1] == 5
+        # Under Copy-on-Write the write to ``s2`` must not reach ``s``;
+        # without it the write is visible through ``s`` as well.
+        expected = 2 if using_copy_on_write else 5
+        assert s[1] == expected
 
     def test_constructor_datelike_coercion(self):