pandas-dev · jbrockmendel · Jul 26, 2020 · Jul 26, 2020 · Jul 27, 2020 · Jul 28, 2020
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -192,6 +192,90 @@ Other enhancements
 - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
 - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
 
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_120.notable_bug_fixes:
+
+Notable bug fixes
+~~~~~~~~~~~~~~~~~
+
+These are bug fixes that might have notable behavior changes.
+
+Assigning with ``DataFrame.__setitem__`` consistently creates a new array
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Assigning values with ``DataFrame.__setitem__`` now consistently assigns a new array, rather than mutating inplace (:issue:`33457`, :issue:`35271`, :issue:`35266`)
+
+Previously, ``DataFrame.__setitem__`` would sometimes operate inplace on the
+underlying array, and sometimes assign a new array. Fixing this inconsistency
+can have behavior-changing implications for workloads that relied on inplace
+mutation. The two most common cases are creating a ``DataFrame`` from an array
+and slicing a ``DataFrame``.
+
+*Previous Behavior*
+
+The array would be mutated inplace for some dtypes, like NumPy's ``int64`` dtype.
+
+.. code-block:: python
+
+   >>> import pandas as pd
+   >>> import numpy as np
+   >>> a = np.array([1, 2, 3])
+   >>> df = pd.DataFrame(a, columns=['a'])
+   >>> df['a'] = 0
+   >>> a  # mutated inplace
+   array([0, 0, 0])
+
+But not others, like :class:`Int64Dtype`.
+
+.. code-block:: python
+
+   >>> import pandas as pd
+   >>> import numpy as np
+   >>> a = pd.array([1, 2, 3], dtype="Int64")
+   >>> df = pd.DataFrame(a, columns=['a'])
+   >>> df['a'] = 0
+   >>> a  # not mutated
+   <IntegerArray>
+   [1, 2, 3]
+   Length: 3, dtype: Int64
+
+
+*New Behavior*
+
+In pandas 1.2.0, ``DataFrame.__setitem__`` consistently sets on a new array rather than
+mutating the existing array inplace.
+
+For NumPy's int64 dtype
+
+.. ipython:: python
+
+   import pandas as pd
+   import numpy as np
+   a = np.array([1, 2, 3])
+   df = pd.DataFrame(a, columns=['a'])
+   df['a'] = 0
+   a  # not mutated
+
+For :class:`Int64Dtype`.
+
+.. ipython:: python
+
+   import pandas as pd
+   import numpy as np
+   a = pd.array([1, 2, 3], dtype="Int64")
+   df = pd.DataFrame(a, columns=['a'])
+   df['a'] = 0
+   a  # not mutated
+
+This also affects cases where a second ``Series`` or ``DataFrame`` is a view on a first ``DataFrame``.
+
+.. code-block:: python
+
+   df = pd.DataFrame({"A": [1, 2, 3]})
+   df2 = df[['A']]
+   df['A'] = np.array([0, 0, 0])
+
+Previously, whether ``df2`` was mutated depended on the dtype of the array being assigned to. Now, a
+new array is consistently assigned, so ``df2`` is not mutated.
+
 .. _whatsnew_120.api_breaking.python:
 
 Increased minimum version for Python
@@ -389,6 +473,7 @@ Indexing
 ^^^^^^^^
 
 - Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__geitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`)
+- Bug in :meth:`DataFrame.iloc.__setitem__` creating a new array instead of overwriting ``Categorical`` values in-place (:issue:`35417`)
 - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`)
 - Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where int64 arrays are returned instead of intp. (:issue:`36359`)
 - Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. (:issue:`32334`)

diff --git a/pandas/_testing.py b/pandas/_testing.py
@@ -1096,6 +1096,8 @@ def _get_base(obj):
             raise AssertionError(f"{repr(left_base)} is {repr(right_base)}")
 
     def _raise(left, right, err_msg):
+        __tracebackhide__ = True
+
         if err_msg is None:
             if left.shape != right.shape:
                 raise_assert_detail(

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3114,18 +3114,18 @@ def _setitem_frame(self, key, value):
         self._check_setitem_copy()
         self._where(-key, value, inplace=True)
 
-    def _iset_item(self, loc: int, value):
+    def _iset_item(self, loc: int, value, inplace: bool = False):
         self._ensure_valid_index(value)
 
         # technically _sanitize_column expects a label, not a position,
         #  but the behavior is the same as long as we pass broadcast=False
         value = self._sanitize_column(loc, value, broadcast=False)
-        NDFrame._iset_item(self, loc, value)
+        NDFrame._iset_item(self, loc, value, inplace=inplace)
 
         # check if we are modifying a copy
         # try to set first as we want an invalid
         # value exception to occur first
-        if len(self):
+        if len(self):  # FIXME: this should depend on inplace, right?
             self._check_setitem_copy()
 
     def _set_item(self, key, value):

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3378,7 +3378,7 @@ def _maybe_cache_changed(self, item, value) -> None:
         The object has called back to us saying maybe it has changed.
         """
         loc = self._info_axis.get_loc(item)
-        self._mgr.iset(loc, value)
+        self._mgr.iset(loc, value, inplace=False)
 
     @property
     def _is_cached(self) -> bool_t:
@@ -3750,8 +3750,8 @@ def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries:
         result._set_is_copy(self, copy=is_copy)
         return result
 
-    def _iset_item(self, loc: int, value) -> None:
-        self._mgr.iset(loc, value)
+    def _iset_item(self, loc: int, value, inplace: bool_t = False) -> None:
+        self._mgr.iset(loc, value, inplace=inplace)
         self._clear_item_cache()
 
     def _set_item(self, key, value) -> None:

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -1789,7 +1789,7 @@ def _setitem_single_column(self, loc: int, value, plane_indexer):
             ser._maybe_update_cacher(clear=True)
 
         # reset the sliced object if unique
-        self.obj._iset_item(loc, ser)
+        self.obj._iset_item(loc, ser, inplace=True)
 
     def _setitem_single_block_inplace(self, indexer, value):
         """
@@ -1816,7 +1816,9 @@ def _setitem_single_block_inplace(self, indexer, value):
                 )
                 and item_labels.is_unique
             ):
-                self.obj[item_labels[indexer[info_axis]]] = value
+                col = item_labels[indexer[info_axis]]
+                loc = item_labels.get_loc(col)
+                self.obj._iset_item(loc, value, inplace=True)
                 return
 
             indexer = maybe_convert_ix(*indexer)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -1679,7 +1679,7 @@ def iget(self, col):
 
     def set(self, locs, values):
         assert locs.tolist() == [0]
-        self.values = values
+        self.values[:] = values
 
     def putmask(
         self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1030,7 +1030,7 @@ def idelete(self, indexer):
         )
         self._rebuild_blknos_and_blklocs()
 
-    def iset(self, loc: Union[int, slice, np.ndarray], value):
+    def iset(self, loc: Union[int, slice, np.ndarray], value, inplace: bool = False):
         """
         Set new item in-place. Does not consolidate. Adds new Block if not
         contained in the current set of items
@@ -1082,7 +1082,7 @@ def value_getitem(placement):
         for blkno, val_locs in libinternals.get_blkno_placements(blknos, group=True):
             blk = self.blocks[blkno]
             blk_locs = blklocs[val_locs.indexer]
-            if blk.should_store(value):
+            if inplace and blk.should_store(value):
                 blk.set(blk_locs, value_getitem(val_locs))
             else:
                 unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])

diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
@@ -881,7 +881,7 @@ def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame):
 
         msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
         with pytest.raises(com.SettingWithCopyError, match=msg):
-            sliced["C"] = 4.0
+            sliced.loc[:, "C"] = 4.0
 
         assert (float_frame["C"] == 4).all()
 
@@ -1590,7 +1590,7 @@ def test_iloc_row(self):
         # setting it makes it raise/warn
         msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
         with pytest.raises(com.SettingWithCopyError, match=msg):
-            result[2] = 0.0
+            result.loc[:, 2] = 0.0
 
         exp_col = df[2].copy()
         exp_col[4:8] = 0.0
@@ -1622,7 +1622,7 @@ def test_iloc_col(self):
         # and that we are setting a copy
         msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
         with pytest.raises(com.SettingWithCopyError, match=msg):
-            result[8] = 0.0
+            result.loc[:, 8] = 0.0
 
         assert (df[8] == 0).all()
 

diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py
@@ -140,7 +140,7 @@ def test_rename_multiindex(self):
 
     def test_rename_nocopy(self, float_frame):
         renamed = float_frame.rename(columns={"C": "foo"}, copy=False)
-        renamed["foo"] = 1.0
+        renamed["foo"][:] = 1.0
         assert (float_frame["C"] == 1.0).all()
 
     def test_rename_inplace(self, float_frame):

diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py
@@ -322,7 +322,7 @@ def test_copy_blocks(self, float_frame):
         blocks = df._to_dict_of_blocks(copy=True)
         for dtype, _df in blocks.items():
             if column in _df:
-                _df.loc[:, column] = _df[column] + 1
+                _df.loc[:, column].values[:] = _df[column] + 1
 
         # make sure we did not change the original DataFrame
         assert not _df[column].equals(df[column])
@@ -334,12 +334,14 @@ def test_no_copy_blocks(self, float_frame):
 
         # use the copy=False, change a column
         blocks = df._to_dict_of_blocks(copy=False)
-        for dtype, _df in blocks.items():
+        for _, _df in blocks.items():
             if column in _df:
-                _df.loc[:, column] = _df[column] + 1
+                _df.loc[:, column].values[:] = _df[column] + 1
+                # FIXME: I think the need for .values here means we are
+                #  doing something wrong
 
         # make sure we did change the original DataFrame
-        assert _df[column].equals(df[column])
+        tm.assert_series_equal(df[column], _df[column])
 
     def test_copy(self, float_frame, float_string_frame):
         cop = float_frame.copy()

diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -985,27 +985,6 @@ def test_apply_function_index_return(function):
     tm.assert_series_equal(result, expected)
 
 
-def test_apply_function_with_indexing():
-    # GH: 33058
-    df = pd.DataFrame(
-        {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]}
-    )
-
-    def fn(x):
-        x.col2[x.index[-1]] = 0
-        return x.col2
-
-    result = df.groupby(["col1"], as_index=False).apply(fn)
-    expected = pd.Series(
-        [1, 2, 0, 4, 5, 0],
-        index=pd.MultiIndex.from_tuples(
-            [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)]
-        ),
-        name="col2",
-    )
-    tm.assert_series_equal(result, expected)
-
-
 def test_apply_function_with_indexing_return_column():
     # GH: 7002
     df = DataFrame(

diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
@@ -684,7 +684,7 @@ def test_identity_slice_returns_new_object(self):
         assert sliced_df is not original_df
 
         # should be a shallow copy
-        original_df["a"] = [4, 4, 4]
+        original_df.loc[:, "a"] = [4, 4, 4]
         assert (sliced_df["a"] == 4).all()
 
         original_series = Series([1, 2, 3, 4, 5, 6])
@@ -708,8 +708,8 @@ def test_series_indexing_zerodim_np_array(self):
         result = s.iloc[np.array(0)]
         assert result == 1
 
-    @pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/33457")
     def test_iloc_setitem_categorical_updates_inplace(self):
+        # GH#35417
         # Mixed dtype ensures we go through take_split_path in setitem_with_indexer
         cat = pd.Categorical(["A", "B", "C"])
         df = pd.DataFrame({1: cat, 2: [1, 2, 3]})

diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
@@ -871,7 +871,7 @@ def test_identity_slice_returns_new_object(self):
         assert original_df[:] is not original_df
 
         # should be a shallow copy
-        original_df["a"] = [4, 4, 4]
+        original_df["a"][:] = [4, 4, 4]
         assert (sliced_df["a"] == 4).all()
 
         # These should not return copies