Skip to content

Commit a1ce4fc

Browse files
committed
Make df[col] insert a new array, never overwrite
1 parent 06e2793 commit a1ce4fc

File tree

9 files changed: 35 additions and 97 deletions

9 files changed: 35 additions and 97 deletions

pandas/_testing.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,6 +1089,8 @@ def _get_base(obj):
10891089
raise AssertionError(f"{repr(left_base)} is {repr(right_base)}")
10901090

10911091
def _raise(left, right, err_msg):
1092+
__tracebackhide__ = True
1093+
10921094
if err_msg is None:
10931095
if left.shape != right.shape:
10941096
raise_assert_detail(

pandas/core/internals/blocks.py

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -634,20 +634,6 @@ def _can_hold_element(self, element: Any) -> bool:
634634
return issubclass(tipo.type, dtype)
635635
return isinstance(element, dtype)
636636

637-
def should_store(self, value: ArrayLike) -> bool:
638-
"""
639-
Should we set self.values[indexer] = value inplace or do we need to cast?
640-
641-
Parameters
642-
----------
643-
value : np.ndarray or ExtensionArray
644-
645-
Returns
646-
-------
647-
bool
648-
"""
649-
return is_dtype_equal(value.dtype, self.dtype)
650-
651637
def to_native_types(self, na_rep="nan", quoting=None, **kwargs):
652638
""" convert to our native types format """
653639
values = self.values
@@ -1581,12 +1567,6 @@ def iget(self, col):
15811567
raise IndexError(f"{self} only contains one item")
15821568
return self.values
15831569

1584-
def should_store(self, value: ArrayLike) -> bool:
1585-
"""
1586-
Can we set the given array-like value inplace?
1587-
"""
1588-
return isinstance(value, self._holder)
1589-
15901570
def set(self, locs, values):
15911571
assert locs.tolist() == [0]
15921572
self.values[:] = values
@@ -1976,9 +1956,6 @@ def _can_hold_element(self, element: Any) -> bool:
19761956
element, (float, int, complex, np.float_, np.int_)
19771957
) and not isinstance(element, (bool, np.bool_))
19781958

1979-
def should_store(self, value: ArrayLike) -> bool:
1980-
return issubclass(value.dtype.type, np.complexfloating)
1981-
19821959

19831960
class IntBlock(NumericBlock):
19841961
__slots__ = ()
@@ -2142,7 +2119,6 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock):
21422119
_can_hold_element = DatetimeBlock._can_hold_element
21432120
to_native_types = DatetimeBlock.to_native_types
21442121
fill_value = np.datetime64("NaT", "ns")
2145-
should_store = Block.should_store
21462122
array_values = ExtensionBlock.array_values
21472123

21482124
@property
@@ -2616,8 +2592,6 @@ class CategoricalBlock(ExtensionBlock):
26162592
_verify_integrity = True
26172593
_can_hold_na = True
26182594

2619-
should_store = Block.should_store
2620-
26212595
def __init__(self, values, placement, ndim=None):
26222596
# coerce to categorical if we can
26232597
values = extract_array(values)

pandas/core/internals/managers.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1062,18 +1062,16 @@ def value_getitem(placement):
10621062
for blkno, val_locs in libinternals.get_blkno_placements(blknos, group=True):
10631063
blk = self.blocks[blkno]
10641064
blk_locs = blklocs[val_locs.indexer]
1065-
if blk.should_store(value):
1066-
blk.set(blk_locs, value_getitem(val_locs))
1067-
else:
1068-
unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
1069-
unfit_val_locs.append(val_locs)
10701065

1071-
# If all block items are unfit, schedule the block for removal.
1072-
if len(val_locs) == len(blk.mgr_locs):
1073-
removed_blknos.append(blkno)
1074-
else:
1075-
blk.delete(blk_locs)
1076-
self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk))
1066+
unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
1067+
unfit_val_locs.append(val_locs)
1068+
1069+
# If all block items are unfit, schedule the block for removal.
1070+
if len(val_locs) == len(blk.mgr_locs):
1071+
removed_blknos.append(blkno)
1072+
else:
1073+
blk.delete(blk_locs)
1074+
self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk))
10771075

10781076
if len(removed_blknos):
10791077
# Remove blocks & update blknos accordingly

pandas/tests/frame/methods/test_rename.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def test_rename_multiindex(self):
140140

141141
def test_rename_nocopy(self, float_frame):
142142
renamed = float_frame.rename(columns={"C": "foo"}, copy=False)
143-
renamed["foo"] = 1.0
143+
renamed["foo"][:] = 1.0
144144
assert (float_frame["C"] == 1.0).all()
145145

146146
def test_rename_inplace(self, float_frame):

pandas/tests/frame/test_block_internals.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,7 @@ def test_copy_blocks(self, float_frame):
322322
blocks = df._to_dict_of_blocks(copy=True)
323323
for dtype, _df in blocks.items():
324324
if column in _df:
325-
_df.loc[:, column] = _df[column] + 1
325+
_df.loc[:, column].values[:] = _df[column] + 1
326326

327327
# make sure we did not change the original DataFrame
328328
assert not _df[column].equals(df[column])
@@ -334,12 +334,14 @@ def test_no_copy_blocks(self, float_frame):
334334

335335
# use the copy=False, change a column
336336
blocks = df._to_dict_of_blocks(copy=False)
337-
for dtype, _df in blocks.items():
337+
for _, _df in blocks.items():
338338
if column in _df:
339-
_df.loc[:, column] = _df[column] + 1
339+
_df.loc[:, column].values[:] = _df[column] + 1
340+
# FIXME: I think the need for .values here means we are doing something wrong
340341

341342
# make sure we did change the original DataFrame
342-
assert _df[column].equals(df[column])
343+
tm.assert_series_equal(df[column], _df[column])
344+
#assert _df[column].equals(df[column])
343345

344346
def test_copy(self, float_frame, float_string_frame):
345347
cop = float_frame.copy()

pandas/tests/groupby/test_apply.py

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -966,27 +966,6 @@ def test_apply_function_index_return(function):
966966
tm.assert_series_equal(result, expected)
967967

968968

969-
def test_apply_function_with_indexing():
970-
# GH: 33058
971-
df = pd.DataFrame(
972-
{"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]}
973-
)
974-
975-
def fn(x):
976-
x.col2[x.index[-1]] = 0
977-
return x.col2
978-
979-
result = df.groupby(["col1"], as_index=False).apply(fn)
980-
expected = pd.Series(
981-
[1, 2, 0, 4, 5, 0],
982-
index=pd.MultiIndex.from_tuples(
983-
[(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)]
984-
),
985-
name="col2",
986-
)
987-
tm.assert_series_equal(result, expected)
988-
989-
990969
def test_apply_function_with_indexing_return_column():
991970
# GH: 7002
992971
df = DataFrame(

pandas/tests/indexing/test_loc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -877,7 +877,7 @@ def test_identity_slice_returns_new_object(self):
877877
assert original_df[:] is not original_df
878878

879879
# should be a shallow copy
880-
original_df["a"] = [4, 4, 4]
880+
original_df["a"][:] = [4, 4, 4]
881881
assert (sliced_df["a"] == 4).all()
882882

883883
# These should not return copies

pandas/tests/internals/test_internals.py

Lines changed: 15 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -642,33 +642,31 @@ def test_get_numeric_data(self):
642642
item_shape=(3,),
643643
)
644644
mgr.iset(5, np.array([1, 2, 3], dtype=np.object_))
645-
646645
numeric = mgr.get_numeric_data()
647646
tm.assert_index_equal(
648647
numeric.items, pd.Index(["int", "float", "complex", "bool"])
649648
)
649+
mgr_idx = mgr.items.get_loc("float")
650+
num_idx = numeric.items.get_loc("float")
651+
650652
tm.assert_almost_equal(
651-
mgr.iget(mgr.items.get_loc("float")).internal_values(),
652-
numeric.iget(numeric.items.get_loc("float")).internal_values(),
653+
mgr.iget(mgr_idx).internal_values(),
654+
numeric.iget(num_idx).internal_values(),
653655
)
654656

655657
# Check sharing
656-
numeric.iset(numeric.items.get_loc("float"), np.array([100.0, 200.0, 300.0]))
658+
numeric.iget(num_idx).internal_values()[:] = [100.0, 200.0, 300.0]
657659
tm.assert_almost_equal(
658-
mgr.iget(mgr.items.get_loc("float")).internal_values(),
659-
np.array([100.0, 200.0, 300.0]),
660+
mgr.iget(mgr_idx).internal_values(), np.array([100.0, 200.0, 300.0]),
660661
)
661662

662663
numeric2 = mgr.get_numeric_data(copy=True)
663664
tm.assert_index_equal(
664665
numeric.items, pd.Index(["int", "float", "complex", "bool"])
665666
)
666-
numeric2.iset(
667-
numeric2.items.get_loc("float"), np.array([1000.0, 2000.0, 3000.0])
668-
)
667+
numeric2.iget(num_idx).internal_values()[:] = [1000.0, 2000.0, 3000.0]
669668
tm.assert_almost_equal(
670-
mgr.iget(mgr.items.get_loc("float")).internal_values(),
671-
np.array([100.0, 200.0, 300.0]),
669+
mgr.iget(mgr_idx).internal_values(), np.array([100.0, 200.0, 300.0]),
672670
)
673671

674672
def test_get_bool_data(self):
@@ -686,18 +684,20 @@ def test_get_bool_data(self):
686684
bools.iget(bools.items.get_loc("bool")).internal_values(),
687685
)
688686

687+
# GH#33457 setting a new array on bools does _not_ alter the original
688+
# array in-place, so mgr is unchanged
689689
bools.iset(0, np.array([True, False, True]))
690690
tm.assert_numpy_array_equal(
691691
mgr.iget(mgr.items.get_loc("bool")).internal_values(),
692-
np.array([True, False, True]),
692+
np.array([True, True, True]),
693693
)
694694

695-
# Check sharing
695+
# Check non-sharing
696696
bools2 = mgr.get_bool_data(copy=True)
697-
bools2.iset(0, np.array([False, True, False]))
697+
bools2.blocks[0].values[:] = [[False, True, False]]
698698
tm.assert_numpy_array_equal(
699699
mgr.iget(mgr.items.get_loc("bool")).internal_values(),
700-
np.array([True, False, True]),
700+
np.array([True, True, True]),
701701
)
702702

703703
def test_unicode_repr_doesnt_raise(self):
@@ -1181,23 +1181,6 @@ def test_binop_other(self, op, value, dtype):
11811181
tm.assert_series_equal(result, expected)
11821182

11831183

1184-
class TestShouldStore:
1185-
def test_should_store_categorical(self):
1186-
cat = pd.Categorical(["A", "B", "C"])
1187-
df = pd.DataFrame(cat)
1188-
blk = df._mgr.blocks[0]
1189-
1190-
# matching dtype
1191-
assert blk.should_store(cat)
1192-
assert blk.should_store(cat[:-1])
1193-
1194-
# different dtype
1195-
assert not blk.should_store(cat.as_ordered())
1196-
1197-
# ndarray instead of Categorical
1198-
assert not blk.should_store(np.asarray(cat))
1199-
1200-
12011184
@pytest.mark.parametrize(
12021185
"typestr, holder",
12031186
[

pandas/tests/reshape/merge/test_merge.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ def test_merge_nocopy(self):
284284

285285
merged = merge(left, right, left_index=True, right_index=True, copy=False)
286286

287-
merged["a"] = 6
287+
merged.loc[:, "a"] = 6
288288
assert (left["a"] == 6).all()
289289

290290
merged["d"] = "peekaboo"

0 commit comments

Comments
 (0)