Skip to content

CoW: Track references in unstack if there is no copy #57487

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 18 additions & 8 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
factorize,
unique,
)
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.frame import DataFrame
Expand Down Expand Up @@ -231,20 +232,31 @@ def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]:
return new_values, mask.any(0)
# TODO: in all tests we have mask.any(0).all(); can we rely on that?

def get_result(self, values, value_columns, fill_value) -> DataFrame:
def get_result(self, obj, value_columns, fill_value) -> DataFrame:
values = obj._values
if values.ndim == 1:
values = values[:, np.newaxis]

if value_columns is None and values.shape[1] != 1: # pragma: no cover
raise ValueError("must pass column labels for multi-column data")

values, _ = self.get_new_values(values, fill_value)
new_values, _ = self.get_new_values(values, fill_value)
columns = self.get_new_columns(value_columns)
index = self.new_index

return self.constructor(
values, index=index, columns=columns, dtype=values.dtype
result = self.constructor(
new_values, index=index, columns=columns, dtype=new_values.dtype, copy=False
)
if isinstance(values, np.ndarray):
base, new_base = values.base, new_values.base
elif isinstance(values, NDArrayBackedExtensionArray):
base, new_base = values._ndarray.base, new_values._ndarray.base
else:
base, new_base = 1, 2 # type: ignore[assignment]
Comment on lines +250 to +255
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we sure this base checking always works? (compared to doing something like np.shares_memory)
From some quick tests, it seems that even if you chain multiple no-copy transformations (like we do with reshape().swapaxes().reshape()), the base object stays the same.

If the base object is backed by something non-numpy (eg parrow), this doesn't seem to be the case, but, I think in a DataFrame we probably always end up with an array with a base object that is another numpy array.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we reshape into a 3dim array in between, so anything that is 1d copies there

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was trying to come up with a (contrived) example where the base might be different:

In [58]: parr = pa.array(range(100))

In [59]: arr = np.asarray(parr).base

In [60]: arr
Out[60]: 
array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
        48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
        64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
        80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
        96, 97, 98, 99]])

In [61]: arr.base is parr
Out[61]: True

In [62]: arr.reshape(10, 10).base is arr.base
Out[62]: False

So in the above after reshaping the base is not equal to the calling array's base.

However, when putting this array in a DataFrame and getting _values, it seems we still get the array wrapped in another array, so the direct base is again a numpy array:

In [63]: df = DataFrame(arr, copy=False)

In [64]: df._values
Out[64]: 
array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
        48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
        64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
        80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
        96, 97, 98, 99]])

In [65]: df._values.base
Out[65]: 
array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
        48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
        64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
        80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
        96, 97, 98, 99]])

In [66]: df._values.base.base
Out[66]: 
<pyarrow.lib.Int64Array object at 0x7f325ca29ae0>
[
  0,
  ...
  99
]

if base is new_base:
# We can only get here if one of the dimensions is size 1
result._mgr.add_references(obj._mgr)
return result

def get_new_values(self, values, fill_value=None):
if values.ndim == 1:
Expand Down Expand Up @@ -532,9 +544,7 @@ def unstack(
unstacker = _Unstacker(
obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort
)
return unstacker.get_result(
obj._values, value_columns=None, fill_value=fill_value
)
return unstacker.get_result(obj, value_columns=None, fill_value=fill_value)


def _unstack_frame(
Expand All @@ -550,7 +560,7 @@ def _unstack_frame(
return obj._constructor_from_mgr(mgr, axes=mgr.axes)
else:
return unstacker.get_result(
obj._values, value_columns=obj.columns, fill_value=fill_value
obj, value_columns=obj.columns, fill_value=fill_value
)


Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2701,3 +2701,16 @@ def test_pivot_table_with_margins_and_numeric_column_names(self):
index=Index(["a", "b", "All"], name=0),
)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("m", [1, 10])
def test_unstack_shares_memory(self, m):
# GH#56633
levels = np.arange(m)
index = MultiIndex.from_product([levels] * 2)
values = np.arange(m * m * 100).reshape(m * m, 100)
df = DataFrame(values, index, np.arange(100))
df_orig = df.copy()
result = df.unstack(sort=False)
assert np.shares_memory(df._values, result._values) is (m == 1)
result.iloc[0, 0] = -1
tm.assert_frame_equal(df, df_orig)