PERF: Implement DataFrame-with-scalar ops block-wise #28583
----------------------------------------------------------------------
@@ -76,6 +76,17 @@
"""

def compat_2d(meth):
    def new_meth(self, *args, **kwargs):
        if self.ndim > 1:
            result = meth(self.ravel(), *args, **kwargs)
            return result.reshape(self.shape)
        return meth(self, *args, **kwargs)

    new_meth.__name__ = meth.__name__

Review comment: Use functools.wraps instead?
Reply: Sure.

    return new_meth
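As an aside on the functools.wraps suggestion: a minimal sketch of what the decorator could look like with it (illustrative only, not the committed code):

from functools import wraps

def compat_2d(meth):
    # Allow a 1D-only array method to be used on a 2D array by raveling
    # the values first and restoring the original shape afterwards.
    @wraps(meth)  # copies __name__, __doc__, etc. instead of doing it by hand
    def new_meth(self, *args, **kwargs):
        if self.ndim > 1:
            result = meth(self.ravel(), *args, **kwargs)
            return result.reshape(self.shape)
        return meth(self, *args, **kwargs)

    return new_meth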
def tz_to_dtype(tz):
    """
    Return a datetime64[ns] dtype appropriate for the given timezone.
@@ -361,7 +372,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False):
                "ndarray, or Series or Index containing one of those."
            )
            raise ValueError(msg.format(type(values).__name__))
-        if values.ndim != 1:
+        if values.ndim not in [1, 2]:
Review comment: Why was this changed? Seems to conflict with the error message directly below.
Reply: Applicable in a few places.
Reply: Yah, this is kludge-adjacent. We don't really support 2D, so we don't want to tell users it's OK.
            raise ValueError("Only 1-dimensional input arrays are supported.")

        if values.dtype == "i8":
@@ -818,13 +829,15 @@ def _sub_datetime_arraylike(self, other):
        new_values[arr_mask] = iNaT
        return new_values.view("timedelta64[ns]")

    @compat_2d
    def _add_offset(self, offset):
        assert not isinstance(offset, Tick)
        try:
            if self.tz is not None:
                values = self.tz_localize(None)
            else:
                values = self

            result = offset.apply_index(values)
            if self.tz is not None:
                result = result.tz_localize(self.tz)
----------------------------------------------------------------------
@@ -12,7 +12,11 @@
from pandas._libs import Timedelta, Timestamp, lib
from pandas.util._decorators import Appender

-from pandas.core.dtypes.common import is_list_like, is_timedelta64_dtype
+from pandas.core.dtypes.common import (
+    is_extension_array_dtype,
+    is_list_like,
+    is_timedelta64_dtype,
+)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCExtensionArray,
@@ -24,6 +28,7 @@
from pandas.core.construction import extract_array
from pandas.core.ops.array_ops import (
    arithmetic_op,
+    array_op,
    comparison_op,
    define_na_arithmetic_op,
    logical_op,
@@ -350,7 +355,7 @@ def fill_binop(left, right, fill_value):
# Dispatch logic


-def dispatch_to_series(left, right, func, str_rep=None, axis=None):
+def dispatch_to_series(left, right, func, str_rep=None, axis=None, eval_kwargs=None):
    """
    Evaluate the frame operation func(left, right) by evaluating
    column-by-column, dispatching to the Series implementation.
@@ -369,11 +374,57 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None):
    """
    # Note: we use iloc to access columns for compat with cases
    # with non-unique columns.
    eval_kwargs = eval_kwargs or {}

    import pandas.core.computation.expressions as expressions

    right = lib.item_from_zerodim(right)

    if lib.is_scalar(right) or np.ndim(right) == 0:

        new_blocks = []
Review comment: I would rather actually do this in the block manager, no? (maybe put in internals/ops.py) We should actually move all block type ops there (e.g. from groupby as well). OR isolate this code in ops somewhere (maybe internals.py), so things are separated.
Reply: Generally agree on getting Block/BlockManager-touching things isolated in internals. Will give some thought to how/when to move this. ATM this is in Proof of Concept phase while I figure out how to handle the remaining cases.
        mgr = left._data
        for blk in mgr.blocks:
            # Reshape for EA Block
            blk_vals = blk.values
Review comment: Should this be using to_numpy()?
Reply: We specifically want to get the .values attribute, which can be either an ndarray or EA (also I don't think Block has to_numpy).
            if hasattr(blk_vals, "reshape"):
                # ndarray, DTA/TDA/PA
                blk_vals = blk_vals.reshape(blk.shape)
                blk_vals = blk_vals.T

            new_vals = array_op(blk_vals, right, func, str_rep, eval_kwargs)

            # Reshape for EA Block
            if is_extension_array_dtype(new_vals.dtype):
Review comment: For my own understanding: the …?
Reply: Yep. Everything from 489-524 would boil down to something like:
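As a rough illustration (the reply's own snippet is not shown above; this assumes Block.apply(func, **kwargs) calls func on blk.values and wraps the result back into a Block), the reshaping logic below could collapse to roughly:

            # hypothetical sketch, not the reply's original code
            nb = blk.apply(
                array_op, right=right, op=func, str_rep=str_rep, eval_kwargs=eval_kwargs
            )
            new_blocks.append(nb)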
                from pandas.core.internals.blocks import make_block

                if hasattr(new_vals, "reshape"):
                    # ndarray, DTA/TDA/PA
                    new_vals = new_vals.reshape(blk.shape[::-1])
                    assert new_vals.shape[-1] == len(blk.mgr_locs)
                    for i in range(new_vals.shape[-1]):
                        nb = make_block(new_vals[..., i], placement=[blk.mgr_locs[i]])
                        new_blocks.append(nb)
                else:
                    # Categorical, IntegerArray
                    assert len(blk.mgr_locs) == 1
                    assert new_vals.shape == (blk.shape[-1],)
                    nb = make_block(new_vals, placement=blk.mgr_locs, ndim=2)
                    new_blocks.append(nb)
            elif blk.values.ndim == 1:

Review comment: I'm curious: what hits this case? An op on an EA that returns an ndarray? Say …?
Reply: Exactly.

                # need to bump up to 2D
                new_vals = new_vals.reshape(-1, 1)
                assert new_vals.T.shape == blk.shape
                nb = blk.make_block(new_vals.T)
                new_blocks.append(nb)
            else:
                assert new_vals.T.shape == blk.shape
                nb = blk.make_block(new_vals.T)
                new_blocks.append(nb)

        bm = type(mgr)(new_blocks, mgr.axes)
        return type(left)(bm)

    def column_op(a, b):
        return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}
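For context on what this branch changes in practice, a small usage sketch (illustrative; the frame and dtypes are made up, not taken from the PR):

import numpy as np
import pandas as pd

# 1,000 float64 columns built from one 2D ndarray consolidate into a single block
df = pd.DataFrame(np.random.randn(100, 1000))

# With this PR, a DataFrame-with-scalar op goes through the block-wise path in
# dispatch_to_series: one vectorized call per block instead of one call per column.
result = df * 2 + 1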
@@ -526,7 +577,7 @@ def wrapper(self, other):
        lvalues = extract_array(self, extract_numpy=True)
        rvalues = extract_array(other, extract_numpy=True)

-        res_values = comparison_op(lvalues, rvalues, op)
+        res_values = comparison_op(lvalues, rvalues, op, None, {})

Review comment: For readability, I'd prefer the last two arguments to be keyword arguments (it's relatively self-evident what the first three arguments are for, but that's IMO not the case for the last two). Same for calls to ….
Reply: Will do.

        return _construct_result(self, res_values, index=self.index, name=res_name)
@@ -552,7 +603,7 @@ def wrapper(self, other):
        lvalues = extract_array(self, extract_numpy=True)
        rvalues = extract_array(other, extract_numpy=True)

-        res_values = logical_op(lvalues, rvalues, op)
+        res_values = logical_op(lvalues, rvalues, op, None, {})
        return _construct_result(self, res_values, index=self.index, name=res_name)

    wrapper.__name__ = op_name
@@ -723,7 +774,9 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
        if fill_value is not None:
            self = self.fillna(fill_value)

-        new_data = dispatch_to_series(self, other, op)
+        new_data = dispatch_to_series(
+            self, other, op, str_rep=str_rep, eval_kwargs=eval_kwargs
+        )
        return self._construct_result(new_data)

    f.__name__ = op_name
@@ -749,7 +802,9 @@ def f(self, other, axis=default_axis, level=None):
        # Another DataFrame
        if not self._indexed_same(other):
            self, other = self.align(other, "outer", level=level, copy=False)
-        new_data = dispatch_to_series(self, other, op, str_rep)
+        new_data = dispatch_to_series(
+            self, other, op, str_rep=str_rep, eval_kwargs={}
+        )
        return self._construct_result(new_data)

    elif isinstance(other, ABCSeries):
@@ -781,7 +836,9 @@ def f(self, other):
            raise ValueError(
                "Can only compare identically-labeled DataFrame objects"
            )
-        new_data = dispatch_to_series(self, other, op, str_rep)
+        new_data = dispatch_to_series(
+            self, other, op, str_rep=str_rep, eval_kwargs={}
+        )
        return self._construct_result(new_data)

    elif isinstance(other, ABCSeries):
----------------------------------------------------------------------
@@ -56,7 +56,7 @@ def comp_method_OBJECT_ARRAY(op, x, y):

        result = libops.vec_compare(x, y, op)
    else:
-        result = libops.scalar_compare(x, y, op)
+        result = libops.scalar_compare(x.ravel(), y, op).reshape(x.shape)
    return result
@@ -156,12 +156,22 @@ def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs):
    return missing.dispatch_fill_zeros(op, left, right, result)


def array_op(left, right, op, str_rep, eval_kwargs):
    op_name = op.__name__.strip("_")
    if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}:
        return comparison_op(left, right, op, str_rep, eval_kwargs)
    elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}:
        return logical_op(left, right, op, str_rep, eval_kwargs)
    else:
        return arithmetic_op(left, right, op, str_rep, eval_kwargs)


def arithmetic_op(
    left: Union[np.ndarray, ABCExtensionArray],
    right: Any,
    op,
    str_rep: str,
-    eval_kwargs: Dict[str, str],
+    eval_kwargs: Dict[str, bool],
):
    """
    Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ...
@@ -218,7 +228,7 @@


def comparison_op(
-    left: Union[np.ndarray, ABCExtensionArray], right: Any, op
+    left: Union[np.ndarray, ABCExtensionArray], right: Any, op, str_rep, eval_kwargs
Review comment: Do str_rep and eval_kwargs actually need to be passed here?
Review comment: Also, if they need to stay, could you add type hints in order to ease understanding? I assume they have types ….
Reply: Will add types, double-check whether these can be removed.
) -> Union[np.ndarray, ABCExtensionArray]:
    """
    Evaluate a comparison operation `=`, `!=`, `>=`, `>`, `<=`, or `<`.

@@ -256,10 +266,11 @@

    elif is_scalar(rvalues) and isna(rvalues):
        # numpy does not like comparisons vs None
        # TODO: Should we be using invalid_comparison here?
        if op is operator.ne:
-            res_values = np.ones(len(lvalues), dtype=bool)
+            res_values = np.ones(lvalues.shape, dtype=bool)
        else:
-            res_values = np.zeros(len(lvalues), dtype=bool)
+            res_values = np.zeros(lvalues.shape, dtype=bool)

    elif is_object_dtype(lvalues.dtype):
        res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
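If the extra arguments do stay, a typed signature along the lines the reviewer asks for might look like this (a sketch with assumed Optional types, not the signature that was ultimately committed):

from typing import Any, Dict, Optional, Union

import numpy as np

from pandas.core.dtypes.generic import ABCExtensionArray

def comparison_op(
    left: Union[np.ndarray, ABCExtensionArray],
    right: Any,
    op,
    str_rep: Optional[str] = None,
    eval_kwargs: Optional[Dict[str, bool]] = None,
) -> Union[np.ndarray, ABCExtensionArray]:
    ...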
@@ -323,7 +334,7 @@ def na_logical_op(x: np.ndarray, y, op):


def logical_op(
-    left: Union[np.ndarray, ABCExtensionArray], right: Any, op
+    left: Union[np.ndarray, ABCExtensionArray], right: Any, op, str_rep, eval_kwargs
) -> Union[np.ndarray, ABCExtensionArray]:
    """
    Evaluate a logical operation `|`, `&`, or `^`.
----------------------------------------------------------------------
@@ -23,9 +23,8 @@ def test_from_sequence_invalid_type(self):
    def test_only_1dim_accepted(self):
        arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]")

-        with pytest.raises(ValueError, match="Only 1-dimensional"):
-            # 2-dim
-            DatetimeArray(arr.reshape(2, 2))
+        # 2-dim allowed for ops compat
+        DatetimeArray(arr.reshape(2, 2))
Review comment: Do we care to make any assertion about the …?
Reply: Sure, will update.

        with pytest.raises(ValueError, match="Only 1-dimensional"):
            # 0-dim
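One way the assertion the reviewer asks for could look (a sketch; the attribute checked here is an assumption, and the final commit may assert something different):

        # 2-dim allowed for ops compat; check we really got a 2D array back
        result = DatetimeArray(arr.reshape(2, 2))
        assert result._data.shape == (2, 2)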
Review comment: Presumably this is going to be applied to other methods (else you'd just do the reshaping in _add_offset)?
Reply: I implemented this at a stage when it was needed in several places (e.g. for getting __repr__ to work in debugging), then trimmed the usages back to the bare minimum. So we could get by without it now.
Review comment: Hmm, OK good to know. Let's hear what others prefer (my slight preference is to inline it in the one place it's used).
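For comparison, a rough sketch of the inlined alternative being discussed, with the ravel/reshape handled directly inside _add_offset rather than via the decorator (illustrative only, not code from this PR; the rest of the method is assumed unchanged):

    def _add_offset(self, offset):
        assert not isinstance(offset, Tick)

        if self.ndim > 1:
            # operate on the raveled 1D values, then restore the 2D shape
            return self.ravel()._add_offset(offset).reshape(self.shape)

        # ... existing 1D implementation unchanged ...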