Skip to content

PERF: avoid object conversion in fillna(method=pad|backfill) for masked arrays #39953

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Mar 5, 2021
46 changes: 37 additions & 9 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
date_range,
isnull,
period_range,
timedelta_range,
)

from .pandas_vb_common import tm
Expand Down Expand Up @@ -355,15 +356,42 @@ def time_isnull_obj(self):

class Fillna:

params = ([True, False], ["pad", "bfill"])
param_names = ["inplace", "method"]

def setup(self, inplace, method):
values = np.random.randn(10000, 100)
values[::2] = np.nan
self.df = DataFrame(values)

def time_frame_fillna(self, inplace, method):
params = (
[True, False],
["pad", "bfill"],
[
"float64",
"float32",
"object",
"Int64",
"Float64",
"datetime64[ns]",
"datetime64[ns, tz]",
"timedelta64[ns]",
],
)
param_names = ["inplace", "method", "dtype"]

def setup(self, inplace, method, dtype):
N, M = 10000, 100
if dtype in ("datetime64[ns]", "datetime64[ns, tz]", "timedelta64[ns]"):
data = {
"datetime64[ns]": date_range("2011-01-01", freq="H", periods=N),
"datetime64[ns, tz]": date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
"timedelta64[ns]": timedelta_range(start="1 day", periods=N, freq="1D"),
}
self.df = DataFrame({f"col_{i}": data[dtype] for i in range(M)})
self.df[::2] = None
else:
values = np.random.randn(N, M)
values[::2] = np.nan
if dtype == "Int64":
values = values.round()
self.df = DataFrame(values, dtype=dtype)

def time_frame_fillna(self, inplace, method, dtype):
self.df.fillna(inplace=inplace, method=method)


Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,7 @@ Performance improvements
- Performance improvement in :meth:`IntervalIndex.isin` (:issue:`38353`)
- Performance improvement in :meth:`Series.mean` for nullable data types (:issue:`34814`)
- Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`)
- Performance improvement in :meth:`DataFrame.fillna` with ``method="pad|backfill"`` for nullable floating and nullable integer dtypes (:issue:`39953`)
- Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`)
- Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`)
- Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`)
Expand Down
12 changes: 10 additions & 2 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -597,10 +597,11 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):

@cython.boundscheck(False)
@cython.wraparound(False)
def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None):
def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None):
cdef:
Py_ssize_t i, N
algos_t val
uint8_t prev_mask
int lim, fill_count = 0

N = len(values)
Expand All @@ -612,15 +613,18 @@ def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None):
lim = validate_limit(N, limit)

val = values[0]
prev_mask = mask[0]
for i in range(N):
if mask[i]:
if fill_count >= lim:
continue
fill_count += 1
values[i] = val
mask[i] = prev_mask
else:
fill_count = 0
val = values[i]
prev_mask = mask[i]


@cython.boundscheck(False)
Expand Down Expand Up @@ -739,10 +743,11 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray:

@cython.boundscheck(False)
@cython.wraparound(False)
def backfill_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None):
def backfill_inplace(algos_t[:] values, uint8_t[:] mask, limit=None):
cdef:
Py_ssize_t i, N
algos_t val
uint8_t prev_mask
int lim, fill_count = 0

N = len(values)
Expand All @@ -754,15 +759,18 @@ def backfill_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None):
lim = validate_limit(N, limit)

val = values[N - 1]
prev_mask = mask[N - 1]
for i in range(N - 1, -1, -1):
if mask[i]:
if fill_count >= lim:
continue
fill_count += 1
values[i] = val
mask[i] = prev_mask
else:
fill_count = 0
val = values[i]
prev_mask = mask[i]


@cython.boundscheck(False)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ def fillna(
if mask.any():
if method is not None:
func = missing.get_fill_func(method)
new_values = func(self._ndarray.copy(), limit=limit, mask=mask)
new_values, _ = func(self._ndarray.copy(), limit=limit, mask=mask)
# TODO: PandasArray didn't used to copy, need tests for this
new_values = self._from_backing_data(new_values)
else:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -702,7 +702,7 @@ def fillna(self, value=None, method=None, limit=None):
if mask.any():
if method is not None:
func = missing.get_fill_func(method)
new_values = func(self.astype(object), limit=limit, mask=mask)
new_values, _ = func(self.astype(object), limit=limit, mask=mask)
new_values = self._from_sequence(new_values, dtype=self.dtype)
else:
# fill with value
Expand Down
40 changes: 39 additions & 1 deletion pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
cache_readonly,
doc,
)
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import (
Expand All @@ -38,12 +39,16 @@
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.inference import is_array_like
from pandas.core.dtypes.missing import (
isna,
notna,
)

from pandas.core import nanops
from pandas.core import (
missing,
nanops,
)
from pandas.core.algorithms import (
factorize_array,
isin,
Expand Down Expand Up @@ -144,6 +149,39 @@ def __getitem__(

return type(self)(self._data[item], self._mask[item])

@doc(ExtensionArray.fillna)
def fillna(
self: BaseMaskedArrayT, value=None, method=None, limit=None
) -> BaseMaskedArrayT:
value, method = validate_fillna_kwargs(value, method)

mask = self._mask

if is_array_like(value):
if len(value) != len(self):
raise ValueError(
f"Length of 'value' does not match. Got ({len(value)}) "
f" expected {len(self)}"
)
value = value[mask]

if mask.any():
if method is not None:
func = missing.get_fill_func(method)
new_values, new_mask = func(
self._data.copy(),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ExtensionBlock.interpolate creates another copy L1774. if we always copy in EA.fillna then that copy shouldn't be needed.

limit=limit,
mask=mask.copy(),
)
return type(self)(new_values, new_mask.view(np.bool_))
else:
# fill with value
new_values = self.copy()
new_values[mask] = value
else:
new_values = self.copy()
return new_values

def _coerce_to_array(self, values) -> Tuple[np.ndarray, np.ndarray]:
raise AbstractMethodError(self)

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ def fillna(self, value=None, method=None, limit=None):
if mask.any():
if method is not None:
func = missing.get_fill_func(method)
new_values = func(self.to_numpy(object), limit=limit, mask=mask)
new_values, _ = func(self.to_numpy(object), limit=limit, mask=mask)
new_values = self._from_sequence(new_values)
else:
# fill with value
Expand Down
7 changes: 2 additions & 5 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1727,16 +1727,13 @@ def _slice(self, slicer):
def fillna(
self, value, limit=None, inplace: bool = False, downcast=None
) -> List[Block]:
values = self.values if inplace else self.values.copy()
values = values.fillna(value=value, limit=limit)
values = self.values.fillna(value=value, limit=limit)
return [self.make_block_same_class(values=values)]

def interpolate(
self, method="pad", axis=0, inplace=False, limit=None, fill_value=None, **kwargs
):

values = self.values if inplace else self.values.copy()
new_values = values.fillna(value=fill_value, method=method, limit=limit)
new_values = self.values.fillna(value=fill_value, method=method, limit=limit)
return self.make_block_same_class(new_values)

def diff(self, n: int, axis: int = 1) -> List[Block]:
Expand Down
28 changes: 18 additions & 10 deletions pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,9 +660,9 @@ def interpolate_2d(
method = clean_fill_method(method)
tvalues = transf(values)
if method == "pad":
result = _pad_2d(tvalues, limit=limit)
result, _ = _pad_2d(tvalues, limit=limit)
else:
result = _backfill_2d(tvalues, limit=limit)
result, _ = _backfill_2d(tvalues, limit=limit)

result = transf(result)
# reshape back
Expand Down Expand Up @@ -698,26 +698,34 @@ def new_func(values, limit=None, mask=None):
# This needs to occur before casting to int64
mask = isna(values)

result = func(values.view("i8"), limit=limit, mask=mask)
return result.view(values.dtype)
result, mask = func(values.view("i8"), limit=limit, mask=mask)
return result.view(values.dtype), mask

return func(values, limit=limit, mask=mask)

return cast(F, new_func)


@_datetimelike_compat
def _pad_1d(values, limit=None, mask=None):
def _pad_1d(
values: np.ndarray,
limit: int | None = None,
mask: np.ndarray | None = None,
) -> tuple[np.ndarray, np.ndarray]:
mask = _fillna_prep(values, mask)
algos.pad_inplace(values, mask, limit=limit)
return values
return values, mask


@_datetimelike_compat
def _backfill_1d(values, limit=None, mask=None):
def _backfill_1d(
values: np.ndarray,
limit: int | None = None,
mask: np.ndarray | None = None,
) -> tuple[np.ndarray, np.ndarray]:
mask = _fillna_prep(values, mask)
algos.backfill_inplace(values, mask, limit=limit)
return values
return values, mask


@_datetimelike_compat
Expand All @@ -729,7 +737,7 @@ def _pad_2d(values, limit=None, mask=None):
else:
# for test coverage
pass
return values
return values, mask


@_datetimelike_compat
Expand All @@ -741,7 +749,7 @@ def _backfill_2d(values, limit=None, mask=None):
else:
# for test coverage
pass
return values
return values, mask


_fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d}
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4529,7 +4529,7 @@ def _replace_single(self, to_replace, method: str, inplace: bool, limit):
fill_f = missing.get_fill_func(method)

mask = missing.mask_missing(result.values, to_replace)
values = fill_f(result.values, limit=limit, mask=mask)
values, _ = fill_f(result.values, limit=limit, mask=mask)

if values.dtype == orig_dtype and inplace:
return
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/extension/base/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,18 @@ def test_fillna_limit_backfill(self, data_missing):
expected = pd.Series(data_missing.take([1, 0, 1, 1, 1]))
self.assert_series_equal(result, expected)

def test_fillna_no_op_returns_copy(self, data):
data = data[~data.isna()]

valid = data[0]
result = data.fillna(valid)
assert result is not data
self.assert_extension_array_equal(result, data)

result = data.fillna(method="backfill")
assert result is not data
self.assert_extension_array_equal(result, data)

def test_fillna_series(self, data_missing):
fill_value = data_missing[1]
ser = pd.Series(data_missing)
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/extension/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@ def test_fillna_series_method(self):
def test_fillna_limit_backfill(self):
pass

@unsupported_fill
def test_fillna_no_op_returns_copy(self):
pass

@unsupported_fill
def test_fillna_series(self):
pass
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,11 @@ def test_fillna_scalar(self, data_missing):
# Non-scalar "scalar" values.
super().test_fillna_scalar(data_missing)

@skip_nested
def test_fillna_no_op_returns_copy(self, data):
# Non-scalar "scalar" values.
super().test_fillna_no_op_returns_copy(data)

@skip_nested
def test_fillna_series(self, data_missing):
# Non-scalar "scalar" values.
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,13 @@ def test_fillna_limit_backfill(self, data_missing):
with tm.assert_produces_warning(PerformanceWarning):
super().test_fillna_limit_backfill(data_missing)

def test_fillna_no_op_returns_copy(self, data, request):
if np.isnan(data.fill_value):
request.node.add_marker(
pytest.mark.xfail(reason="returns array with different fill value")
)
super().test_fillna_no_op_returns_copy(data)

def test_fillna_series_method(self, data_missing):
with tm.assert_produces_warning(PerformanceWarning):
super().test_fillna_limit_backfill(data_missing)
Expand Down