Skip to content

PERF: avoid object conversion in fillna(method=pad|backfill) for masked arrays #39953

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Mar 5, 2021
Merged
24 changes: 18 additions & 6 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,15 +351,27 @@ def time_isnull_obj(self):

class Fillna:

params = ([True, False], ["pad", "bfill"])
param_names = ["inplace", "method"]

def setup(self, inplace, method):
params = (
[True, False],
["pad", "bfill"],
[
"float64",
"float32",
"object",
"Int64",
"Float64",
],
)
param_names = ["inplace", "method", "dtype"]

def setup(self, inplace, method, dtype):
values = np.random.randn(10000, 100)
values[::2] = np.nan
self.df = DataFrame(values)
if dtype == "Int64":
values = values.round()
self.df = DataFrame(values, dtype=dtype)

def time_frame_fillna(self, inplace, method):
def time_frame_fillna(self, inplace, method, dtype):
self.df.fillna(inplace=inplace, method=method)


Expand Down
12 changes: 10 additions & 2 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -597,10 +597,11 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):

@cython.boundscheck(False)
@cython.wraparound(False)
def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None):
def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None):
cdef:
Py_ssize_t i, N
algos_t val
uint8_t prev_mask
int lim, fill_count = 0

N = len(values)
Expand All @@ -612,15 +613,18 @@ def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None):
lim = validate_limit(N, limit)

val = values[0]
prev_mask = mask[0]
for i in range(N):
if mask[i]:
if fill_count >= lim:
continue
fill_count += 1
values[i] = val
mask[i] = prev_mask
else:
fill_count = 0
val = values[i]
prev_mask = mask[i]


@cython.boundscheck(False)
Expand Down Expand Up @@ -739,10 +743,11 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray:

@cython.boundscheck(False)
@cython.wraparound(False)
def backfill_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None):
def backfill_inplace(algos_t[:] values, uint8_t[:] mask, limit=None):
cdef:
Py_ssize_t i, N
algos_t val
uint8_t prev_mask
int lim, fill_count = 0

N = len(values)
Expand All @@ -754,15 +759,18 @@ def backfill_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None):
lim = validate_limit(N, limit)

val = values[N - 1]
prev_mask = mask[N - 1]
for i in range(N - 1, -1, -1):
if mask[i]:
if fill_count >= lim:
continue
fill_count += 1
values[i] = val
mask[i] = prev_mask
else:
fill_count = 0
val = values[i]
prev_mask = mask[i]


@cython.boundscheck(False)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def fillna(
if mask.any():
if method is not None:
func = missing.get_fill_func(method)
new_values = func(self._ndarray.copy(), limit=limit, mask=mask)
new_values, _ = func(self._ndarray.copy(), limit=limit, mask=mask)
# TODO: PandasArray didn't used to copy, need tests for this
new_values = self._from_backing_data(new_values)
else:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,7 @@ def fillna(self, value=None, method=None, limit=None):
if mask.any():
if method is not None:
func = get_fill_func(method)
new_values = func(self.astype(object), limit=limit, mask=mask)
new_values, _ = func(self.astype(object), limit=limit, mask=mask)
new_values = self._from_sequence(new_values, dtype=self.dtype)
else:
# fill with value
Expand Down
40 changes: 39 additions & 1 deletion pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
cache_readonly,
doc,
)
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import (
Expand All @@ -38,12 +39,16 @@
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.inference import is_array_like
from pandas.core.dtypes.missing import (
isna,
notna,
)

from pandas.core import nanops
from pandas.core import (
missing,
nanops,
)
from pandas.core.algorithms import (
factorize_array,
isin,
Expand Down Expand Up @@ -144,6 +149,39 @@ def __getitem__(

return type(self)(self._data[item], self._mask[item])

@doc(ExtensionArray.fillna)
def fillna(
self: BaseMaskedArrayT, value=None, method=None, limit=None
) -> BaseMaskedArrayT:
value, method = validate_fillna_kwargs(value, method)

mask = self._mask

if is_array_like(value):
if len(value) != len(self):
raise ValueError(
f"Length of 'value' does not match. Got ({len(value)}) "
f" expected {len(self)}"
)
value = value[mask]

if mask.any():
if method is not None:
func = missing.get_fill_func(method)
new_values, new_mask = func(
self._data.copy(),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ExtensionBlock.interpolate creates another copy L1774. if we always copy in EA.fillna then that copy shouldn't be needed.

limit=limit,
mask=mask.copy(),
)
return type(self)(new_values, new_mask.view(np.bool_))
else:
# fill with value
new_values = self.copy()
new_values[mask] = value
else:
new_values = self.copy()
return new_values

def _coerce_to_array(self, values) -> Tuple[np.ndarray, np.ndarray]:
raise AbstractMethodError(self)

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ def fillna(self, value=None, method=None, limit=None):
if mask.any():
if method is not None:
func = get_fill_func(method)
new_values = func(self.to_numpy(object), limit=limit, mask=mask)
new_values, _ = func(self.to_numpy(object), limit=limit, mask=mask)
new_values = self._from_sequence(new_values)
else:
# fill with value
Expand Down
16 changes: 12 additions & 4 deletions pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,16 +692,24 @@ def _fillna_prep(values, mask=None):
return values, mask


def _pad_1d(values, limit=None, mask=None):
def _pad_1d(
values: np.ndarray,
limit: int | None = None,
mask: np.ndarray | None = None,
) -> tuple[np.ndarray, np.ndarray]:
values, mask = _fillna_prep(values, mask)
algos.pad_inplace(values, mask, limit=limit)
return values
return values, mask


def _backfill_1d(values, limit=None, mask=None):
def _backfill_1d(
values: np.ndarray,
limit: int | None = None,
mask: np.ndarray | None = None,
) -> tuple[np.ndarray, np.ndarray]:
values, mask = _fillna_prep(values, mask)
algos.backfill_inplace(values, mask, limit=limit)
return values
return values, mask


def _pad_2d(values, limit=None, mask=None):
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4515,7 +4515,7 @@ def _replace_single(self, to_replace, method: str, inplace: bool, limit):
fill_f = missing.get_fill_func(method)

mask = missing.mask_missing(result.values, to_replace)
values = fill_f(result.values, limit=limit, mask=mask)
values, _ = fill_f(result.values, limit=limit, mask=mask)

if values.dtype == orig_dtype and inplace:
return
Expand Down