Skip to content

WIP: PERF: Cythonize fillna #42309

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 13 commits into from
19 changes: 17 additions & 2 deletions asv_bench/benchmarks/replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,25 @@ def setup(self, inplace):
rng = pd.date_range("1/1/2000", periods=N, freq="min")
data = np.random.randn(N)
data[::2] = np.nan
self.ts = pd.Series(data, index=rng)
self.series = pd.Series(data, index=rng)
self.ts = pd.Series(rng.to_series())
self.ts[::2] = np.datetime64("nat")
self.df = pd.DataFrame(np.random.randn(10 ** 3, 10 ** 3))

def time_fillna(self, inplace):
    """Time ``Series.fillna`` with a float fill value on the float series.

    Parameters
    ----------
    inplace : bool
        Forwarded to ``Series.fillna``.
    """
    # Only the float series is filled here: ``self.ts`` holds datetime data,
    # for which 0.0 is not a meaningful fill value.
    self.series.fillna(0.0, inplace=inplace)

def time_fillna_ts(self, inplace):
    """Time ``Series.fillna`` on the datetime series with a datetime fill value.

    Parameters
    ----------
    inplace : bool
        Forwarded to ``Series.fillna``.
    """
    fill_value = np.datetime64("2021")
    self.ts.fillna(fill_value, inplace=inplace)

def peakmem_fillna(self, inplace):
    """Run ``Series.fillna`` with a float fill value on the float series.

    Parameters
    ----------
    inplace : bool
        Forwarded to ``Series.fillna``.
    """
    float_series = self.series
    float_series.fillna(0.0, inplace=inplace)

def time_fillna_limit(self, inplace):
    """Time ``Series.fillna`` on the float series with a ``limit`` of 10 ** 5.

    Parameters
    ----------
    inplace : bool
        Forwarded to ``Series.fillna``.
    """
    max_fills = 10 ** 5
    self.series.fillna(0.0, inplace=inplace, limit=max_fills)

def time_fillna_df(self, inplace):
    """Time ``DataFrame.fillna`` with a float fill value.

    Parameters
    ----------
    inplace : bool
        Forwarded to ``DataFrame.fillna``.
    """
    frame = self.df
    frame.fillna(0.0, inplace=inplace)

def time_replace(self, inplace):
    """Time ``Series.replace`` of NaN by 0.0 on the float series.

    Parameters
    ----------
    inplace : bool
        Forwarded to ``Series.replace``.
    """
    # ``self.ts`` holds datetime data with NaT; the NaN-bearing float data
    # that this benchmark replaces lives in ``self.series``.
    self.series.replace(np.nan, 0.0, inplace=inplace)
Expand Down
35 changes: 35 additions & 0 deletions pandas/_libs/algos.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,41 @@ def backfill_2d_inplace(
mask: np.ndarray, # const uint8_t[:, :]
limit=None,
) -> None: ...

# ----------------------------------------------------------------------
# Fillna
# ----------------------------------------------------------------------
# ctypedef fused fillna_t:
#    float64_t
#    float32_t
#    object
#    int8_t  # Categorical
#    int16_t  # Categorical
#    int32_t  # Categorical
#    int64_t  # Categorical/Datetime64
#    uint16_t  # Float 16
#    complex64_t
#    complex128_t
# The per-position fill values taken by fillna1d_multi_values are typed
# with the algos_t fused type in algos.pyx.
# Fills the NA-like elements of a 1-D array with a single value, in place.
def fillna1d(
    arr: np.ndarray,  # fillna_t[:]
    value: object,  # fillna_t
    limit: int,
    inf_as_na: bool = False,
) -> None: ...
# Fills the NA-like elements of a 1-D array in place, taking the fill value
# for each position from the matching position of `value`.
def fillna1d_multi_values(
    arr: np.ndarray,  # fillna_t[:]
    value: np.ndarray,  # algos_t[:]
    limit: int,
    inf_as_na: bool = False,
) -> None: ...
# Fills the NA-like elements of a 2-D array with a single value, in place.
def fillna2d(
    arr: np.ndarray,  # fillna_t[:, :]
    value: object,  # fillna_t
    limit: int,
    inf_as_na: bool = False,
) -> None: ...
def is_monotonic(
arr: np.ndarray, # ndarray[algos_t, ndim=1]
timelike: bool,
Expand Down
199 changes: 199 additions & 0 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ from numpy cimport (
NPY_UINT16,
NPY_UINT32,
NPY_UINT64,
complex64_t,
complex128_t,
float32_t,
float64_t,
int8_t,
Expand All @@ -52,6 +54,10 @@ from pandas._libs.khash cimport (
kh_resize_int64,
khiter_t,
)
from pandas._libs.missing cimport (
checknull,
checknull_old,
)
from pandas._libs.util cimport (
get_nat,
numeric,
Expand All @@ -62,7 +68,13 @@ import pandas._libs.missing as missing
cdef:
    float64_t FP_ERR = 1e-13
    float64_t NaN = <float64_t>np.NaN
    # NumPy float16 data is handled as raw uint16 bit patterns, since most
    # C compilers have no native half-precision type.
    # uNaN is the uint16 bit pattern of np.float16(np.nan), for use with
    # float16 data in fillna.
    # NOTE(review): float16 NaN has more than one bit pattern (any all-ones
    # exponent with a non-zero mantissa); uNaN is just one of them.
    # https://numpy.org/doc/stable/reference/c-api/coremath.html#half-precision-functions
    uint16_t uNaN = np.float16(np.nan).view(np.uint16)
    # Value returned by get_nat(); int64 data is compared against it to
    # detect missing Datetime64/Timedelta64 entries.
    int64_t NPY_NAT = get_nat()
    # Positive and negative infinity, checked when inf_as_na is requested
    float64_t INF = <float64_t>np.inf
    float64_t NEGINF = -INF

cdef enum TiebreakEnumType:
TIEBREAK_AVERAGE
Expand Down Expand Up @@ -832,6 +844,193 @@ def backfill_2d_inplace(algos_t[:, :] values,
pad_2d_inplace(values[:, ::-1], mask[:, ::-1], limit)


# Fillna logic
# We have our own fused type instead of algos_t.
# Plain integer data cannot hold NAs, so the integer members below are
# listed only for the uses annotated next to them (raw integer storage of
# Categorical, Datetime64 and Float 16 data).
ctypedef fused fillna_t:
    float64_t
    float32_t
    object
    int8_t   # Categorical
    int16_t  # Categorical
    int32_t  # Categorical
    int64_t  # Categorical/Datetime64
    uint16_t  # Float 16
    complex64_t
    complex128_t


@cython.boundscheck(False)
@cython.wraparound(False)
def fillna1d(fillna_t[:] arr,
             fillna_t value,
             Py_ssize_t limit,
             bint inf_as_na=False
             ) -> None:
    """
    Fill NA-like elements of a 1-D array in place.

    Parameters
    ----------
    arr : ndarray
        1-D array, modified in place.
    value : scalar
        Replacement written over each NA-like element. It has the same
        type as the elements of ``arr``.
    limit : int
        Maximum number of elements to fill. Nothing is filled when
        ``limit <= 0``.
    inf_as_na : bool, default False
        Whether to also treat INF and NEGINF as NA.
    """
    cdef:
        Py_ssize_t i, n = arr.shape[0]
        Py_ssize_t count = 0
        fillna_t val
        bint is_na

    assert arr.ndim == 1, "'arr' must be 1-D."

    for i in range(n):
        if count >= limit:
            # Limit exhausted: no later element can be filled
            break
        val = arr[i]
        if fillna_t is object:
            if inf_as_na:
                is_na = checknull_old(val)
            else:
                is_na = checknull(val)
        elif fillna_t is int64_t:
            # Datetime64/Timedelta64
            is_na = val == NPY_NAT
        elif fillna_t is uint16_t:
            # Float 16, seen as its raw bits. An all-ones exponent (0x7C00)
            # means inf or NaN; a non-zero mantissa (0x03FF) means NaN.
            # Testing the bits catches every NaN encoding, not just one.
            if (val & 0x7C00) == 0x7C00:
                is_na = inf_as_na or (val & 0x03FF) != 0
            else:
                is_na = False
        else:
            # NaN is the only value that differs from itself
            is_na = val != val
            if inf_as_na:
                # Infinities count as NA in addition to NaN
                is_na = is_na or (val == INF or val == NEGINF)
        if is_na:
            arr[i] = value
            count += 1


@cython.boundscheck(False)
@cython.wraparound(False)
def fillna1d_multi_values(fillna_t[:] arr,
                          algos_t[:] value,
                          Py_ssize_t limit,
                          bint inf_as_na=False
                          ) -> None:
    """
    Fill NA-like elements of a 1-D array in place, position by position.

    Parameters
    ----------
    arr : ndarray
        1-D array, modified in place.
    value : ndarray
        Array with the same length as ``arr`` giving the fill value to use
        at each position. A value of np.nan means that the position should
        not be filled.
    limit : int
        Maximum number of elements to fill. Nothing is filled when
        ``limit <= 0``.
    inf_as_na : bool, default False
        Whether to also treat INF and NEGINF as NA.

    Raises
    ------
    ValueError
        If ``value`` does not have the same length as ``arr``.
    """
    cdef:
        Py_ssize_t i, n = arr.shape[0]
        Py_ssize_t count = 0
        fillna_t val
        algos_t fill_value
        bint is_na

    assert arr.ndim == 1, "'arr' must be 1-D."

    if value.shape[0] != n:
        # Bounds checking is disabled, so a shorter 'value' would be read
        # past its end
        raise ValueError("'value' must have the same length as 'arr'.")

    for i in range(n):
        if count >= limit:
            # Limit exhausted: no later element can be filled
            break
        fill_value = value[i]
        if algos_t is object or algos_t is float64_t or algos_t is float32_t:
            if fill_value != fill_value:
                # np.nan don't fill
                continue
        val = arr[i]
        if fillna_t is object:
            if inf_as_na:
                is_na = checknull_old(val)
            else:
                is_na = checknull(val)
        elif fillna_t is int64_t:
            # Datetime64/Timedelta64
            is_na = val == NPY_NAT
        elif fillna_t is uint16_t:
            # Float 16, seen as its raw bits. An all-ones exponent (0x7C00)
            # means inf or NaN; a non-zero mantissa (0x03FF) means NaN.
            # Testing the bits catches every NaN encoding, not just one.
            if (val & 0x7C00) == 0x7C00:
                is_na = inf_as_na or (val & 0x03FF) != 0
            else:
                is_na = False
        else:
            # NaN is the only value that differs from itself
            is_na = val != val
            if inf_as_na:
                # Infinities count as NA in addition to NaN
                is_na = is_na or (val == INF or val == NEGINF)
        if is_na:
            # The fill value's type may differ from the array's
            # (e.g. int64 -> float32), hence the explicit cast.
            # NOTE(review): the original comment relies on the caller
            # (BlockManager) having checked that the element can be held.
            arr[i] = <fillna_t>fill_value
            count += 1


@cython.boundscheck(False)
@cython.wraparound(False)
def fillna2d(fillna_t[:, :] arr,
             fillna_t value,
             Py_ssize_t limit,
             bint inf_as_na=False
             ) -> None:
    """
    Fill NA-like elements of a 2-D array in place.

    Parameters
    ----------
    arr : ndarray
        2-D array, modified in place.
    value : scalar
        Replacement written over each NA-like element. It has the same
        type as the elements of ``arr``.
    limit : int
        Maximum number of elements to fill in each row (each index along
        the first axis). Nothing is filled when ``limit <= 0``.
    inf_as_na : bool, default False
        Whether to also treat INF and NEGINF as NA.
    """
    cdef:
        Py_ssize_t i, j
        Py_ssize_t n = arr.shape[0]
        Py_ssize_t m = arr.shape[1]
        Py_ssize_t count
        fillna_t val
        bint is_na

    assert arr.ndim == 2, "'arr' must be 2-D."

    for i in range(n):
        count = 0  # Limit is per axis
        for j in range(m):
            if count >= limit:
                # Limit exhausted for this row: move on to the next one
                break
            val = arr[i, j]
            if fillna_t is object:
                if inf_as_na:
                    is_na = checknull_old(val)
                else:
                    is_na = checknull(val)
            elif fillna_t is int64_t:
                # Datetime64/Timedelta64
                is_na = val == NPY_NAT
            elif fillna_t is uint16_t:
                # Float 16, seen as its raw bits. An all-ones exponent
                # (0x7C00) means inf or NaN; a non-zero mantissa (0x03FF)
                # means NaN. Testing the bits catches every NaN encoding.
                if (val & 0x7C00) == 0x7C00:
                    is_na = inf_as_na or (val & 0x03FF) != 0
                else:
                    is_na = False
            else:
                # NaN is the only value that differs from itself
                is_na = val != val
                if inf_as_na:
                    # Infinities count as NA in addition to NaN
                    is_na = is_na or (val == INF or val == NEGINF)
            if is_na:
                arr[i, j] = value
                count += 1


@cython.boundscheck(False)
@cython.wraparound(False)
def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
Expand Down
Loading