Open
Description
Is your feature request related to a problem?
I wish I could use pandas to do df.rolling(window_size).mode()
. Similar to #12430.
Describe the solution you'd like
All rolling methods in pandas seem to be implemented in cython. I implemented a working draft for rolling mode in cython (I'm new to cython).
Before continuing my implementation, I would like some feedback:
- What type should mode expect? I assume that float64 would be best: handles NaNs and non-numeric object could be categorized and converted to floats.
pd.DataFrame.mode
returns all possible modes. Is that something we would also expect from a rolling mode? It is probably much faster to return only one possible mode (the current draft could easily be extended for that: replaceoutput
with a cvector[clist[float]] and in the end convert it to a np.ndarray).- TODO: implement mode for '_variable' version. If I understand it correctly this is needed for
pd.DataFrame.expanding()
.
Additional context
# cython: language_level=3, boundscheck=False, wraparound=False, cdivision=True
# distutils: language = c++
from libcpp.vector cimport vector as cvector
from libcpp.unordered_map cimport unordered_map
from libcpp.list cimport list as clist
from libcpp.pair cimport pair as cpair
from libc.math cimport isnan
cimport cython
from cython.operator cimport dereference, postincrement, postdecrement
import numpy as np
cimport numpy as np
ctypedef np.float_t DTYPE_FLOAT_t
def rolling_mode_fixed(np.ndarray[DTYPE_FLOAT_t, ndim=1] values, const cython.int window_size, const cython.bint dropna):
# https://cs.stackexchange.com/a/96768
cdef:
cython.int N = values.size
double NaN = float("NaN")
np.ndarray[DTYPE_FLOAT_t, ndim=1] output = np.empty(N, dtype=np.float)
cython.int largest_frequency, nan_count, i, frequency = 0
double value, old_value
unordered_map[double, cython.int] get_frequency
cvector[clist[double]] get_objects = cvector[clist[double]](window_size-1)
cpair[double, cython.int] pair
with nogil:
# pre-allocate + 20% free space
get_frequency.reserve(<int>(1.2 * window_size))
# new items always have a frequency of 1
pair.second = 1
# set NaN and initialize data structures
for i in range(window_size):
output[i] = NaN
value = values[i]
if isnan(value):
# either skip NaNs or count them seperate
nan_count += 1
continue
# insert with frequency 1 or increment
pair.first = value
insert_pair = get_frequency.insert(pair)
if not insert_pair.second:
postincrement(dereference(insert_pair.first).second)
frequency = dereference(insert_pair.first).second
if frequency != 1:
# remove form current set
get_objects[frequency - 2].remove(value)
# add to next set
get_objects[frequency-1].push_back(value)
# update largest frequency
largest_frequency = max(frequency, largest_frequency)
# populate first valid output
if (dropna and largest_frequency != 0) or (largest_frequency >= nan_count):
output[window_size - 1] = get_objects[largest_frequency - 1].front()
# process
for i in range(window_size, N):
value = values[i]
old_value = values[i - window_size]
value_isnan = isnan(value)
old_value_isnan = isnan(old_value)
if (value == old_value) or (value_isnan and old_value_isnan):
# no change
output[i] = output[i-1]
continue
# decrement old value
if old_value_isnan:
nan_count -= 1
else:
# find the existing key and decrement it
find_pair = get_frequency.find(old_value)
frequency = dereference(find_pair).second
get_objects[frequency - 1].remove(old_value)
if frequency > 1:
postdecrement(dereference(find_pair).second)
get_objects[frequency - 2].push_back(old_value)
else:
get_frequency.erase(find_pair)
# increment new value
if value_isnan:
nan_count += 1
else:
# insert with frequency 1 or increment
pair.first = value
insert_pair = get_frequency.insert(pair)
if not insert_pair.second:
postincrement(dereference(insert_pair.first).second)
get_objects[dereference(insert_pair.first).second - 2].remove(value)
get_objects[dereference(insert_pair.first).second - 1].push_back(value)
# update largest set
if (largest_frequency != window_size) and not get_objects[
largest_frequency
].empty():
# increased
largest_frequency += 1
elif get_objects[largest_frequency-1].empty():
# decreased
largest_frequency -= 1
# write output
if (dropna and largest_frequency != 0) or (largest_frequency >= nan_count):
output[i] = get_objects[largest_frequency - 1].front()
else:
output[i] = NaN
return output