Skip to content

ENH: Rolling mode #36861

Open
Open
@twoertwein

Description

@twoertwein

Is your feature request related to a problem?

I wish I could use pandas to do df.rolling(window_size).mode(). Similar to #12430.

Describe the solution you'd like

All rolling methods in pandas seem to be implemented in cython. I implemented a working draft for rolling mode in cython (I'm new to cython).

Before continuing my implementation, I would like some feedback:

  1. What type should mode expect? I assume that float64 would be best: handles NaNs and non-numeric object could be categorized and converted to floats.
  2. pd.DataFrame.mode returns all possible modes. Is that something we would also expect from a rolling mode? It is probably much faster to return only one possible mode (the current draft could easily be extended for that: replace output with a cvector[clist[float]] and in the end convert it to a np.ndarray).
  3. TODO: implement mode for '_variable' version. If I understand it correctly this is needed for pd.DataFrame.expanding().

Additional context

# cython: language_level=3, boundscheck=False, wraparound=False, cdivision=True
# distutils: language = c++

from libcpp.vector cimport vector as cvector
from libcpp.unordered_map cimport unordered_map
from libcpp.list cimport list as clist
from libcpp.pair cimport pair as cpair
from libc.math cimport isnan

cimport cython
from cython.operator cimport dereference, postincrement, postdecrement

import numpy as np
cimport numpy as np

ctypedef np.float_t DTYPE_FLOAT_t


def rolling_mode_fixed(np.ndarray[DTYPE_FLOAT_t, ndim=1] values, const cython.int window_size, const cython.bint dropna):
    # https://cs.stackexchange.com/a/96768
    cdef:
        cython.int N = values.size
        double NaN = float("NaN")
        np.ndarray[DTYPE_FLOAT_t, ndim=1] output = np.empty(N, dtype=np.float)
        cython.int largest_frequency, nan_count, i, frequency = 0
        double value, old_value
        unordered_map[double, cython.int] get_frequency
        cvector[clist[double]] get_objects = cvector[clist[double]](window_size-1)
        cpair[double, cython.int] pair

    with nogil:
        # pre-allocate + 20% free space
        get_frequency.reserve(<int>(1.2 * window_size))

        # new items always have a frequency of 1
        pair.second = 1

        # set NaN and initialize data structures
        for i in range(window_size):
            output[i] = NaN
            value = values[i]
            if isnan(value):
                # either skip NaNs or count them seperate
                nan_count += 1
                continue

            # insert with frequency 1 or increment
            pair.first = value
            insert_pair = get_frequency.insert(pair)
            if not insert_pair.second:
                postincrement(dereference(insert_pair.first).second)
            frequency = dereference(insert_pair.first).second

            if frequency != 1:
                # remove form current set
                get_objects[frequency - 2].remove(value)

            # add to next set
            get_objects[frequency-1].push_back(value)

            # update largest frequency
            largest_frequency = max(frequency, largest_frequency)

        # populate first valid output
        if (dropna and largest_frequency != 0) or (largest_frequency >= nan_count):
            output[window_size - 1] = get_objects[largest_frequency - 1].front()

        # process
        for i in range(window_size, N):
            value = values[i]
            old_value = values[i - window_size]

            value_isnan = isnan(value)
            old_value_isnan = isnan(old_value)
            if (value == old_value) or (value_isnan and old_value_isnan):
                # no change
                output[i] = output[i-1]
                continue

            # decrement old value
            if old_value_isnan:
                nan_count -= 1
            else:
                # find the existing key and decrement it
                find_pair = get_frequency.find(old_value)
                frequency = dereference(find_pair).second
                get_objects[frequency - 1].remove(old_value)
                if frequency > 1:
                    postdecrement(dereference(find_pair).second)
                    get_objects[frequency - 2].push_back(old_value)
                else:
                    get_frequency.erase(find_pair)

            # increment new value
            if value_isnan:
                nan_count += 1
            else:
                # insert with frequency 1 or increment
                pair.first = value
                insert_pair = get_frequency.insert(pair)
                if not insert_pair.second:
                    postincrement(dereference(insert_pair.first).second)
                    get_objects[dereference(insert_pair.first).second - 2].remove(value)
                get_objects[dereference(insert_pair.first).second - 1].push_back(value)

            # update largest set
            if (largest_frequency != window_size) and not get_objects[
                largest_frequency
            ].empty():
                # increased
                largest_frequency += 1
            elif get_objects[largest_frequency-1].empty():
                # decreased
                largest_frequency -= 1

            # write output
            if (dropna and largest_frequency != 0) or (largest_frequency >= nan_count):
                 output[i] = get_objects[largest_frequency - 1].front()
            else:
                output[i] = NaN

    return output

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions