pandas-dev · jreback · Sep 14, 2021 · Aug 31, 2021 · Aug 31, 2021 · Aug 31, 2021
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
@@ -180,6 +180,33 @@ def time_quantile(self, constructor, window, dtype, percentile, interpolation):
         self.roll.quantile(percentile, interpolation=interpolation)
 
 
+class Rank:
+    params = (
+        ["DataFrame", "Series"],
+        [10, 1000],
+        ["int", "float"],
+        [True, False],
+        [True, False],
+        ["min", "max", "average"],
+    )
+    param_names = [
+        "constructor",
+        "window",
+        "dtype",
+        "percentile",
+        "ascending",
+        "method",
+    ]
+
+    def setup(self, constructor, window, dtype, percentile, ascending, method):
+        N = 10 ** 5
+        arr = np.random.random(N).astype(dtype)
+        self.roll = getattr(pd, constructor)(arr).rolling(window)
+
+    def time_rank(self, constructor, window, dtype, percentile, ascending, method):
+        self.roll.rank(pct=percentile, ascending=ascending, method=method)
+
+
 class PeakMemFixedWindowMinMax:
 
     params = ["min", "max"]

diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst
@@ -35,6 +35,7 @@ Rolling window functions
    Rolling.aggregate
    Rolling.quantile
    Rolling.sem
+   Rolling.rank
 
 .. _api.functions_window:
 
@@ -75,6 +76,7 @@ Expanding window functions
    Expanding.aggregate
    Expanding.quantile
    Expanding.sem
+   Expanding.rank
 
 .. _api.functions_ewm:
 

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -87,6 +87,53 @@ Multithreaded CSV reading with a new CSV Engine based on pyarrow
 :func:`pandas.read_csv` now accepts ``engine="pyarrow"`` (requires at least ``pyarrow`` 0.17.0) as an argument, allowing for faster csv parsing on multicore machines
 with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`)
 
+.. _whatsnew_140.enhancements.window_rank:
+
+Rank function for rolling and expanding windows
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Added ``rank`` function to :class:`Rolling` and :class:`Expanding`. The new function supports the ``method``, ``ascending``, and ``pct`` flags of :meth:`DataFrame.rank`. The ``method`` argument supports ``min``, ``max``, and ``average`` ranking methods.
+Example:
+
+.. code-block:: python
+
+    >>> s = pd.Series([1, 4, 2, 3, 5, 3])
+    >>> s.rolling(3).rank()
+    0    NaN
+    1    NaN
+    2    2.0
+    3    2.0
+    4    3.0
+    5    1.5
+    dtype: float64
+
+    >>> s.rolling(3).rank(method="max")
+    0    NaN
+    1    NaN
+    2    2.0
+    3    2.0
+    4    3.0
+    5    2.0
+    dtype: float64
+
+    >>> s.expanding().rank()
+    0    1.0
+    1    2.0
+    2    2.0
+    3    3.0
+    4    5.0
+    5    3.5
+    dtype: float64
+
+    >>> s.expanding().rank(method="max")
+    0    1.0
+    1    2.0
+    2    2.0
+    3    3.0
+    4    5.0
+    5    4.0
+    dtype: float64
+
 .. _whatsnew_140.enhancements.other:
 
 Other enhancements

diff --git a/pandas/_libs/src/skiplist.h b/pandas/_libs/src/skiplist.h
@@ -180,10 +180,28 @@ PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) {
     return node->value;
 }
 
+PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) {
+    // Sums the widths crossed while _node_cmp(next, value) > 0, then adds 1.
+    node_t *cur = skp->head;
+    int below = 0;  // total width of the links stepped over so far
+    int lvl = skp->maxlevels;
+
+    while (lvl-- > 0) {  // visit levels maxlevels - 1 down to 0
+        for (; _node_cmp(cur->next[lvl], value) > 0; cur = cur->next[lvl]) {
+            below += cur->width[lvl];
+        }
+    }
+
+    return below + 1;
+}
+
+// Returns the rank of the inserted element. When there are duplicates,
+// `rank` is the highest of the group, i.e. the 'max' method of
+// https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rank.html
 PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) {
     node_t *node, *prevnode, *newnode, *next_at_level;
     int *steps_at_level;
-    int size, steps, level;
+    int size, steps, level, rank = 0;
     node_t **chain;
 
     chain = skp->tmp_chain;
@@ -197,6 +215,7 @@ PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) {
         next_at_level = node->next[level];
         while (_node_cmp(next_at_level, value) >= 0) {
             steps_at_level[level] += node->width[level];
+            rank += node->width[level];
             node = next_at_level;
             next_at_level = node->next[level];
         }
@@ -230,7 +249,7 @@ PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) {
 
     ++(skp->size);
 
-    return 1;
+    return rank + 1;
 }
 
 PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) {

diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi
@@ -63,6 +63,15 @@ def roll_quantile(
     quantile: float,  # float64_t
     interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"],
 ) -> np.ndarray: ...  # np.ndarray[float]
+def roll_rank(
+    values: np.ndarray,  # const float64_t[:]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+    percentile: bool,  # bint
+    method: Literal["average", "min", "max"],
+    ascending: bool,  # bint
+) -> np.ndarray: ...  # np.ndarray[float]
 def roll_apply(
     obj: object,
     start: np.ndarray,  # np.ndarray[np.int64]

diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -50,6 +50,8 @@ cdef extern from "../src/skiplist.h":
     double skiplist_get(skiplist_t*, int, int*) nogil
     int skiplist_insert(skiplist_t*, double) nogil
     int skiplist_remove(skiplist_t*, double) nogil
+    int skiplist_rank(skiplist_t*, double) nogil
+    int skiplist_min_rank(skiplist_t*, double) nogil
 
 cdef:
     float32_t MINfloat32 = np.NINF
@@ -795,7 +797,7 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
                     val = values[j]
                     if notnan(val):
                         nobs += 1
-                        err = skiplist_insert(sl, val) != 1
+                        err = skiplist_insert(sl, val) == -1
                         if err:
                             break
 
@@ -806,7 +808,7 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
                     val = values[j]
                     if notnan(val):
                         nobs += 1
-                        err = skiplist_insert(sl, val) != 1
+                        err = skiplist_insert(sl, val) == -1
                         if err:
                             break
 
@@ -1139,6 +1141,120 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start,
     return output
 
 
+cdef enum RankType:
+    AVERAGE,  # ties share the mean of the ranks they span
+    MIN,  # ties all take the lowest rank of the group
+    MAX,  # ties all take the highest rank of the group
+
+
+rank_types = {  # maps the user-facing ``method`` string to its RankType member
+    'average': AVERAGE,
+    'min': MIN,
+    'max': MAX,
+}
+
+
+def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
+              ndarray[int64_t] end, int64_t minp, bint percentile,
+              str method, bint ascending) -> np.ndarray:
+    """
+    Rank the latest value added to each window; O(N log(window)) via skip list.
+
+    Derived from roll_quantile; raises ValueError for an unsupported ``method``.
+    """
+    cdef:
+        Py_ssize_t i, j, s, e, N = len(values)
+        float64_t rank_min = 0, rank = 0
+        int64_t nobs = 0, win
+        float64_t val
+        skiplist_t *skiplist
+        float64_t[::1] output
+        RankType rank_type
+
+    try:
+        rank_type = rank_types[method]
+    except KeyError:
+        raise ValueError(f"Method '{method}' is not supported")
+
+    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
+        start, end
+    )
+    # we use the Fixed/Variable Indexer here as the
+    # actual skiplist ops outweigh any window computation costs
+    output = np.empty(N, dtype=np.float64)
+
+    win = (end - start).max()
+    if win == 0:
+        output[:] = NaN
+        return np.asarray(output)
+    skiplist = skiplist_init(<int>win)
+    if skiplist == NULL:
+        raise MemoryError("skiplist_init failed")
+
+    with nogil:
+        for i in range(N):
+            s = start[i]
+            e = end[i]
+
+            if i == 0 or not is_monotonic_increasing_bounds:
+                if not is_monotonic_increasing_bounds:
+                    nobs = 0
+                    skiplist_destroy(skiplist)
+                    skiplist = skiplist_init(<int>win)
+
+                # setup
+                for j in range(s, e):
+                    val = values[j] if ascending else -values[j]
+                    if notnan(val):
+                        nobs += 1
+                        rank = skiplist_insert(skiplist, val)  # 'max' rank of val
+                        if rank == -1:
+                            raise MemoryError("skiplist_insert failed")
+                        if rank_type == AVERAGE:  # mean of rank_min..rank
+                            rank_min = skiplist_min_rank(skiplist, val)
+                            rank = (((rank * (rank + 1) / 2)
+                                    - ((rank_min - 1) * rank_min / 2))
+                                    / (rank - rank_min + 1))
+                        elif rank_type == MIN:
+                            rank = skiplist_min_rank(skiplist, val)
+                    else:
+                        rank = NaN
+
+            else:
+                # calculate deletes
+                for j in range(start[i - 1], s):
+                    val = values[j] if ascending else -values[j]
+                    if notnan(val):
+                        skiplist_remove(skiplist, val)
+                        nobs -= 1
+
+                # calculate adds
+                for j in range(end[i - 1], e):
+                    val = values[j] if ascending else -values[j]
+                    if notnan(val):
+                        nobs += 1
+                        rank = skiplist_insert(skiplist, val)  # 'max' rank of val
+                        if rank == -1:
+                            raise MemoryError("skiplist_insert failed")
+                        if rank_type == AVERAGE:  # mean of rank_min..rank
+                            rank_min = skiplist_min_rank(skiplist, val)
+                            rank = (((rank * (rank + 1) / 2)
+                                    - ((rank_min - 1) * rank_min / 2))
+                                    / (rank - rank_min + 1))
+                        elif rank_type == MIN:
+                            rank = skiplist_min_rank(skiplist, val)
+                    else:
+                        rank = NaN
+            if nobs >= minp:
+                output[i] = <float64_t>(rank) / nobs if percentile else rank
+            else:
+                output[i] = NaN
+
+    skiplist_destroy(skiplist)
+
+    return np.asarray(output)
+
+
 def roll_apply(object obj,
                ndarray[int64_t] start, ndarray[int64_t] end,
                int64_t minp,

diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py
@@ -564,6 +564,81 @@ def quantile(
             **kwargs,
         )
 
+    @doc(
+        template_header,
+        ".. versionadded:: 1.4.0 \n\n",
+        create_section_header("Parameters"),
+        dedent(
+            """
+        method : {{'average', 'min', 'max'}}, default 'average'
+            How to rank the group of records that have the same value (i.e. ties):
+
+            * average: average rank of the group
+            * min: lowest rank in the group
+            * max: highest rank in the group
+
+        ascending : bool, default True
+            Whether or not the elements should be ranked in ascending order.
+        pct : bool, default False
+            Whether or not to display the returned rankings in percentile
+            form.
+        """
+        ).replace("\n", "", 1),  # drop the newline right after the opening quotes
+        kwargs_compat,
+        create_section_header("Returns"),
+        template_returns,
+        create_section_header("See Also"),
+        template_see_also,
+        create_section_header("Examples"),
+        dedent(
+            """
+        >>> s = pd.Series([1, 4, 2, 3, 5, 3])
+        >>> s.expanding().rank()
+        0    1.0
+        1    2.0
+        2    2.0
+        3    3.0
+        4    5.0
+        5    3.5
+        dtype: float64
+
+        >>> s.expanding().rank(method="max")
+        0    1.0
+        1    2.0
+        2    2.0
+        3    3.0
+        4    5.0
+        5    4.0
+        dtype: float64
+
+        >>> s.expanding().rank(method="min")
+        0    1.0
+        1    2.0
+        2    2.0
+        3    3.0
+        4    5.0
+        5    3.0
+        dtype: float64
+        """
+        ).replace("\n", "", 1),  # drop the newline right after the opening quotes
+        window_method="expanding",
+        aggregation_description="rank",
+        agg_method="rank",
+    )
+    def rank(
+        self,
+        method: str = "average",
+        ascending: bool = True,
+        pct: bool = False,
+        **kwargs,
+    ):
+        # No docstring here on purpose: the text comes from the @doc pieces above.
+        return super().rank(  # forward every argument unchanged to the parent class
+            method=method,
+            ascending=ascending,
+            pct=pct,
+            **kwargs,
+        )
+
     @doc(
         template_header,
         create_section_header("Parameters"),