BUG: rolling.quantile does not return an interpolated result (#16247)

guillemborrell · jreback · commit a43c1576ce3d · 2017-07-10T06:15:08.000-04:00
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
@@ -0,0 +1,185 @@
+from .pandas_vb_common import *
+import pandas as pd
+import numpy as np
+
+
+class DataframeRolling(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.Ns = 10000
+        self.df = pd.DataFrame({'a': np.random.random(self.N)})
+        self.dfs = pd.DataFrame({'a': np.random.random(self.Ns)})
+        self.wins = 10
+        self.winl = 1000
+
+    def time_rolling_quantile_0(self):
+        (self.df.rolling(self.wins).quantile(0.0))
+
+    def time_rolling_quantile_1(self):
+        (self.df.rolling(self.wins).quantile(1.0))
+
+    def time_rolling_quantile_median(self):
+        (self.df.rolling(self.wins).quantile(0.5))
+
+    def time_rolling_median(self):
+        (self.df.rolling(self.wins).median())
+
+    def time_rolling_median(self):
+        (self.df.rolling(self.wins).mean())
+
+    def time_rolling_max(self):
+        (self.df.rolling(self.wins).max())
+
+    def time_rolling_min(self):
+        (self.df.rolling(self.wins).min())
+
+    def time_rolling_std(self):
+        (self.df.rolling(self.wins).std())
+
+    def time_rolling_count(self):
+        (self.df.rolling(self.wins).count())
+
+    def time_rolling_skew(self):
+        (self.df.rolling(self.wins).skew())
+
+    def time_rolling_kurt(self):
+        (self.df.rolling(self.wins).kurt())
+
+    def time_rolling_sum(self):
+        (self.df.rolling(self.wins).sum())
+
+    def time_rolling_corr(self):
+        (self.dfs.rolling(self.wins).corr())
+
+    def time_rolling_cov(self):
+        (self.dfs.rolling(self.wins).cov())
+        
+    def time_rolling_quantile_0_l(self):
+        (self.df.rolling(self.winl).quantile(0.0))
+
+    def time_rolling_quantile_1_l(self):
+        (self.df.rolling(self.winl).quantile(1.0))
+
+    def time_rolling_quantile_median_l(self):
+        (self.df.rolling(self.winl).quantile(0.5))
+
+    def time_rolling_median_l(self):
+        (self.df.rolling(self.winl).median())
+
+    def time_rolling_median_l(self):
+        (self.df.rolling(self.winl).mean())
+
+    def time_rolling_max_l(self):
+        (self.df.rolling(self.winl).max())
+
+    def time_rolling_min_l(self):
+        (self.df.rolling(self.winl).min())
+
+    def time_rolling_std_l(self):
+        (self.df.rolling(self.wins).std())
+
+    def time_rolling_count_l(self):
+        (self.df.rolling(self.wins).count())
+
+    def time_rolling_skew_l(self):
+        (self.df.rolling(self.wins).skew())
+
+    def time_rolling_kurt_l(self):
+        (self.df.rolling(self.wins).kurt())
+
+    def time_rolling_sum_l(self):
+        (self.df.rolling(self.wins).sum())
+
+
+class SeriesRolling(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.N = 100000
+        self.Ns = 10000
+        self.df = pd.DataFrame({'a': np.random.random(self.N)})
+        self.dfs = pd.DataFrame({'a': np.random.random(self.Ns)})
+        self.sr = self.df.a
+        self.srs = self.dfs.a
+        self.wins = 10
+        self.winl = 1000
+
+    def time_rolling_quantile_0(self):
+        (self.sr.rolling(self.wins).quantile(0.0))
+
+    def time_rolling_quantile_1(self):
+        (self.sr.rolling(self.wins).quantile(1.0))
+
+    def time_rolling_quantile_median(self):
+        (self.sr.rolling(self.wins).quantile(0.5))
+
+    def time_rolling_median(self):
+        (self.sr.rolling(self.wins).median())
+
+    def time_rolling_median(self):
+        (self.sr.rolling(self.wins).mean())
+
+    def time_rolling_max(self):
+        (self.sr.rolling(self.wins).max())
+
+    def time_rolling_min(self):
+        (self.sr.rolling(self.wins).min())
+
+    def time_rolling_std(self):
+        (self.sr.rolling(self.wins).std())
+
+    def time_rolling_count(self):
+        (self.sr.rolling(self.wins).count())
+
+    def time_rolling_skew(self):
+        (self.sr.rolling(self.wins).skew())
+
+    def time_rolling_kurt(self):
+        (self.sr.rolling(self.wins).kurt())
+
+    def time_rolling_sum(self):
+        (self.sr.rolling(self.wins).sum())
+
+    def time_rolling_corr(self):
+        (self.srs.rolling(self.wins).corr())
+
+    def time_rolling_cov(self):
+        (self.srs.rolling(self.wins).cov())
+        
+    def time_rolling_quantile_0_l(self):
+        (self.sr.rolling(self.winl).quantile(0.0))
+
+    def time_rolling_quantile_1_l(self):
+        (self.sr.rolling(self.winl).quantile(1.0))
+
+    def time_rolling_quantile_median_l(self):
+        (self.sr.rolling(self.winl).quantile(0.5))
+
+    def time_rolling_median_l(self):
+        (self.sr.rolling(self.winl).median())
+
+    def time_rolling_median_l(self):
+        (self.sr.rolling(self.winl).mean())
+
+    def time_rolling_max_l(self):
+        (self.sr.rolling(self.winl).max())
+
+    def time_rolling_min_l(self):
+        (self.sr.rolling(self.winl).min())
+
+    def time_rolling_std_l(self):
+        (self.sr.rolling(self.wins).std())
+
+    def time_rolling_count_l(self):
+        (self.sr.rolling(self.wins).count())
+
+    def time_rolling_skew_l(self):
+        (self.sr.rolling(self.wins).skew())
+
+    def time_rolling_kurt_l(self):
+        (self.sr.rolling(self.wins).kurt())
+
+    def time_rolling_sum_l(self):
+        (self.sr.rolling(self.wins).sum())
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -168,9 +168,11 @@ Plotting
 
 Groupby/Resample/Rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
-- Bug in ``DataFrame.resample().size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`)
 
+- Bug in ``DataFrame.resample().size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`)
 - Bug in ``infer_freq`` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`)
+- Bug in ``.rolling.quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`)
+
 
 Sparse
 ^^^^^^
@@ -191,6 +193,7 @@ Categorical
 ^^^^^^^^^^^
 
 
+
 Other
 ^^^^^
 - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`)
diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx
@@ -1348,8 +1348,9 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win,
         bint is_variable
         ndarray[int64_t] start, end
         ndarray[double_t] output
+        double vlow, vhigh
 
-    if quantile < 0.0 or quantile > 1.0:
+    if quantile <= 0.0 or quantile >= 1.0:
         raise ValueError("quantile value {0} not in [0, 1]".format(quantile))
 
     # we use the Fixed/Variable Indexer here as the
@@ -1391,7 +1392,17 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win,
 
         if nobs >= minp:
             idx = int(quantile * <double>(nobs - 1))
-            output[i] = skiplist.get(idx)
+
+            # Single value in skip list
+            if nobs == 1:
+                output[i] = skiplist.get(0)
+
+            # Interpolated quantile
+            else:
+                vlow = skiplist.get(idx)
+                vhigh = skiplist.get(idx + 1)
+                output[i] = (vlow + (vhigh - vlow) *
+                                 (quantile * (nobs - 1) - idx))
         else:
             output[i] = NaN
 
diff --git a/pandas/core/window.py b/pandas/core/window.py
@@ -975,8 +975,15 @@ def quantile(self, quantile, **kwargs):
 
         def f(arg, *args, **kwargs):
             minp = _use_window(self.min_periods, window)
-            return _window.roll_quantile(arg, window, minp, indexi,
-                                         self.closed, quantile)
+            if quantile == 1.0:
+                return _window.roll_max(arg, window, minp, indexi,
+                                        self.closed)
+            elif quantile == 0.0:
+                return _window.roll_min(arg, window, minp, indexi,
+                                        self.closed)
+            else:
+                return _window.roll_quantile(arg, window, minp, indexi,
+                                             self.closed, quantile)
 
         return self._apply(f, 'quantile', quantile=quantile,
                            **kwargs)
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
@@ -1122,8 +1122,19 @@ def test_rolling_quantile(self):
         def scoreatpercentile(a, per):
             values = np.sort(a, axis=0)
 
-            idx = per / 1. * (values.shape[0] - 1)
-            return values[int(idx)]
+            idx = int(per / 1. * (values.shape[0] - 1))
+
+            if idx == values.shape[0] - 1:
+                retval = values[-1]
+
+            else:
+                qlow = float(idx) / float(values.shape[0] - 1)
+                qhig = float(idx + 1) / float(values.shape[0] - 1)
+                vlow = values[idx]
+                vhig = values[idx + 1]
+                retval = vlow + (vhig - vlow) * (per - qlow) / (qhig - qlow)
+
+            return retval
 
         for q in qs:
 
@@ -1138,6 +1149,30 @@ def alt(x):
 
             self._check_moment_func(f, alt, name='quantile', quantile=q)
 
+    def test_rolling_quantile_np_percentile(self):
+        # #9413: Tests that rolling window's quantile default behavior
+        # is analogus to Numpy's percentile
+        row = 10
+        col = 5
+        idx = pd.date_range(20100101, periods=row, freq='B')
+        df = pd.DataFrame(np.random.rand(row * col).reshape((row, -1)),
+                          index=idx)
+
+        df_quantile = df.quantile([0.25, 0.5, 0.75], axis=0)
+        np_percentile = np.percentile(df, [25, 50, 75], axis=0)
+
+        tm.assert_almost_equal(df_quantile.values, np.array(np_percentile))
+
+    def test_rolling_quantile_series(self):
+        # #16211: Tests that rolling window's quantile default behavior
+        # is analogus to pd.Series' quantile
+        arr = np.arange(100)
+        s = pd.Series(arr)
+        q1 = s.quantile(0.1)
+        q2 = s.rolling(100).quantile(0.1).iloc[-1]
+
+        tm.assert_almost_equal(q1, q2)
+
     def test_rolling_quantile_param(self):
         ser = Series([0.0, .1, .5, .9, 1.0])
 
@@ -3558,7 +3593,7 @@ def test_ragged_quantile(self):
 
         result = df.rolling(window='2s', min_periods=1).quantile(0.5)
         expected = df.copy()
-        expected['B'] = [0.0, 1, 1.0, 3.0, 3.0]
+        expected['B'] = [0.0, 1, 1.5, 3.0, 3.5]
         tm.assert_frame_equal(result, expected)
 
     def test_ragged_std(self):