Skip to content

BUG: ewma() weights incorrect when some values are missing #7603

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@ API changes

rolling_min(s, window=10, min_periods=5)

- :func:`ewma`, :func:`ewmstd`, :func:`ewmvar`, :func:`ewmcorr`, and :func:`ewmcov`
now have an optional ``ignore_na`` argument.
When ``ignore_na = False`` (the default), missing values are taken into account in the weights calculation.
When ``ignore_na = True`` (which reproduces the pre-0.15.0 behavior), missing values are ignored in the weights calculation.
(:issue:`7603`)

.. ipython:: python

ewma(Series([None, 1., 100.]), com=2.5)
ewma(Series([1., None, 100.]), com=2.5, ignore_na=True) # pre-0.15.0 behavior
ewma(Series([1., None, 100.]), com=2.5, ignore_na=False) # default

- Bug in passing a ``DatetimeIndex`` with a timezone that was not being retained in DataFrame construction from a dict (:issue:`7822`)

In prior versions this would drop the timezone.
Expand Down
44 changes: 18 additions & 26 deletions pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -979,14 +979,16 @@ def roll_mean(ndarray[double_t] input,
#-------------------------------------------------------------------------------
# Exponentially weighted moving average

def ewma(ndarray[double_t] input, double_t com, int adjust):
def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na):
'''
Compute exponentially-weighted moving average using center-of-mass.

Parameters
----------
input : ndarray (float64 type)
com : float64
adjust: int
ignore_na: int

Returns
-------
Expand All @@ -1002,37 +1004,27 @@ def ewma(ndarray[double_t] input, double_t com, int adjust):
if N == 0:
return output

neww = 1. / (1. + com)
oldw = 1. - neww
adj = oldw
alpha = 1. / (1. + com)
old_wt_factor = 1. - alpha
new_wt = 1.0 if adjust else alpha

if adjust:
output[0] = neww * input[0]
else:
output[0] = input[0]
output[0] = input[0]
weighted_avg = output[0]
old_wt = 1.

for i from 1 <= i < N:
cur = input[i]
prev = output[i - 1]

if cur == cur:
if prev == prev:
output[i] = oldw * prev + neww * cur
else:
output[i] = neww * cur
if weighted_avg == weighted_avg:
if cur == cur:
old_wt *= old_wt_factor
weighted_avg = ((old_wt * weighted_avg) + (new_wt * cur)) / (old_wt + new_wt)
old_wt += new_wt
elif not ignore_na:
old_wt *= old_wt_factor
else:
output[i] = prev

if adjust:
for i from 0 <= i < N:
cur = input[i]
weighted_avg = cur

if cur == cur:
output[i] = output[i] / (1. - adj)
adj *= oldw
else:
if i >= 1:
output[i] = output[i - 1]
output[i] = weighted_avg

return output

Expand Down
30 changes: 18 additions & 12 deletions pandas/stats/moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@
imbalance in relative weightings (viewing EWMA as a moving average)
how : string, default 'mean'
Method for down- or re-sampling
ignore_na : boolean, default False
Ignore missing values when calculating weights;
specify True to reproduce pre-0.15.0 behavior
"""

_ewm_notes = r"""
Expand Down Expand Up @@ -420,12 +423,12 @@ def _get_center_of_mass(com, span, halflife):
_type_of_input_retval, _ewm_notes)
@Appender(_doc_template)
def ewma(arg, com=None, span=None, halflife=None, min_periods=0, freq=None,
adjust=True, how=None):
adjust=True, how=None, ignore_na=False):
com = _get_center_of_mass(com, span, halflife)
arg = _conv_timerule(arg, freq, how)

def _ewma(v):
result = algos.ewma(v, com, int(adjust))
result = algos.ewma(v, com, int(adjust), int(ignore_na))
first_index = _first_valid_index(v)
result[first_index: first_index + min_periods] = NaN
return result
Expand All @@ -444,11 +447,11 @@ def _first_valid_index(arr):
_ewm_kw+_bias_kw, _type_of_input_retval, _ewm_notes)
@Appender(_doc_template)
def ewmvar(arg, com=None, span=None, halflife=None, min_periods=0, bias=False,
freq=None, how=None):
freq=None, how=None, ignore_na=False):
com = _get_center_of_mass(com, span, halflife)
arg = _conv_timerule(arg, freq, how)
moment2nd = ewma(arg * arg, com=com, min_periods=min_periods)
moment1st = ewma(arg, com=com, min_periods=min_periods)
moment2nd = ewma(arg * arg, com=com, min_periods=min_periods, ignore_na=ignore_na)
moment1st = ewma(arg, com=com, min_periods=min_periods, ignore_na=ignore_na)

result = moment2nd - moment1st ** 2
if not bias:
Expand All @@ -460,9 +463,10 @@ def ewmvar(arg, com=None, span=None, halflife=None, min_periods=0, bias=False,
@Substitution("Exponentially-weighted moving std", _unary_arg,
_ewm_kw+_bias_kw, _type_of_input_retval, _ewm_notes)
@Appender(_doc_template)
def ewmstd(arg, com=None, span=None, halflife=None, min_periods=0, bias=False):
def ewmstd(arg, com=None, span=None, halflife=None, min_periods=0, bias=False,
ignore_na=False):
result = ewmvar(arg, com=com, span=span, halflife=halflife,
min_periods=min_periods, bias=bias)
min_periods=min_periods, bias=bias, ignore_na=ignore_na)
return _zsqrt(result)

ewmvol = ewmstd
Expand All @@ -472,7 +476,7 @@ def ewmstd(arg, com=None, span=None, halflife=None, min_periods=0, bias=False):
_ewm_kw+_pairwise_kw, _type_of_input_retval, _ewm_notes)
@Appender(_doc_template)
def ewmcov(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0,
bias=False, freq=None, pairwise=None, how=None):
bias=False, freq=None, pairwise=None, how=None, ignore_na=False):
if arg2 is None:
arg2 = arg1
pairwise = True if pairwise is None else pairwise
Expand All @@ -484,7 +488,8 @@ def ewmcov(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0,
arg2 = _conv_timerule(arg2, freq, how)

def _get_ewmcov(X, Y):
mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods)
mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods,
ignore_na=ignore_na)
return (mean(X * Y) - mean(X) * mean(Y))
result = _flex_binary_moment(arg1, arg2, _get_ewmcov,
pairwise=bool(pairwise))
Expand All @@ -499,7 +504,7 @@ def _get_ewmcov(X, Y):
_ewm_kw+_pairwise_kw, _type_of_input_retval, _ewm_notes)
@Appender(_doc_template)
def ewmcorr(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0,
freq=None, pairwise=None, how=None):
freq=None, pairwise=None, how=None, ignore_na=False):
if arg2 is None:
arg2 = arg1
pairwise = True if pairwise is None else pairwise
Expand All @@ -511,9 +516,10 @@ def ewmcorr(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0,
arg2 = _conv_timerule(arg2, freq, how)

def _get_ewmcorr(X, Y):
mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods)
mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods,
ignore_na=ignore_na)
var = lambda x: ewmvar(x, com=com, span=span, halflife=halflife, min_periods=min_periods,
bias=True)
bias=True, ignore_na=ignore_na)
return (mean(X * Y) - mean(X) * mean(Y)) / _zsqrt(var(X) * var(Y))
result = _flex_binary_moment(arg1, arg2, _get_ewmcorr,
pairwise=bool(pairwise))
Expand Down
55 changes: 54 additions & 1 deletion pandas/stats/tests/test_moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,11 +520,64 @@ def test_ewma(self):
result = mom.ewma(arr, span=100, adjust=False).sum()
self.assertTrue(np.abs(result - 1) < 1e-2)

s = Series([1.0, 2.0, 4.0, 8.0])

expected = Series([1.0, 1.6, 2.736842, 4.923077])
for f in [lambda s: mom.ewma(s, com=2.0, adjust=True),
lambda s: mom.ewma(s, com=2.0, adjust=True, ignore_na=False),
lambda s: mom.ewma(s, com=2.0, adjust=True, ignore_na=True),
]:
result = f(s)
assert_series_equal(result, expected)

expected = Series([1.0, 1.333333, 2.222222, 4.148148])
for f in [lambda s: mom.ewma(s, com=2.0, adjust=False),
lambda s: mom.ewma(s, com=2.0, adjust=False, ignore_na=False),
lambda s: mom.ewma(s, com=2.0, adjust=False, ignore_na=True),
]:
result = f(s)
assert_series_equal(result, expected)

def test_ewma_nan_handling(self):
s = Series([1.] + [np.nan] * 5 + [1.])
result = mom.ewma(s, com=5)
assert_almost_equal(result, [1.] * len(s))

s = Series([np.nan] * 2 + [1.] + [np.nan] * 2 + [1.])
result = mom.ewma(s, com=5)
assert_almost_equal(result, [1] * len(s))
assert_almost_equal(result, [np.nan] * 2 + [1.] * 4)

# GH 7603
s0 = Series([np.nan, 1., 101.])
s1 = Series([1., np.nan, 101.])
s2 = Series([np.nan, 1., np.nan, np.nan, 101., np.nan])
com = 2.
alpha = 1. / (1. + com)

def simple_wma(s, w):
return (s.multiply(w).cumsum() / w.cumsum()).fillna(method='ffill')

for (s, adjust, ignore_na, w) in [
(s0, True, False, [np.nan, (1.0 - alpha), 1.]),
(s0, True, True, [np.nan, (1.0 - alpha), 1.]),
(s0, False, False, [np.nan, (1.0 - alpha), alpha]),
(s0, False, True, [np.nan, (1.0 - alpha), alpha]),
(s1, True, False, [(1.0 - alpha)**2, np.nan, 1.]),
(s1, True, True, [(1.0 - alpha), np.nan, 1.]),
(s1, False, False, [(1.0 - alpha)**2, np.nan, alpha]),
(s1, False, True, [(1.0 - alpha), np.nan, alpha]),
(s2, True, False, [np.nan, (1.0 - alpha)**3, np.nan, np.nan, 1., np.nan]),
(s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1., np.nan]),
(s2, False, False, [np.nan, (1.0 - alpha)**3, np.nan, np.nan, alpha, np.nan]),
(s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]),
]:
expected = simple_wma(s, Series(w))
result = mom.ewma(s, com=com, adjust=adjust, ignore_na=ignore_na)
assert_series_equal(result, expected)
if ignore_na is False:
# check that ignore_na defaults to False
result = mom.ewma(s, com=com, adjust=adjust)
assert_series_equal(result, expected)

def test_ewmvar(self):
self._check_ew(mom.ewmvar)
Expand Down