Skip to content

ENH: implement TimedeltaArray/TimedeltaIndex sum, median, std #28165

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 8, 2019
58 changes: 58 additions & 0 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
precision_from_unit,
)
import pandas.compat as compat
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -41,6 +42,7 @@
)
from pandas.core.dtypes.missing import isna

from pandas.core import nanops
from pandas.core.algorithms import checked_add_with_arr
import pandas.core.common as com
from pandas.core.ops.invalid import invalid_comparison
Expand Down Expand Up @@ -384,6 +386,62 @@ def astype(self, dtype, copy=True):
return self
return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy)

def sum(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Annotations would be nice for new developments, especially where reasonably easy to add

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will do

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like mypy starts complaining about index.py if I add types here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's it saying?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

$ mypy pandas/core/arrays/timedeltas.py
pandas/core/indexes/base.py:257: error: Missing return statement

self,
axis=None,
dtype=None,
out=None,
keepdims: bool = False,
initial=None,
skipna: bool = True,
min_count: int = 0,
):
nv.validate_sum(
(), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial)
)
if not len(self):
return NaT
if not skipna and self._hasnans:
return NaT

result = nanops.nansum(
self._data, axis=axis, skipna=skipna, min_count=min_count
)
return Timedelta(result)

def std(
    self,
    axis=None,
    dtype=None,
    out=None,
    ddof: int = 1,
    keepdims: bool = False,
    skipna: bool = True,
):
    """
    Compute the standard deviation of the array's values as a scalar.

    Parameters
    ----------
    axis : optional
        Forwarded to ``nanops.nanstd``.
    dtype, out, keepdims : optional
        Only handed to ``nv.validate_stat_ddof_func``; not otherwise used
        in this method.
    ddof : int, default 1
        Forwarded to ``nanops.nanstd``.
    skipna : bool, default True
        Forwarded to ``nanops.nanstd``. When False and ``self._hasnans``
        is true, ``NaT`` is returned without computing anything.

    Returns
    -------
    Timedelta or NaT
        ``NaT`` for a zero-length array or for the ``skipna=False`` case
        described above; otherwise the ``nanops.nanstd`` result wrapped
        in ``Timedelta``.
    """
    numpy_only_kwargs = {"dtype": dtype, "out": out, "keepdims": keepdims}
    nv.validate_stat_ddof_func((), numpy_only_kwargs, fname="std")

    # The length check comes first so that ``_hasnans`` is only consulted
    # for non-empty arrays.
    if len(self) == 0 or (not skipna and self._hasnans):
        return NaT

    spread = nanops.nanstd(self._data, axis=axis, skipna=skipna, ddof=ddof)
    return Timedelta(spread)

def median(
    self,
    axis=None,
    out=None,
    overwrite_input: bool = False,
    keepdims: bool = False,
    skipna: bool = True,
):
    """
    Compute the median of the array's values.

    Parameters
    ----------
    axis : optional
        Forwarded to ``nanops.nanmedian``.
    out, overwrite_input, keepdims : optional
        Only handed to ``nv.validate_median``; not otherwise used in this
        method.
    skipna : bool, default True
        Forwarded to ``nanops.nanmedian``.

    Returns
    -------
    The value produced by ``nanops.nanmedian`` for ``self._data``,
    returned as-is.
    """
    numpy_only_kwargs = {
        "out": out,
        "overwrite_input": overwrite_input,
        "keepdims": keepdims,
    }
    nv.validate_median((), numpy_only_kwargs)

    # No explicit empty/NaT guard or Timedelta wrapping happens here; the
    # result is whatever nanops.nanmedian hands back.
    middle = nanops.nanmedian(self._data, axis=axis, skipna=skipna)
    return middle

# ----------------------------------------------------------------
# Rendering Methods

Expand Down
4 changes: 4 additions & 0 deletions pandas/core/indexes/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from pandas.core.indexes.datetimelike import (
DatetimeIndexOpsMixin,
DatetimelikeDelegateMixin,
ea_passthrough,
)
from pandas.core.indexes.numeric import Int64Index
from pandas.core.ops import get_op_result_name
Expand Down Expand Up @@ -173,6 +174,9 @@ def _join_i8_wrapper(joinf, **kwargs):
_datetimelike_ops = TimedeltaArray._datetimelike_ops
_datetimelike_methods = TimedeltaArray._datetimelike_methods
_other_ops = TimedeltaArray._other_ops
# Index-level reductions are built from the TimedeltaArray implementations
# through ea_passthrough (imported from pandas.core.indexes.datetimelike;
# the wrapper's exact behaviour is not visible in this hunk).
sum = ea_passthrough(TimedeltaArray.sum)
std = ea_passthrough(TimedeltaArray.std)
median = ea_passthrough(TimedeltaArray.median)

# -------------------------------------------------------------------
# Constructors
Expand Down
98 changes: 93 additions & 5 deletions pandas/tests/arrays/test_timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,18 @@ def test_setitem_objects(self, obj):


class TestReductions:
@pytest.mark.parametrize("name", ["sum", "std", "min", "max", "median"])
@pytest.mark.parametrize("skipna", [True, False])
def test_reductions_empty(self, name, skipna):
    # Each listed reduction, called on a zero-length TimedeltaIndex and on
    # its backing array, must return the NaT singleton for both skipna
    # settings.
    empty_index = pd.TimedeltaIndex([])
    for container in (empty_index, empty_index.array):
        reducer = getattr(container, name)
        assert reducer(skipna=skipna) is pd.NaT

def test_min_max(self):
arr = TimedeltaArray._from_sequence(["3H", "3H", "NaT", "2H", "5H", "4H"])

Expand All @@ -160,11 +172,87 @@ def test_min_max(self):
result = arr.max(skipna=False)
assert result is pd.NaT

@pytest.mark.parametrize("skipna", [True, False])
def test_min_max_empty(self, skipna):
arr = TimedeltaArray._from_sequence([])
result = arr.min(skipna=skipna)
def test_sum(self):
tdi = pd.TimedeltaIndex(["3H", "3H", "NaT", "2H", "5H", "4H"])
arr = tdi.array

result = arr.sum(skipna=True)
expected = pd.Timedelta(hours=17)
assert isinstance(result, pd.Timedelta)
assert result == expected

result = tdi.sum(skipna=True)
assert isinstance(result, pd.Timedelta)
assert result == expected

result = arr.sum(skipna=False)
assert result is pd.NaT

result = tdi.sum(skipna=False)
assert result is pd.NaT

result = arr.sum(min_count=9)
assert result is pd.NaT

result = tdi.sum(min_count=9)
assert result is pd.NaT

result = arr.sum(min_count=1)
assert isinstance(result, pd.Timedelta)
assert result == expected

result = tdi.sum(min_count=1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The min_count was mainly added for the empty case, and for this it does not seem to work:

In [27]: tdi = pd.TimedeltaIndex([])                                                                                                                                                                               

In [28]: tdi.sum()                                                                                                                                                                                                 
Out[28]: NaT

In [29]: tdi.sum(min_count=1)                                                                                                                                                                                      
Out[29]: NaT

In [30]: tdi.sum(min_count=0)                                                                                                                                                                                      
Out[30]: NaT

If we follow the same behaviour as for numerical sum, the result should be 0 instead of NaT

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you're right

assert isinstance(result, pd.Timedelta)
assert result == expected

def test_npsum(self):
    # GH#25335 np.sum should return a Timedelta, not timedelta64
    tdi = pd.TimedeltaIndex(["3H", "3H", "2H", "5H", "4H"])
    expected = pd.Timedelta(hours=17)

    # Same check for the index first, then for its backing array.
    for obj in (tdi, tdi.array):
        total = np.sum(obj)
        assert isinstance(total, pd.Timedelta)
        assert total == expected

def test_std(self):
    tdi = pd.TimedeltaIndex(["0H", "4H", "NaT", "4H", "0H", "2H"])
    arr = tdi.array
    expected = pd.Timedelta(hours=2)

    # With NaT skipped, both the array and the index yield a Timedelta
    # equal to two hours.
    for obj in (arr, tdi):
        spread = obj.std(skipna=True)
        assert isinstance(spread, pd.Timedelta)
        assert spread == expected

    # With skipna=False the NaT entry makes the result NaT.
    for obj in (arr, tdi):
        assert obj.std(skipna=False) is pd.NaT

def test_median(self):
    # Median of a TimedeltaIndex / TimedeltaArray containing one NaT.
    tdi = pd.TimedeltaIndex(["0H", "3H", "NaT", "5H06m", "0H", "2H"])
    arr = tdi.array

    # With NaT skipped, the median of the remaining five values is 2H.
    result = arr.median(skipna=True)
    expected = pd.Timedelta(hours=2)
    assert isinstance(result, pd.Timedelta)
    assert result == expected

    result = tdi.median(skipna=True)
    assert isinstance(result, pd.Timedelta)
    assert result == expected

    # This test exercises median, so the skipna=False check must call
    # median as well (it previously called std, copied from test_std).
    result = arr.median(skipna=False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did you leave these here on purpose (as this tests median)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looks like a typo, will fix

assert result is pd.NaT

# The index-level skipna=False check must exercise median, not std.
result = tdi.median(skipna=False)
assert result is pd.NaT