Description
Groupby (`observed=False`) with a categorical MultiIndex and integer data values returns zero for categories that do not appear in the data, as seen in the first example (there are no wild parrots).
import pandas as pd
animals = ['Falcon', 'Parrot']
types = ['Captive', 'Wild']
df = pd.DataFrame({
'animal': pd.Categorical(['Falcon', 'Falcon', 'Parrot', 'Parrot'],
categories=animals),
'type': pd.Categorical(['Captive', 'Wild', 'Captive', 'Captive'],
categories=types),
'time': [1, 2, 3, 4]
})
df.set_index(['animal', 'type'], inplace=True)
df.groupby(level=['animal', 'type'], observed=False).sum()
index | time |
---|---|
('Falcon', 'Captive') | 1 |
('Falcon', 'Wild') | 2 |
('Parrot', 'Captive') | 7 |
('Parrot', 'Wild') | 0 |
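But when using `Timedelta` data values, an `int` (`0`) is used as the fill value for the unobserved categories instead of a `Timedelta`, so the same call raises a `TypeError`, as seen in the second example.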
import pandas as pd
animals = ['Falcon', 'Parrot']
types = ['Captive', 'Wild']
df = pd.DataFrame({
'animal': pd.Categorical(['Falcon', 'Falcon', 'Parrot', 'Parrot'],
categories=animals),
'type': pd.Categorical(['Captive', 'Wild', 'Captive', 'Captive'],
categories=types),
'time': [1, 2, 3, 4]
})
# Convert time to time delta.
df['time'] = pd.to_timedelta(df['time'])
df.set_index(['animal', 'type'], inplace=True)
df.groupby(level=['animal', 'type'], observed=False).sum()
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_106514/2243147148.py in <module>
14 df['time'] = pd.to_timedelta(df['time'])
15 df.set_index(['animal', 'type'], inplace=True)
---> 16 df.groupby(level=['animal', 'type'], observed=False).sum()
~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/groupby/groupby.py in sum(self, numeric_only, min_count)
1851 )
1852
-> 1853 return self._reindex_output(result, fill_value=0)
1854
1855 @final
~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/groupby/groupby.py in _reindex_output(self, output, fill_value)
3169 "fill_value": fill_value,
3170 }
-> 3171 return output.reindex(**d)
3172
3173 # GH 13204
~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
322 @wraps(func)
323 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 324 return func(*args, **kwargs)
325
326 kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
4770 kwargs.pop("axis", None)
4771 kwargs.pop("labels", None)
-> 4772 return super().reindex(**kwargs)
4773
4774 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
4816
4817 # perform the reindex on the axes
-> 4818 return self._reindex_axes(
4819 axes, level, limit, tolerance, method, fill_value, copy
4820 ).__finalize__(self, method="reindex")
~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4595 index = axes["index"]
4596 if index is not None:
-> 4597 frame = frame._reindex_index(
4598 index, method, copy, level, fill_value, limit, tolerance
4599 )
~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
4614 new_index, method=method, level=level, limit=limit, tolerance=tolerance
4615 )
-> 4616 return self._reindex_with_indexers(
4617 {0: [new_index, indexer]},
4618 copy=copy,
~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
4881
4882 # TODO: speed up on homogeneous DataFrame objects
-> 4883 new_data = new_data.reindex_indexer(
4884 index,
4885 indexer,
~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/internals/managers.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy, consolidate, only_slice)
678 )
679 else:
--> 680 new_blocks = [
681 blk.take_nd(
682 indexer,
~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/internals/managers.py in <listcomp>(.0)
679 else:
680 new_blocks = [
--> 681 blk.take_nd(
682 indexer,
683 axis=1,
~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/internals/blocks.py in take_nd(self, indexer, axis, new_mgr_locs, fill_value)
1143 allow_fill = True
1144
-> 1145 new_values = algos.take_nd(
1146 values, indexer, axis=axis, allow_fill=allow_fill, fill_value=fill_value
1147 )
~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/array_algos/take.py in take_nd(arr, indexer, axis, fill_value, allow_fill)
99 # i.e. DatetimeArray, TimedeltaArray
100 arr = cast("NDArrayBackedExtensionArray", arr)
--> 101 return arr.take(
102 indexer, fill_value=fill_value, allow_fill=allow_fill, axis=axis
103 )
~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/arrays/_mixins.py in take(self, indices, allow_fill, fill_value, axis)
95 ) -> NDArrayBackedExtensionArrayT:
96 if allow_fill:
---> 97 fill_value = self._validate_scalar(fill_value)
98
99 new_data = take(
~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/arrays/datetimelike.py in _validate_scalar(self, value, allow_listlike, setitem, unbox)
643 else:
644 msg = self._validation_error_message(value, allow_listlike)
--> 645 raise TypeError(msg)
646
647 if not unbox:
TypeError: value should be a 'Timedelta' or 'NaT'. Got 'int' instead.
pd.show_versions()
INSTALLED VERSIONS
commit : 73c6825
python : 3.9.2.final.0
python-bits : 64
OS : Linux
OS-release : 5.10.0-8-amd64
Version : #1 SMP Debian 5.10.46-5 (2021-09-23)
machine : x86_64
processor :
byteorder : little
LC_ALL : None
LANG : en_GB.UTF-8
LOCALE : en_GB.UTF-8
pandas : 1.3.3
numpy : 1.19.5
pytz : 2021.3
dateutil : 2.8.2
pip : 21.2.4
setuptools : 58.1.0
Cython : 0.29.24
pytest : 6.2.5
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : 4.6.3
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 3.0.2
IPython : 7.28.0
pandas_datareader: None
bs4 : 4.10.0
bottleneck : 1.3.2
fsspec : 2021.10.0
fastparquet : None
gcsfs : None
matplotlib : 3.4.3
numexpr : 2.7.3
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pyxlsb : None
s3fs : None
scipy : 1.7.1
sqlalchemy : 1.3.24
tables : 3.6.1
tabulate : 0.8.9
xarray : 0.19.0
xlrd : 2.0.1
xlwt : None
numba : 0.54.0