Skip to content

BUG: Groupby with categorical multiIndex and timedelta returns incorrect type. #43891

Closed
@jamieforth

Description

@jamieforth

Groupby (observed=False) with a categorical MultiIndex and integer data values returns zero for categories that do not appear in the data, as seen in the first example (there are no wild parrots).

import pandas as pd

animals = ['Falcon', 'Parrot']
types = ['Captive', 'Wild']

df = pd.DataFrame({
    'animal': pd.Categorical(['Falcon', 'Falcon', 'Parrot', 'Parrot'],
                             categories=animals),
    'type': pd.Categorical(['Captive', 'Wild', 'Captive', 'Captive'],
                           categories=types),
    'time': [1, 2, 3, 4]
     })

df.set_index(['animal', 'type'], inplace=True)
df.groupby(level=['animal', 'type'], observed=False).sum()
index time
('Falcon', 'Captive') 1
('Falcon', 'Wild') 2
('Parrot', 'Captive') 7
('Parrot', 'Wild') 0

But when using Timedelta data values, an int (0) is used as the fill value for the missing category instead of a Timedelta, so the same call raises a TypeError.

import pandas as pd

animals = ['Falcon', 'Parrot']
types = ['Captive', 'Wild']

df = pd.DataFrame({
    'animal': pd.Categorical(['Falcon', 'Falcon', 'Parrot', 'Parrot'],
                             categories=animals),
    'type': pd.Categorical(['Captive', 'Wild', 'Captive', 'Captive'],
                           categories=types),
    'time': [1, 2, 3, 4]
     })
# Convert time to time delta.
df['time'] = pd.to_timedelta(df['time'])
df.set_index(['animal', 'type'], inplace=True)
df.groupby(level=['animal', 'type'], observed=False).sum()

Error:

  ---------------------------------------------------------------------------
  TypeError                                 Traceback (most recent call last)
  /tmp/ipykernel_106514/2243147148.py in <module>
       14 df['time'] = pd.to_timedelta(df['time'])
       15 df.set_index(['animal', 'type'], inplace=True)
  ---> 16 df.groupby(level=['animal', 'type'], observed=False).sum()

  ~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/groupby/groupby.py in sum(self, numeric_only, min_count)
     1851             )
     1852 
  -> 1853         return self._reindex_output(result, fill_value=0)
     1854 
     1855     @final

  ~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/groupby/groupby.py in _reindex_output(self, output, fill_value)
     3169                 "fill_value": fill_value,
     3170             }
  -> 3171             return output.reindex(**d)
     3172 
     3173         # GH 13204

  ~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
      322         @wraps(func)
      323         def wrapper(*args, **kwargs) -> Callable[..., Any]:
  --> 324             return func(*args, **kwargs)
      325 
      326         kind = inspect.Parameter.POSITIONAL_OR_KEYWORD

  ~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
     4770         kwargs.pop("axis", None)
     4771         kwargs.pop("labels", None)
  -> 4772         return super().reindex(**kwargs)
     4773 
     4774     @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])

  ~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
     4816 
     4817         # perform the reindex on the axes
  -> 4818         return self._reindex_axes(
     4819             axes, level, limit, tolerance, method, fill_value, copy
     4820         ).__finalize__(self, method="reindex")

  ~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
     4595         index = axes["index"]
     4596         if index is not None:
  -> 4597             frame = frame._reindex_index(
     4598                 index, method, copy, level, fill_value, limit, tolerance
     4599             )

  ~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
     4614             new_index, method=method, level=level, limit=limit, tolerance=tolerance
     4615         )
  -> 4616         return self._reindex_with_indexers(
     4617             {0: [new_index, indexer]},
     4618             copy=copy,

  ~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
     4881 
     4882             # TODO: speed up on homogeneous DataFrame objects
  -> 4883             new_data = new_data.reindex_indexer(
     4884                 index,
     4885                 indexer,

  ~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/internals/managers.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy, consolidate, only_slice)
      678             )
      679         else:
  --> 680             new_blocks = [
      681                 blk.take_nd(
      682                     indexer,

  ~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/internals/managers.py in <listcomp>(.0)
      679         else:
      680             new_blocks = [
  --> 681                 blk.take_nd(
      682                     indexer,
      683                     axis=1,

  ~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/internals/blocks.py in take_nd(self, indexer, axis, new_mgr_locs, fill_value)
     1143             allow_fill = True
     1144 
  -> 1145         new_values = algos.take_nd(
     1146             values, indexer, axis=axis, allow_fill=allow_fill, fill_value=fill_value
     1147         )

  ~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/array_algos/take.py in take_nd(arr, indexer, axis, fill_value, allow_fill)
       99             # i.e. DatetimeArray, TimedeltaArray
      100             arr = cast("NDArrayBackedExtensionArray", arr)
  --> 101             return arr.take(
      102                 indexer, fill_value=fill_value, allow_fill=allow_fill, axis=axis
      103             )

  ~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/arrays/_mixins.py in take(self, indices, allow_fill, fill_value, axis)
       95     ) -> NDArrayBackedExtensionArrayT:
       96         if allow_fill:
  ---> 97             fill_value = self._validate_scalar(fill_value)
       98 
       99         new_data = take(

  ~/.local/share/virtualenvs/dv-eRvxryI1/lib/python3.9/site-packages/pandas/core/arrays/datetimelike.py in _validate_scalar(self, value, allow_listlike, setitem, unbox)
      643         else:
      644             msg = self._validation_error_message(value, allow_listlike)
  --> 645             raise TypeError(msg)
      646 
      647         if not unbox:

  TypeError: value should be a 'Timedelta' or 'NaT'. Got 'int' instead.

pd.show_versions()

INSTALLED VERSIONS

commit : 73c6825
python : 3.9.2.final.0
python-bits : 64
OS : Linux
OS-release : 5.10.0-8-amd64
Version : #1 SMP Debian 5.10.46-5 (2021-09-23)
machine : x86_64
processor :
byteorder : little
LC_ALL : None
LANG : en_GB.UTF-8
LOCALE : en_GB.UTF-8

pandas : 1.3.3
numpy : 1.19.5
pytz : 2021.3
dateutil : 2.8.2
pip : 21.2.4
setuptools : 58.1.0
Cython : 0.29.24
pytest : 6.2.5
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : 4.6.3
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 3.0.2
IPython : 7.28.0
pandas_datareader: None
bs4 : 4.10.0
bottleneck : 1.3.2
fsspec : 2021.10.0
fastparquet : None
gcsfs : None
matplotlib : 3.4.3
numexpr : 2.7.3
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pyxlsb : None
s3fs : None
scipy : 1.7.1
sqlalchemy : 1.3.24
tables : 3.6.1
tabulate : 0.8.9
xarray : 0.19.0
xlrd : 2.0.1
xlwt : None
numba : 0.54.0

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions