Description
Code Sample
Build the model data frame:
df = pd.DataFrame({
'nr': [1,2,3,4,5,6,7,8],
'cat_ord': list('aabbccdd'),
'cat':list('aaaabbbb')
})
df = df.astype({'cat': 'category', 'cat_ord': 'category'})
df['cat_ord'] = df['cat_ord'].cat.as_ordered()
When grouping, single aggregations on a numeric column work:
df.groupby('cat').agg({'nr': 'min'})
nr
cat
a 1
b 5
Single aggregations on an ordered categorical column work, but drop the grouping index:
df.groupby('cat').agg({'cat_ord': 'min'})
cat_ord
0 a
1 c
Combined single aggregations on a numeric and an ordered categorical column work:
df.groupby('cat').agg({'nr': 'min', 'cat_ord': 'min'})
nr cat_ord
cat
a 1 a
b 5 c
Multiple aggregations on an ordered categorical column work, but drop the grouping index:
df.groupby('cat').agg({'cat_ord': ['min', 'max']})
cat_ord
min max
0 a b
1 c d
Combining a single aggregation on a numeric column with multiple aggregations on an ordered categorical column fails with a TypeError:
df.groupby('cat').agg({'nr': 'min', 'cat_ord': ['min', 'max']})
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-148-b446de510106> in <module>
----> 1 df.groupby('cat').agg({'nr': 'min', 'cat_ord': ['min', 'max']})
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\generic.py in aggregate(self, arg, *args, **kwargs)
1453 @Appender(_shared_docs["aggregate"])
1454 def aggregate(self, arg=None, *args, **kwargs):
-> 1455 return super().aggregate(arg, *args, **kwargs)
1456
1457 agg = aggregate
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\generic.py in aggregate(self, func, *args, **kwargs)
227 func = _maybe_mangle_lambdas(func)
228
--> 229 result, how = self._aggregate(func, _level=_level, *args, **kwargs)
230 if how is None:
231 return result
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\base.py in _aggregate(self, arg, *args, **kwargs)
528 # return a MI DataFrame
529
--> 530 return concat([result[k] for k in keys], keys=keys, axis=1), True
531
532 elif isinstance(self, ABCSeries) and is_any_series():
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\reshape\concat.py in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, sort, copy)
256 )
257
--> 258 return op.get_result()
259
260
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\reshape\concat.py in get_result(self)
466 obj_labels = mgr.axes[ax]
467 if not new_labels.equals(obj_labels):
--> 468 indexers[ax] = obj_labels.reindex(new_labels)[1]
469
470 mgrs_indexers.append((obj._data, indexers))
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\category.py in reindex(self, target, method, level, limit, tolerance)
616 # coerce to a regular index here!
617 result = Index(np.array(self), name=self.name)
--> 618 new_target, indexer, _ = result._reindex_non_unique(np.array(target))
619 else:
620
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _reindex_non_unique(self, target)
3434
3435 target = ensure_index(target)
-> 3436 indexer, missing = self.get_indexer_non_unique(target)
3437 check = indexer != -1
3438 new_labels = self.take(indexer[check])
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_indexer_non_unique(self, target)
4792 tgt_values = target._ndarray_values
4793
-> 4794 indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
4795 return ensure_platform_int(indexer), missing
4796
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_indexer_non_unique()
TypeError: '<' not supported between instances of 'str' and 'int'
Combining multiple aggregations on a numeric column with a single aggregation on an ordered categorical column fails with the same TypeError:
df.groupby('cat').agg({'nr': ['min', 'max'], 'cat_ord': 'min'})
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-164-b1d70184bd81> in <module>
----> 1 df.groupby('cat').agg({'nr': ['min', 'max'], 'cat_ord': 'min'})
...
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_indexer_non_unique()
TypeError: '<' not supported between instances of 'str' and 'int'
Problem description
Aggregations on ordered categorical columns either drop the grouping index or raise a TypeError, as shown above.
This makes it hard to calculate combined aggregations over big data sets correctly and efficiently.
Expected Output
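Aggregations on ordered categorical columns should behave the same as on non-categorical columns: the grouping index should be kept, and combining single and multiple aggregations in one call should not raise.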
Output of pd.show_versions()
INSTALLED VERSIONS
commit : None
python : 3.7.3.final.0
python-bits : 64
OS : Windows
OS-release : 10
machine : AMD64
processor : Intel64 Family 6 Model 158 Stepping 10, GenuineIntel
byteorder : little
LC_ALL : None
LANG : None
LOCALE : None.None
pandas : 0.25.0
numpy : 1.16.4
pytz : 2019.1
dateutil : 2.8.0
pip : 19.1.1
setuptools : 41.0.1
Cython : 0.29.12
pytest : 5.0.1
hypothesis : None
sphinx : 2.1.2
blosc : None
feather : None
xlsxwriter : 1.1.8
lxml.etree : 4.3.4
html5lib : 1.0.1
pymysql : None
psycopg2 : None
jinja2 : 2.10.1
IPython : 7.7.0
pandas_datareader: None
bs4 : 4.7.1
bottleneck : 1.2.1
fastparquet : None
gcsfs : None
lxml.etree : 4.3.4
matplotlib : 3.1.1
numexpr : 2.6.9
odfpy : None
openpyxl : 2.6.2
pandas_gbq : None
pyarrow : 0.11.1
pytables : None
s3fs : None
scipy : 1.3.0
sqlalchemy : 1.3.5
tables : 3.5.2
xarray : None
xlrd : 1.2.0
xlwt : 1.3.0
xlsxwriter : 1.1.8