Skip to content

BUG: groupby aggregation with dropna=False, nullable integer dtype and NA generates NumPy IndexError #49173

Closed
@FenderJazz

Description

@FenderJazz

Pandas version checks

  • I have checked that this issue has not already been reported.

  • I have confirmed this bug exists on the latest version of pandas.

  • I have confirmed this bug exists on the main branch of pandas.

Reproducible Example

import pandas as pd
visits = pd.DataFrame({'k': [2, 2, 3, pd.NA]}, dtype='UInt8')
print(visits.groupby('k', dropna=False).size())

Issue Description

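Grouping on a nullable-integer (`UInt8`) column that contains `pd.NA` with `dropna=False` raises an `IndexError`. As the traceback below shows, `BaseMaskedArray.factorize` calls `np.insert(uniques, na_code, 0)` with an index of 3 while `uniques` has only 2 elements.

Output (Pandas 1.5.0) is: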

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In [7], line 3
      1 import pandas as pd
      2 visits = pd.DataFrame({'k': [2, 2, 3, pd.NA]}, dtype='UInt8')
----> 3 print(visits.groupby('k', dropna=False).size())

File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\groupby\groupby.py:2398, in GroupBy.size(self)
   2385 @final
   2386 @Substitution(name="groupby")
   2387 @Appender(_common_see_also)
   2388 def size(self) -> DataFrame | Series:
   2389     """
   2390     Compute group sizes.
   2391 
   (...)
   2396         or a DataFrame if as_index is False.
   2397     """
-> 2398     result = self.grouper.size()
   2400     if self.axis == 1:
   2401         return DataFrame(
   2402             data=np.tile(result.values, (self.obj.shape[0], 1)),
   2403             columns=result.index,
   2404             index=self.obj.index,
   2405         )

File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\groupby\ops.py:913, in BaseGrouper.size(self)
    908 @final
    909 def size(self) -> Series:
    910     """
    911     Compute group sizes.
    912     """
--> 913     ids, _, ngroups = self.group_info
    914     out: np.ndarray | list
    915     if ngroups:

File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\_libs\properties.pyx:36, in pandas._libs.properties.CachedProperty.__get__()

File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\groupby\ops.py:947, in BaseGrouper.group_info(self)
    945 @cache_readonly
    946 def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
--> 947     comp_ids, obs_group_ids = self._get_compressed_codes()
    949     ngroups = len(obs_group_ids)
    950     comp_ids = ensure_platform_int(comp_ids)

File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\groupby\ops.py:978, in BaseGrouper._get_compressed_codes(self)
    975     # FIXME: compress_group_index's second return value is int64, not intp
    977 ping = self.groupings[0]
--> 978 return ping.codes, np.arange(len(ping.group_index), dtype=np.intp)

File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\groupby\grouper.py:622, in Grouping.codes(self)
    618 if self._codes is not None:
    619     # _codes is set in __init__ for MultiIndex cases
    620     return self._codes
--> 622 return self._codes_and_uniques[0]

File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\_libs\properties.pyx:36, in pandas._libs.properties.CachedProperty.__get__()

File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\groupby\grouper.py:693, in Grouping._codes_and_uniques(self)
    686     uniques = (
    687         self.grouping_vector.result_index._values  # type: ignore[assignment]
    688     )
    689 else:
    690     # GH35667, replace dropna=False with use_na_sentinel=False
    691     # error: Incompatible types in assignment (expression has type "Union[
    692     # ndarray[Any, Any], Index]", variable has type "Categorical")
--> 693     codes, uniques = algorithms.factorize(  # type: ignore[assignment]
    694         self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
    695     )
    696 return codes, uniques

File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\algorithms.py:792, in factorize(values, sort, na_sentinel, use_na_sentinel, size_hint)
    786 elif not isinstance(values.dtype, np.dtype):
    787     if (
    788         na_sentinel == -1 or na_sentinel is None
    789     ) and "use_na_sentinel" in inspect.signature(values.factorize).parameters:
    790         # Avoid using catch_warnings when possible
    791         # GH#46910 - TimelikeOps has deprecated signature
--> 792         codes, uniques = values.factorize(  # type: ignore[call-arg]
    793             use_na_sentinel=na_sentinel is not None
    794         )
    795     else:
    796         na_sentinel_arg = -1 if na_sentinel is None else na_sentinel

File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\arrays\masked.py:918, in BaseMaskedArray.factorize(self, na_sentinel, use_na_sentinel)
    916     codes[codes == -1] = na_code
    917     # dummy value for uniques; not used since uniques_mask will be True
--> 918     uniques = np.insert(uniques, na_code, 0)
    919     uniques_mask[na_code] = True
    920 uniques_ea = type(self)(uniques, uniques_mask)

File <__array_function__ internals>:180, in insert(*args, **kwargs)

File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\numpy\lib\function_base.py:5332, in insert(arr, obj, values, axis)
   5330 index = indices.item()
   5331 if index < -N or index > N:
-> 5332     raise IndexError(f"index {obj} is out of bounds for axis {axis} "
   5333                      f"with size {N}")
   5334 if (index < 0):
   5335     index += N

IndexError: index 3 is out of bounds for axis 0 with size 2

The stack trace appears to share some frames with the one in #46601, but the similarity may be superficial.

Expected Behavior

Expected output (and the output with Pandas 1.4.4) is:

k
2       2
3       1
<NA>    1
dtype: int64

Installed Versions

INSTALLED VERSIONS

commit : 87cfe4e
python : 3.9.13.final.0
python-bits : 64
OS : Windows
OS-release : 2012ServerR2
Version : 6.3.9600
machine : AMD64
processor : Intel64 Family 6 Model 26 Stepping 5, GenuineIntel
byteorder : little
LC_ALL : None
LANG : None
LOCALE : English_United Kingdom.1252

pandas : 1.5.0
numpy : 1.23.4
pytz : 2022.4
dateutil : 2.8.2
setuptools : 65.5.0
pip : 22.3
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : 2.9.3
jinja2 : 3.1.2
IPython : 8.5.0
pandas_datareader: None
bs4 : 4.11.1
bottleneck : None
brotli :
fastparquet : None
fsspec : 2022.8.2
gcsfs : None
matplotlib : 3.6.1
numba : None
numexpr : 2.8.3
odfpy : None
openpyxl : 3.0.10
pandas_gbq : None
pyarrow : None
pyreadstat : None
pyxlsb : None
s3fs : 2022.8.2
scipy : 1.9.1
snappy : None
sqlalchemy : 1.4.42
tables : None
tabulate : None
xarray : None
xlrd : 2.0.1
xlwt : None
zstandard : None
tzdata : None

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions