Description
Pandas version checks
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import pandas as pd
visits = pd.DataFrame({'k': [2, 2, 3, pd.NA]}, dtype='UInt8')
print(visits.groupby('k', dropna=False).size())
Issue Description
With pandas 1.5.0, running the example raises an IndexError instead of printing the group sizes:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
Cell In [7], line 3
1 import pandas as pd
2 visits = pd.DataFrame({'k': [2, 2, 3, pd.NA]}, dtype='UInt8')
----> 3 print(visits.groupby('k', dropna=False).size())
File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\groupby\groupby.py:2398, in GroupBy.size(self)
2385 @final
2386 @Substitution(name="groupby")
2387 @Appender(_common_see_also)
2388 def size(self) -> DataFrame | Series:
2389 """
2390 Compute group sizes.
2391
(...)
2396 or a DataFrame if as_index is False.
2397 """
-> 2398 result = self.grouper.size()
2400 if self.axis == 1:
2401 return DataFrame(
2402 data=np.tile(result.values, (self.obj.shape[0], 1)),
2403 columns=result.index,
2404 index=self.obj.index,
2405 )
File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\groupby\ops.py:913, in BaseGrouper.size(self)
908 @final
909 def size(self) -> Series:
910 """
911 Compute group sizes.
912 """
--> 913 ids, _, ngroups = self.group_info
914 out: np.ndarray | list
915 if ngroups:
File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\_libs\properties.pyx:36, in pandas._libs.properties.CachedProperty.__get__()
File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\groupby\ops.py:947, in BaseGrouper.group_info(self)
945 @cache_readonly
946 def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
--> 947 comp_ids, obs_group_ids = self._get_compressed_codes()
949 ngroups = len(obs_group_ids)
950 comp_ids = ensure_platform_int(comp_ids)
File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\groupby\ops.py:978, in BaseGrouper._get_compressed_codes(self)
975 # FIXME: compress_group_index's second return value is int64, not intp
977 ping = self.groupings[0]
--> 978 return ping.codes, np.arange(len(ping.group_index), dtype=np.intp)
File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\groupby\grouper.py:622, in Grouping.codes(self)
618 if self._codes is not None:
619 # _codes is set in __init__ for MultiIndex cases
620 return self._codes
--> 622 return self._codes_and_uniques[0]
File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\_libs\properties.pyx:36, in pandas._libs.properties.CachedProperty.__get__()
File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\groupby\grouper.py:693, in Grouping._codes_and_uniques(self)
686 uniques = (
687 self.grouping_vector.result_index._values # type: ignore[assignment]
688 )
689 else:
690 # GH35667, replace dropna=False with use_na_sentinel=False
691 # error: Incompatible types in assignment (expression has type "Union[
692 # ndarray[Any, Any], Index]", variable has type "Categorical")
--> 693 codes, uniques = algorithms.factorize( # type: ignore[assignment]
694 self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
695 )
696 return codes, uniques
File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\algorithms.py:792, in factorize(values, sort, na_sentinel, use_na_sentinel, size_hint)
786 elif not isinstance(values.dtype, np.dtype):
787 if (
788 na_sentinel == -1 or na_sentinel is None
789 ) and "use_na_sentinel" in inspect.signature(values.factorize).parameters:
790 # Avoid using catch_warnings when possible
791 # GH#46910 - TimelikeOps has deprecated signature
--> 792 codes, uniques = values.factorize( # type: ignore[call-arg]
793 use_na_sentinel=na_sentinel is not None
794 )
795 else:
796 na_sentinel_arg = -1 if na_sentinel is None else na_sentinel
File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\pandas\core\arrays\masked.py:918, in BaseMaskedArray.factorize(self, na_sentinel, use_na_sentinel)
916 codes[codes == -1] = na_code
917 # dummy value for uniques; not used since uniques_mask will be True
--> 918 uniques = np.insert(uniques, na_code, 0)
919 uniques_mask[na_code] = True
920 uniques_ea = type(self)(uniques, uniques_mask)
File <__array_function__ internals>:180, in insert(*args, **kwargs)
File C:\ProgramData\Anaconda3\envs\test\lib\site-packages\numpy\lib\function_base.py:5332, in insert(arr, obj, values, axis)
5330 index = indices.item()
5331 if index < -N or index > N:
-> 5332 raise IndexError(f"index {obj} is out of bounds for axis {axis} "
5333 f"with size {N}")
5334 if (index < 0):
5335 index += N
IndexError: index 3 is out of bounds for axis 0 with size 2
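The error is raised in BaseMaskedArray.factorize, where np.insert is called with na_code = 3 on a uniques array of length 2. The stack trace shares some frames with #46601, but the similarity may be superficial.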
Expected Behavior
Expected output (which is also the actual output with pandas 1.4.4):
k
2       2
3       1
<NA>    1
dtype: int64
Installed Versions
INSTALLED VERSIONS
commit : 87cfe4e
python : 3.9.13.final.0
python-bits : 64
OS : Windows
OS-release : 2012ServerR2
Version : 6.3.9600
machine : AMD64
processor : Intel64 Family 6 Model 26 Stepping 5, GenuineIntel
byteorder : little
LC_ALL : None
LANG : None
LOCALE : English_United Kingdom.1252
pandas : 1.5.0
numpy : 1.23.4
pytz : 2022.4
dateutil : 2.8.2
setuptools : 65.5.0
pip : 22.3
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : 2.9.3
jinja2 : 3.1.2
IPython : 8.5.0
pandas_datareader: None
bs4 : 4.11.1
bottleneck : None
brotli :
fastparquet : None
fsspec : 2022.8.2
gcsfs : None
matplotlib : 3.6.1
numba : None
numexpr : 2.8.3
odfpy : None
openpyxl : 3.0.10
pandas_gbq : None
pyarrow : None
pyreadstat : None
pyxlsb : None
s3fs : 2022.8.2
scipy : 1.9.1
snappy : None
sqlalchemy : 1.4.42
tables : None
tabulate : None
xarray : None
xlrd : 2.0.1
xlwt : None
zstandard : None
tzdata : None