Description
Pandas version checks
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import seaborn as sns
df = sns.load_dataset('diamonds')
table = pa.Table.from_pandas(df)
df2 = table.to_pandas(types_mapper=pd.ArrowDtype)
df2.convert_dtypes(dtype_backend='numpy_nullable')
Issue Description
The last line of the example raises `KeyError: DictionaryType(dictionary<values=string, indices=int8, ordered=0>)`
with the following traceback:
Click me
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[27], line 9
7 table = pa.Table.from_pandas(df)
8 df2 = table.to_pandas(types_mapper=pd.ArrowDtype)
----> 9 df2.convert_dtypes(dtype_backend='numpy_nullable')
File [~/work/keepdb/env/lib/python3.11/site-packages/pandas/core/generic.py:7025](http://localhost:8888/lab/tree/~/work/keepdb/env/lib/python3.11/site-packages/pandas/core/generic.py#line=7024), in NDFrame.convert_dtypes(self, infer_objects, convert_string, convert_integer, convert_boolean, convert_floating, dtype_backend)
6896 """
6897 Convert columns to the best possible dtypes using dtypes supporting ``pd.NA``.
6898
(...)
7022 dtype: string
7023 """
7024 check_dtype_backend(dtype_backend)
-> 7025 new_mgr = self._mgr.convert_dtypes( # type: ignore[union-attr]
7026 infer_objects=infer_objects,
7027 convert_string=convert_string,
7028 convert_integer=convert_integer,
7029 convert_boolean=convert_boolean,
7030 convert_floating=convert_floating,
7031 dtype_backend=dtype_backend,
7032 )
7033 res = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
7034 return res.__finalize__(self, method="convert_dtypes")
File [~/work/keepdb/env/lib/python3.11/site-packages/pandas/core/internals/managers.py:456](http://localhost:8888/lab/tree/~/work/keepdb/env/lib/python3.11/site-packages/pandas/core/internals/managers.py#line=455), in BaseBlockManager.convert_dtypes(self, **kwargs)
453 else:
454 copy = True
--> 456 return self.apply(
457 "convert_dtypes", copy=copy, using_cow=using_copy_on_write(), **kwargs
458 )
File [~/work/keepdb/env/lib/python3.11/site-packages/pandas/core/internals/managers.py:364](http://localhost:8888/lab/tree/~/work/keepdb/env/lib/python3.11/site-packages/pandas/core/internals/managers.py#line=363), in BaseBlockManager.apply(self, f, align_keys, **kwargs)
362 applied = b.apply(f, **kwargs)
363 else:
--> 364 applied = getattr(b, f)(**kwargs)
365 result_blocks = extend_blocks(applied, result_blocks)
367 out = type(self).from_blocks(result_blocks, self.axes)
File [~/work/keepdb/env/lib/python3.11/site-packages/pandas/core/internals/blocks.py:694](http://localhost:8888/lab/tree/~/work/keepdb/env/lib/python3.11/site-packages/pandas/core/internals/blocks.py#line=693), in Block.convert_dtypes(self, copy, using_cow, infer_objects, convert_string, convert_integer, convert_boolean, convert_floating, dtype_backend)
691 for blk in blks:
692 # Determine dtype column by column
693 sub_blks = [blk] if blk.ndim == 1 or self.shape[0] == 1 else blk._split()
--> 694 dtypes = [
695 convert_dtypes(
696 b.values,
697 convert_string,
698 convert_integer,
699 convert_boolean,
700 convert_floating,
701 infer_objects,
702 dtype_backend,
703 )
704 for b in sub_blks
705 ]
706 if all(dtype == self.dtype for dtype in dtypes):
707 # Avoid block splitting if no dtype changes
708 rbs.append(blk.copy(deep=copy))
File [~/work/keepdb/env/lib/python3.11/site-packages/pandas/core/internals/blocks.py:695](http://localhost:8888/lab/tree/~/work/keepdb/env/lib/python3.11/site-packages/pandas/core/internals/blocks.py#line=694), in <listcomp>(.0)
691 for blk in blks:
692 # Determine dtype column by column
693 sub_blks = [blk] if blk.ndim == 1 or self.shape[0] == 1 else blk._split()
694 dtypes = [
--> 695 convert_dtypes(
696 b.values,
697 convert_string,
698 convert_integer,
699 convert_boolean,
700 convert_floating,
701 infer_objects,
702 dtype_backend,
703 )
704 for b in sub_blks
705 ]
706 if all(dtype == self.dtype for dtype in dtypes):
707 # Avoid block splitting if no dtype changes
708 rbs.append(blk.copy(deep=copy))
File [~/work/keepdb/env/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1150](http://localhost:8888/lab/tree/~/work/keepdb/env/lib/python3.11/site-packages/pandas/core/dtypes/cast.py#line=1149), in convert_dtypes(input_array, convert_string, convert_integer, convert_boolean, convert_floating, infer_objects, dtype_backend)
1147 inferred_dtype = ArrowDtype(pa_type)
1148 elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype):
1149 # GH 53648
-> 1150 inferred_dtype = _arrow_dtype_mapping()[inferred_dtype.pyarrow_dtype]
1152 # error: Incompatible return value type (got "Union[str, Union[dtype[Any],
1153 # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]")
1154 return inferred_dtype
KeyError: DictionaryType(dictionary<values=string, indices=int8, ordered=0>)
Expected Behavior
I would expect `df2.convert_dtypes(dtype_backend='numpy_nullable')`
to run without error and return a DataFrame.
Installed Versions
INSTALLED VERSIONS
commit : fd3f571
python : 3.11.2.final.0
python-bits : 64
OS : Darwin
OS-release : 23.2.0
Version : Darwin Kernel Version 23.2.0: Wed Nov 15 21:55:06 PST 2023; root:xnu-10002.61.3~2/RELEASE_ARM64_T6020
machine : arm64
processor : arm
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 2.2.0
numpy : 1.26.4
pytz : 2024.1
dateutil : 2.8.2
setuptools : 69.1.0
pip : 24.0
Cython : None
pytest : 8.0.0
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 3.1.3
IPython : 8.21.0
pandas_datareader : None
adbc-driver-postgresql: None
adbc-driver-sqlite : None
bs4 : 4.12.3
bottleneck : None
dataframe-api-compat : None
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : 3.8.2
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 15.0.0
pyreadstat : None
python-calamine : None
pyxlsb : None
s3fs : None
scipy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
zstandard : None
tzdata : 2024.1
qtpy : None
pyqt5 : None