Description
Pandas version checks
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import pandas as pd
from pandas import SparseDtype
df = pd.DataFrame([["a", 0],["b", 1], ["b", 2]], columns=["A","B"])
df["A"].astype(SparseDtype("category"))
# or: df["A"].astype(SparseDtype("category", fill_value="not_in_series"))
Issue Description
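I am unable to convert a dense categorical series to a sparse one when I leave `fill_value`
at its default, or set it to a value which does not exist in the series. As the stack trace
below shows, the error is raised while the resulting series is being rendered for display.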
Stacktrace:
ValueError Traceback (most recent call last)
File ~/repositories/arff-to-parquet/venv/lib/python3.10/site-packages/IPython/core/formatters.py:706, in PlainTextFormatter.__call__(self, obj)
699 stream = StringIO()
700 printer = pretty.RepresentationPrinter(stream, self.verbose,
701 self.max_width, self.newline,
702 max_seq_length=self.max_seq_length,
703 singleton_pprinters=self.singleton_printers,
704 type_pprinters=self.type_printers,
705 deferred_pprinters=self.deferred_printers)
--> 706 printer.pretty(obj)
707 printer.flush()
708 return stream.getvalue()
File ~/repositories/arff-to-parquet/venv/lib/python3.10/site-packages/IPython/lib/pretty.py:410, in RepresentationPrinter.pretty(self, obj)
407 return meth(obj, self, cycle)
408 if cls is not object \
409 and callable(cls.__dict__.get('__repr__')):
--> 410 return _repr_pprint(obj, self, cycle)
412 return _default_pprint(obj, self, cycle)
413 finally:
File ~/repositories/arff-to-parquet/venv/lib/python3.10/site-packages/IPython/lib/pretty.py:778, in _repr_pprint(obj, p, cycle)
776 """A pprint that just redirects to the normal repr function."""
777 # Find newlines and replace them with p.break()
--> 778 output = repr(obj)
779 lines = output.splitlines()
780 with p.group():
File ~/repositories/arff-to-parquet/venv/lib/python3.10/site-packages/pandas/core/series.py:1550, in Series.__repr__(self)
1548 # pylint: disable=invalid-repr-returned
1549 repr_params = fmt.get_series_repr_params()
-> 1550 return self.to_string(**repr_params)
File ~/repositories/arff-to-parquet/venv/lib/python3.10/site-packages/pandas/core/series.py:1643, in Series.to_string(self, buf, na_rep, float_format, header, index, length, dtype, name, max_rows, min_rows)
1597 """
1598 Render a string representation of the Series.
1599
(...)
1629 String representation of Series if ``buf=None``, otherwise None.
1630 """
1631 formatter = fmt.SeriesFormatter(
1632 self,
1633 name=name,
(...)
1641 max_rows=max_rows,
1642 )
-> 1643 result = formatter.to_string()
1645 # catch contract violations
1646 if not isinstance(result, str):
File ~/repositories/arff-to-parquet/venv/lib/python3.10/site-packages/pandas/io/formats/format.py:393, in SeriesFormatter.to_string(self)
390 return f"{type(self.series).__name__}([], {footer})"
392 fmt_index, have_header = self._get_formatted_index()
--> 393 fmt_values = self._get_formatted_values()
395 if self.is_truncated_vertically:
396 n_header_rows = 0
File ~/repositories/arff-to-parquet/venv/lib/python3.10/site-packages/pandas/io/formats/format.py:377, in SeriesFormatter._get_formatted_values(self)
376 def _get_formatted_values(self) -> list[str]:
--> 377 return format_array(
378 self.tr_series._values,
379 None,
380 float_format=self.float_format,
381 na_rep=self.na_rep,
382 leading_space=self.index,
383 )
File ~/repositories/arff-to-parquet/venv/lib/python3.10/site-packages/pandas/io/formats/format.py:1326, in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space, quoting)
1311 digits = get_option("display.precision")
1313 fmt_obj = fmt_klass(
1314 values,
1315 digits=digits,
(...)
1323 quoting=quoting,
1324 )
-> 1326 return fmt_obj.get_result()
File ~/repositories/arff-to-parquet/venv/lib/python3.10/site-packages/pandas/io/formats/format.py:1357, in GenericArrayFormatter.get_result(self)
1356 def get_result(self) -> list[str]:
-> 1357 fmt_values = self._format_strings()
1358 return _make_fixed_width(fmt_values, self.justify)
File ~/repositories/arff-to-parquet/venv/lib/python3.10/site-packages/pandas/io/formats/format.py:1658, in ExtensionArrayFormatter._format_strings(self)
1656 array = values._internal_get_values()
1657 else:
-> 1658 array = np.asarray(values)
1660 fmt_values = format_array(
1661 array,
1662 formatter,
(...)
1670 quoting=self.quoting,
1671 )
1672 return fmt_values
ValueError: object array method not producing an array
Expected Behavior
I expect it to "just work", similar to providing a fill value which does exist in the series, or how it works with other dtypes:
import pandas as pd
from pandas import SparseDtype
df = pd.DataFrame([["a", 0],["b", 1], ["b", 2]], columns=["A","B"])
# works, since "a" is a value present in the series
df["A"].astype(SparseDtype("category", fill_value="a"))
# also works, despite -1 not being present in the series
df["B"].astype(SparseDtype(int, fill_value=-1))
Installed Versions
INSTALLED VERSIONS
commit : 8dab54d
python : 3.10.5.final.0
python-bits : 64
OS : Darwin
OS-release : 21.5.0
Version : Darwin Kernel Version 21.5.0: Tue Apr 26 21:08:37 PDT 2022; root:xnu-8020.121.3~4/RELEASE_ARM64_T6000
machine : arm64
processor : arm
byteorder : little
LC_ALL : None
LANG : None
LOCALE : None.UTF-8
pandas : 1.5.2
numpy : 1.23.5
pytz : 2022.6
dateutil : 2.8.2
setuptools : 58.1.0
pip : 22.3.1
Cython : None
pytest : 7.2.0
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 3.1.2
IPython : 8.6.0
pandas_datareader: None
bs4 : 4.11.1
bottleneck : None
brotli : None
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : 3.6.2
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 10.0.0
pyreadstat : None
pyxlsb : None
s3fs : None
scipy : 1.9.3
snappy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : None
zstandard : None
tzdata : None