Skip to content

BUG: Error writing DataFrame with categorical type column and interval data to a CSV file #46297

Closed
@pjireland

Description

@pjireland

Pandas version checks

  • I have checked that this issue has not already been reported.

  • I have confirmed this bug exists on the latest version of pandas.

  • I have confirmed this bug exists on the main branch of pandas.

Reproducible Example

import pandas as pd
df = pd.DataFrame(index=[0], columns=["a"])
df.at[0, "a"] = pd.Interval(pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02"))
df["a"] = df["a"].astype("category") # astype("object") does not raise an error
df.to_csv("test.csv")

Issue Description

Running the example above raises the error shown below. It appears to be triggered when a column of categorical dtype whose categories are Interval objects is written to a CSV file.

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Input In [41], in <module>
----> 1 df.to_csv("test.csv")

File ~\Anaconda3\envs\wedev\lib\site-packages\pandas\core\generic.py:3563, in NDFrame.to_csv(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, date_format, doublequote, escapechar, decimal, errors, storage_options)
   3552 df = self if isinstance(self, ABCDataFrame) else self.to_frame()
   3554 formatter = DataFrameFormatter(
   3555     frame=df,
   3556     header=header,
   (...)
   3560     decimal=decimal,
   3561 )
-> 3563 return DataFrameRenderer(formatter).to_csv(
   3564     path_or_buf,
   3565     line_terminator=line_terminator,
   3566     sep=sep,
   3567     encoding=encoding,
   3568     errors=errors,
   3569     compression=compression,
   3570     quoting=quoting,
   3571     columns=columns,
   3572     index_label=index_label,
   3573     mode=mode,
   3574     chunksize=chunksize,
   3575     quotechar=quotechar,
   3576     date_format=date_format,
   3577     doublequote=doublequote,
   3578     escapechar=escapechar,
   3579     storage_options=storage_options,
   3580 )

File ~\Anaconda3\envs\wedev\lib\site-packages\pandas\io\formats\format.py:1180, in DataFrameRenderer.to_csv(self, path_or_buf, encoding, sep, columns, index_label, mode, compression, quoting, quotechar, line_terminator, chunksize, date_format, doublequote, escapechar, errors, storage_options)
   1159     created_buffer = False
   1161 csv_formatter = CSVFormatter(
   1162     path_or_buf=path_or_buf,
   1163     line_terminator=line_terminator,
   (...)
   1178     formatter=self.fmt,
   1179 )
-> 1180 csv_formatter.save()
   1182 if created_buffer:
   1183     assert isinstance(path_or_buf, StringIO)

File ~\Anaconda3\envs\wedev\lib\site-packages\pandas\io\formats\csvs.py:261, in CSVFormatter.save(self)
    241 with get_handle(
    242     self.filepath_or_buffer,
    243     self.mode,
   (...)
    249 
    250     # Note: self.encoding is irrelevant here
    251     self.writer = csvlib.writer(
    252         handles.handle,
    253         lineterminator=self.line_terminator,
   (...)
    258         quotechar=self.quotechar,
    259     )
--> 261     self._save()

File ~\Anaconda3\envs\wedev\lib\site-packages\pandas\io\formats\csvs.py:266, in CSVFormatter._save(self)
    264 if self._need_to_save_header:
    265     self._save_header()
--> 266 self._save_body()

File ~\Anaconda3\envs\wedev\lib\site-packages\pandas\io\formats\csvs.py:304, in CSVFormatter._save_body(self)
    302 if start_i >= end_i:
    303     break
--> 304 self._save_chunk(start_i, end_i)

File ~\Anaconda3\envs\wedev\lib\site-packages\pandas\io\formats\csvs.py:311, in CSVFormatter._save_chunk(self, start_i, end_i)
    308 slicer = slice(start_i, end_i)
    309 df = self.obj.iloc[slicer]
--> 311 res = df._mgr.to_native_types(**self._number_format)
    312 data = [res.iget_values(i) for i in range(len(res.items))]
    314 ix = self.data_index[slicer]._format_native_types(**self._number_format)

File ~\Anaconda3\envs\wedev\lib\site-packages\pandas\core\internals\managers.py:473, in BaseBlockManager.to_native_types(self, **kwargs)
    468 def to_native_types(self: T, **kwargs) -> T:
    469     """
    470     Convert values to native types (strings / python objects) that are used
    471     in formatting (repr / csv).
    472     """
--> 473     return self.apply("to_native_types", **kwargs)

File ~\Anaconda3\envs\wedev\lib\site-packages\pandas\core\internals\managers.py:304, in BaseBlockManager.apply(self, f, align_keys, ignore_failures, **kwargs)
    302         applied = b.apply(f, **kwargs)
    303     else:
--> 304         applied = getattr(b, f)(**kwargs)
    305 except (TypeError, NotImplementedError):
    306     if not ignore_failures:

File ~\Anaconda3\envs\wedev\lib\site-packages\pandas\core\internals\blocks.py:636, in Block.to_native_types(self, na_rep, quoting, **kwargs)
    633 @final
    634 def to_native_types(self, na_rep="nan", quoting=None, **kwargs):
    635     """convert to our native types format"""
--> 636     result = to_native_types(self.values, na_rep=na_rep, quoting=quoting, **kwargs)
    637     return self.make_block(result)

File ~\Anaconda3\envs\wedev\lib\site-packages\pandas\core\internals\blocks.py:2148, in to_native_types(values, na_rep, quoting, float_format, decimal, **kwargs)
   2145 """convert to our native types format"""
   2146 if isinstance(values, Categorical):
   2147     # GH#40754 Convert categorical datetimes to datetime array
-> 2148     values = take_nd(
   2149         values.categories._values,
   2150         ensure_platform_int(values._codes),
   2151         fill_value=na_rep,
   2152     )
   2154 values = ensure_wrapped_if_datetimelike(values)
   2156 if isinstance(values, (DatetimeArray, TimedeltaArray)):

File ~\Anaconda3\envs\wedev\lib\site-packages\pandas\core\array_algos\take.py:114, in take_nd(arr, indexer, axis, fill_value, allow_fill)
    109         arr = cast("NDArrayBackedExtensionArray", arr)
    110         return arr.take(
    111             indexer, fill_value=fill_value, allow_fill=allow_fill, axis=axis
    112         )
--> 114     return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
    116 arr = np.asarray(arr)
    117 return _take_nd_ndarray(arr, indexer, axis, fill_value, allow_fill)

File ~\Anaconda3\envs\wedev\lib\site-packages\pandas\core\arrays\interval.py:1060, in IntervalArray.take(self, indices, allow_fill, fill_value, axis, **kwargs)
   1058 fill_left = fill_right = fill_value
   1059 if allow_fill:
-> 1060     fill_left, fill_right = self._validate_scalar(fill_value)
   1062 left_take = take(
   1063     self._left, indices, allow_fill=allow_fill, fill_value=fill_left
   1064 )
   1065 right_take = take(
   1066     self._right, indices, allow_fill=allow_fill, fill_value=fill_right
   1067 )

File ~\Anaconda3\envs\wedev\lib\site-packages\pandas\core\arrays\interval.py:1102, in IntervalArray._validate_scalar(self, value)
   1100     left = right = value
   1101 else:
-> 1102     raise TypeError(
   1103         "can only insert Interval objects and NA into an IntervalArray"
   1104     )
   1105 return left, right

TypeError: can only insert Interval objects and NA into an IntervalArray

Expected Behavior

I expect writing to CSV to succeed, as it does when astype("category") is replaced with astype("object") in the example above.

Installed Versions

INSTALLED VERSIONS

commit : bb1f651
python : 3.9.10.final.0
python-bits : 64
OS : Windows
OS-release : 10
Version : 10.0.19042
machine : AMD64
processor : Intel64 Family 6 Model 142 Stepping 12, GenuineIntel
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : English_United States.1252

pandas : 1.4.0
numpy : 1.21.5
pytz : 2021.3
dateutil : 2.8.2
pip : 22.0.2
setuptools : 59.8.0
Cython : 0.29.27
pytest : 6.2.5
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : 3.0.2
lxml.etree : 4.7.1
html5lib : None
pymysql : 1.0.2
psycopg2 : None
jinja2 : 3.0.3
IPython : 8.0.1
pandas_datareader: None
bs4 : 4.10.0
bottleneck : None
fastparquet : 0.8.0
fsspec : 2022.01.0
gcsfs : None
matplotlib : 3.5.1
numba : 0.55.1
numexpr : 2.7.3
odfpy : None
openpyxl : 3.0.9
pandas_gbq : None
pyarrow : 3.0.0
pyreadstat : None
pyxlsb : None
s3fs : 0.4.2
scipy : 1.7.3
sqlalchemy : 1.4.31
tables : 3.7.0
tabulate : 0.8.9
xarray : 0.21.1
xlrd : 2.0.1
xlwt : None
zstandard : None

Metadata

Metadata

Assignees

No one assigned

    Labels

    Bug; Categorical (Categorical Data Type); IO CSV (read_csv, to_csv); NA - MaskedArrays (Related to pd.NA and nullable extension arrays); Regression (Functionality that used to work in a prior pandas version)

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions