Skip to content

BUG: Pyarrow backend fails to export index with timezone ArrowInvalid exception #56282

Open
@mattharrison

Description

@mattharrison

Pandas version checks

  • I have checked that this issue has not already been reported.

  • I have confirmed this bug exists on the latest version of pandas.

  • I have confirmed this bug exists on the main branch of pandas.

Reproducible Example

import pandas as pd
import io
data = '''agency_cd,site_no,datetime,tz_cd,144166_00060,144166_00060_cd,144167_00065,144167_00065_cd
USGS,9333500,2001-05-07 01:00:00,MDT,71.0,A:[91],,
USGS,9333500,2001-05-07 01:15:00,MDT,71.0,A:[91],,
USGS,9333500,2001-05-07 01:30:00,MDT,71.0,A:[91],,
USGS,9333500,2001-05-07 01:45:00,MDT,70.0,A:[91],,
USGS,9333500,2001-05-07 02:00:00,MDT,70.0,A:[91],,
USGS,9333500,2001-05-07 02:15:00,MDT,69.0,A:[91],,
USGS,9333500,2001-05-07 02:30:00,MDT,70.0,A:[91],,
USGS,9333500,2001-05-07 02:45:00,MDT,70.0,A:[91],,
USGS,9333500,2001-05-07 03:00:00,MDT,70.0,A:[91],,'''  # inline CSV sample: 'datetime' carries no UTC offset; the zone code is in 'tz_cd'
df = pd.read_csv(io.StringIO(data), 
                    dtype_backend='pyarrow', engine='pyarrow')  # parse with the pyarrow engine and request pyarrow-backed dtypes
def to_denver_time(df_, time_col, tz_col):
    """Localize ``time_col`` group-wise by the zone named in ``tz_col``, then convert to America/Denver."""
    def _localize(naive):  # inside transform, ``naive.name`` is used as the zone name of the group
        aware = pd.to_datetime(naive).dt.tz_localize(naive.name, ambiguous=True)
        return aware.dt.tz_convert('America/Denver')

    tz_codes = df_[tz_col].replace('MDT', 'MST7MDT')  # 'MDT' is relabelled to 'MST7MDT' before grouping
    relabeled = df_.assign(**{tz_col: tz_codes})  # assign returns a new frame; df_ is left unchanged
    return relabeled.groupby(tz_col)[time_col].transform(_localize)

def tweak_river(df_):
    """Return ``df_`` with renamed measurement columns, a fixed column subset, and a Denver-time ``datetime`` index."""
    new_names = {'144166_00060': 'cfs', '144167_00065': 'gage_height'}
    keep = ['datetime', 'agency_cd', 'site_no', 'tz_cd', 'cfs', 'gage_height']
    denver = to_denver_time(df_, 'datetime', 'tz_cd')  # computed from the original, un-renamed frame
    with_time = df_.assign(datetime=denver)
    renamed = with_time.rename(columns=new_names)
    # Columns absent from ``keep`` are dropped by the selection below.
    return renamed.loc[:, keep].set_index('datetime')

dd = tweak_river(df)  # frame indexed by the 'datetime' column converted to America/Denver
print(dd)
dd.to_parquet('/tmp/dd.pq')  # the write completes; the failure is on the read below
pd.read_parquet('/tmp/dd.pq', dtype_backend='pyarrow'  # raises ArrowInvalid (see the traceback in this report)
               )

Issue Description

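`pd.read_parquet` fails to read this file back when `dtype_backend='pyarrow'` is specified. Writing with `to_parquet` succeeds, but reading raises `ArrowInvalid` while pyarrow reconstructs the timezone-aware index: it tries to localize the index to UTC even though the timestamps already carry the 'America/Denver' timezone.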

---------------------------------------------------------------------------
ArrowInvalid                              Traceback (most recent call last)
Cell In[116], line 38
     36 print(dd)
     37 dd.to_parquet('/tmp/dd.pq')
---> 38 pd.read_parquet('/tmp/dd.pq', dtype_backend='pyarrow'
     39                )

File ~/.envs/menv/lib/python3.10/site-packages/pandas/io/parquet.py:670, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)
    667     use_nullable_dtypes = False
    668 check_dtype_backend(dtype_backend)
--> 670 return impl.read(
    671     path,
    672     columns=columns,
    673     filters=filters,
    674     storage_options=storage_options,
    675     use_nullable_dtypes=use_nullable_dtypes,
    676     dtype_backend=dtype_backend,
    677     filesystem=filesystem,
    678     **kwargs,
    679 )

File ~/.envs/menv/lib/python3.10/site-packages/pandas/io/parquet.py:279, in PyArrowImpl.read(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs)
    271 try:
    272     pa_table = self.api.parquet.read_table(
    273         path_or_handle,
    274         columns=columns,
   (...)
    277         **kwargs,
    278     )
--> 279     result = pa_table.to_pandas(**to_pandas_kwargs)
    281     if manager == "array":
    282         result = result._as_manager("array", copy=False)

File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/array.pxi:867, in pyarrow.lib._PandasConvertible.to_pandas()

File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/table.pxi:4085, in pyarrow.lib.Table._to_pandas()

File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/pandas_compat.py:762, in table_to_blockmanager(options, table, categories, ignore_metadata, types_mapper)
    760     index_descriptors = pandas_metadata['index_columns']
    761     table = _add_any_metadata(table, pandas_metadata)
--> 762     table, index = _reconstruct_index(table, index_descriptors,
    763                                       all_columns, types_mapper)
    764     ext_columns_dtypes = _get_extension_dtypes(
    765         table, all_columns, types_mapper)
    766 else:

File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/pandas_compat.py:913, in _reconstruct_index(table, index_descriptors, all_columns, types_mapper)
    911 for descr in index_descriptors:
    912     if isinstance(descr, str):
--> 913         result_table, index_level, index_name = _extract_index_level(
    914             table, result_table, descr, field_name_to_metadata, types_mapper)
    915         if index_level is None:
    916             # ARROW-1883: the serialized index column was not found
    917             continue

File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/pandas_compat.py:970, in _extract_index_level(table, result_table, field_name, field_name_to_metadata, types_mapper)
    967     values = values.copy()
    969 if isinstance(col.type, pa.lib.TimestampType) and col.type.tz is not None:
--> 970     index_level = make_tz_aware(pd.Series(values, copy=False), col.type.tz)
    971 else:
    972     index_level = pd.Series(values, dtype=values.dtype, copy=False)

File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/pandas_compat.py:1216, in make_tz_aware(series, tz)
   1212 """
   1213 Make a datetime64 Series timezone-aware for the given tz
   1214 """
   1215 tz = pa.lib.string_to_tzinfo(tz)
-> 1216 series = (series.dt.tz_localize('utc')
   1217                 .dt.tz_convert(tz))
   1218 return series

File ~/.envs/menv/lib/python3.10/site-packages/pandas/core/accessor.py:112, in PandasDelegate._add_delegate_accessors.<locals>._create_delegator_method.<locals>.f(self, *args, **kwargs)
    111 def f(self, *args, **kwargs):
--> 112     return self._delegate_method(name, *args, **kwargs)

File ~/.envs/menv/lib/python3.10/site-packages/pandas/core/indexes/accessors.py:205, in ArrowTemporalProperties._delegate_method(self, name, *args, **kwargs)
    200 if not hasattr(self._parent.array, f"_dt_{name}"):
    201     raise NotImplementedError(
    202         f"dt.{name} is not supported for {self._parent.dtype}"
    203     )
--> 205 result = getattr(self._parent.array, f"_dt_{name}")(*args, **kwargs)
    207 if self._orig is not None:
    208     index = self._orig.index

File ~/.envs/menv/lib/python3.10/site-packages/pandas/core/arrays/arrow/array.py:2541, in ArrowExtensionArray._dt_tz_localize(self, tz, ambiguous, nonexistent)
   2539     result = self._pa_array.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit))
   2540 else:
-> 2541     result = pc.assume_timezone(
   2542         self._pa_array, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa
   2543     )
   2544 return type(self)(result)

File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/compute.py:262, in _make_generic_wrapper.<locals>.wrapper(memory_pool, options, *args, **kwargs)
    260 if args and isinstance(args[0], Expression):
    261     return Expression._call(func_name, list(args), options)
--> 262 return func.call(args, options, memory_pool)

File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/_compute.pyx:367, in pyarrow._compute.Function.call()

File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/error.pxi:100, in pyarrow.lib.check_status()

ArrowInvalid: Timestamps already have a timezone: 'America/Denver'. Cannot localize to 'utc'.
dd.index

Expected Behavior

Reading the file back works when I don't specify `dtype_backend`. I expect the same round trip to succeed with `dtype_backend='pyarrow'`.

Installed Versions

INSTALLED VERSIONS

commit : e86ed37
python : 3.10.13.final.0
python-bits : 64
OS : Darwin
OS-release : 21.6.0
Version : Darwin Kernel Version 21.6.0: Wed Aug 10 14:28:23 PDT 2022; root:xnu-8020.141.5~2/RELEASE_ARM64_T6000
machine : arm64
processor : arm
byteorder : little
LC_ALL : en_US.UTF-8
LANG : None
LOCALE : en_US.UTF-8

pandas : 2.1.1
numpy : 1.23.5
pytz : 2023.3
dateutil : 2.8.2
setuptools : 67.6.1
pip : 23.2.1
Cython : 3.0.4
pytest : 7.2.0
hypothesis : 6.81.2
sphinx : None
blosc : None
feather : None
xlsxwriter : 3.1.2
lxml.etree : 4.9.2
html5lib : 1.1
pymysql : None
psycopg2 : None
jinja2 : 3.1.2
IPython : 8.8.0
pandas_datareader : None
bs4 : 4.11.1
bottleneck : None
dataframe-api-compat: None
fastparquet : None
fsspec : 2023.3.0
gcsfs : None
matplotlib : 3.6.2
numba : 0.56.4
numexpr : 2.8.4
odfpy : None
openpyxl : 3.0.10
pandas_gbq : None
pyarrow : 13.0.0
pyreadstat : None
pyxlsb : None
s3fs : None
scipy : 1.10.0
sqlalchemy : 2.0.21
tables : None
tabulate : 0.9.0
xarray : None
xlrd : 2.0.1
zstandard : None
tzdata : 2023.3
qtpy : None
pyqt5 : None

Metadata

Metadata

Assignees

No one assigned

    Labels

    Arrow (pyarrow functionality) · Bug · Timezones (Timezone data dtype) · Upstream issue (Issue related to pandas dependency)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions