Description
Pandas version checks
- [x] I have checked that this issue has not already been reported.
- [x] I have confirmed this bug exists on the latest version of pandas.
- [x] I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import pandas as pd
import io
data = '''agency_cd,site_no,datetime,tz_cd,144166_00060,144166_00060_cd,144167_00065,144167_00065_cd
USGS,9333500,2001-05-07 01:00:00,MDT,71.0,A:[91],,
USGS,9333500,2001-05-07 01:15:00,MDT,71.0,A:[91],,
USGS,9333500,2001-05-07 01:30:00,MDT,71.0,A:[91],,
USGS,9333500,2001-05-07 01:45:00,MDT,70.0,A:[91],,
USGS,9333500,2001-05-07 02:00:00,MDT,70.0,A:[91],,
USGS,9333500,2001-05-07 02:15:00,MDT,69.0,A:[91],,
USGS,9333500,2001-05-07 02:30:00,MDT,70.0,A:[91],,
USGS,9333500,2001-05-07 02:45:00,MDT,70.0,A:[91],,
USGS,9333500,2001-05-07 03:00:00,MDT,70.0,A:[91],,'''
df = pd.read_csv(io.StringIO(data),
dtype_backend='pyarrow', engine='pyarrow')
def to_denver_time(df_, time_col, tz_col):
return (df_
.assign(**{tz_col: df_[tz_col].replace('MDT', 'MST7MDT')})
.groupby(tz_col)
[time_col]
.transform(lambda s: pd.to_datetime(s)
.dt.tz_localize(s.name, ambiguous=True)
.dt.tz_convert('America/Denver'))
)
def tweak_river(df_):
return (df_
.assign(datetime=to_denver_time(df_, 'datetime', 'tz_cd'))
.rename(columns={'144166_00060': 'cfs',
'144167_00065': 'gage_height'})
.loc[:, ['datetime', 'agency_cd', 'site_no', 'tz_cd', 'cfs',
'gage_height'] ]
.set_index('datetime')
)
dd = tweak_river(df)
print(dd)
dd.to_parquet('/tmp/dd.pq')
pd.read_parquet('/tmp/dd.pq', dtype_backend='pyarrow'
)
Issue Description
The `read_parquet` function fails to read the file written above when `dtype_backend='pyarrow'` is specified.
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
Cell In[116], line 38
36 print(dd)
37 dd.to_parquet('/tmp/dd.pq')
---> 38 pd.read_parquet('/tmp/dd.pq', dtype_backend='pyarrow'
39 )
File ~/.envs/menv/lib/python3.10/site-packages/pandas/io/parquet.py:670, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)
667 use_nullable_dtypes = False
668 check_dtype_backend(dtype_backend)
--> 670 return impl.read(
671 path,
672 columns=columns,
673 filters=filters,
674 storage_options=storage_options,
675 use_nullable_dtypes=use_nullable_dtypes,
676 dtype_backend=dtype_backend,
677 filesystem=filesystem,
678 **kwargs,
679 )
File ~/.envs/menv/lib/python3.10/site-packages/pandas/io/parquet.py:279, in PyArrowImpl.read(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs)
271 try:
272 pa_table = self.api.parquet.read_table(
273 path_or_handle,
274 columns=columns,
(...)
277 **kwargs,
278 )
--> 279 result = pa_table.to_pandas(**to_pandas_kwargs)
281 if manager == "array":
282 result = result._as_manager("array", copy=False)
File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/array.pxi:867, in pyarrow.lib._PandasConvertible.to_pandas()
File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/table.pxi:4085, in pyarrow.lib.Table._to_pandas()
File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/pandas_compat.py:762, in table_to_blockmanager(options, table, categories, ignore_metadata, types_mapper)
760 index_descriptors = pandas_metadata['index_columns']
761 table = _add_any_metadata(table, pandas_metadata)
--> 762 table, index = _reconstruct_index(table, index_descriptors,
763 all_columns, types_mapper)
764 ext_columns_dtypes = _get_extension_dtypes(
765 table, all_columns, types_mapper)
766 else:
File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/pandas_compat.py:913, in _reconstruct_index(table, index_descriptors, all_columns, types_mapper)
911 for descr in index_descriptors:
912 if isinstance(descr, str):
--> 913 result_table, index_level, index_name = _extract_index_level(
914 table, result_table, descr, field_name_to_metadata, types_mapper)
915 if index_level is None:
916 # ARROW-1883: the serialized index column was not found
917 continue
File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/pandas_compat.py:970, in _extract_index_level(table, result_table, field_name, field_name_to_metadata, types_mapper)
967 values = values.copy()
969 if isinstance(col.type, pa.lib.TimestampType) and col.type.tz is not None:
--> 970 index_level = make_tz_aware(pd.Series(values, copy=False), col.type.tz)
971 else:
972 index_level = pd.Series(values, dtype=values.dtype, copy=False)
File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/pandas_compat.py:1216, in make_tz_aware(series, tz)
1212 """
1213 Make a datetime64 Series timezone-aware for the given tz
1214 """
1215 tz = pa.lib.string_to_tzinfo(tz)
-> 1216 series = (series.dt.tz_localize('utc')
1217 .dt.tz_convert(tz))
1218 return series
File ~/.envs/menv/lib/python3.10/site-packages/pandas/core/accessor.py:112, in PandasDelegate._add_delegate_accessors.<locals>._create_delegator_method.<locals>.f(self, *args, **kwargs)
111 def f(self, *args, **kwargs):
--> 112 return self._delegate_method(name, *args, **kwargs)
File ~/.envs/menv/lib/python3.10/site-packages/pandas/core/indexes/accessors.py:205, in ArrowTemporalProperties._delegate_method(self, name, *args, **kwargs)
200 if not hasattr(self._parent.array, f"_dt_{name}"):
201 raise NotImplementedError(
202 f"dt.{name} is not supported for {self._parent.dtype}"
203 )
--> 205 result = getattr(self._parent.array, f"_dt_{name}")(*args, **kwargs)
207 if self._orig is not None:
208 index = self._orig.index
File ~/.envs/menv/lib/python3.10/site-packages/pandas/core/arrays/arrow/array.py:2541, in ArrowExtensionArray._dt_tz_localize(self, tz, ambiguous, nonexistent)
2539 result = self._pa_array.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit))
2540 else:
-> 2541 result = pc.assume_timezone(
2542 self._pa_array, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa
2543 )
2544 return type(self)(result)
File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/compute.py:262, in _make_generic_wrapper.<locals>.wrapper(memory_pool, options, *args, **kwargs)
260 if args and isinstance(args[0], Expression):
261 return Expression._call(func_name, list(args), options)
--> 262 return func.call(args, options, memory_pool)
File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/_compute.pyx:367, in pyarrow._compute.Function.call()
File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()
File ~/.envs/menv/lib/python3.10/site-packages/pyarrow/error.pxi:100, in pyarrow.lib.check_status()
ArrowInvalid: Timestamps already have a timezone: 'America/Denver'. Cannot localize to 'utc'.
dd.index
Expected Behavior
Reading the file works correctly when I don't specify `dtype_backend='pyarrow'`; the round trip should succeed with the pyarrow backend as well.
Installed Versions
INSTALLED VERSIONS
commit : e86ed37
python : 3.10.13.final.0
python-bits : 64
OS : Darwin
OS-release : 21.6.0
Version : Darwin Kernel Version 21.6.0: Wed Aug 10 14:28:23 PDT 2022; root:xnu-8020.141.5~2/RELEASE_ARM64_T6000
machine : arm64
processor : arm
byteorder : little
LC_ALL : en_US.UTF-8
LANG : None
LOCALE : en_US.UTF-8
pandas : 2.1.1
numpy : 1.23.5
pytz : 2023.3
dateutil : 2.8.2
setuptools : 67.6.1
pip : 23.2.1
Cython : 3.0.4
pytest : 7.2.0
hypothesis : 6.81.2
sphinx : None
blosc : None
feather : None
xlsxwriter : 3.1.2
lxml.etree : 4.9.2
html5lib : 1.1
pymysql : None
psycopg2 : None
jinja2 : 3.1.2
IPython : 8.8.0
pandas_datareader : None
bs4 : 4.11.1
bottleneck : None
dataframe-api-compat: None
fastparquet : None
fsspec : 2023.3.0
gcsfs : None
matplotlib : 3.6.2
numba : 0.56.4
numexpr : 2.8.4
odfpy : None
openpyxl : 3.0.10
pandas_gbq : None
pyarrow : 13.0.0
pyreadstat : None
pyxlsb : None
s3fs : None
scipy : 1.10.0
sqlalchemy : 2.0.21
tables : None
tabulate : 0.9.0
xarray : None
xlrd : 2.0.1
zstandard : None
tzdata : 2023.3
qtpy : None
pyqt5 : None