Description
Code Sample
import pandas as pd
import numpy as np
class TSDF(pd.DataFrame):
_metadata = ['time_colname', 'ts_value_colname']
def __init__(self, data, time_colname, index=None, columns=None,
dtype=None, copy=True,
ts_value_colname=None):
super(TSDF, self).__init__(data=data, index=None, columns=columns,
dtype=dtype, copy=copy)
self.time_colname = time_colname
self.ts_value_colname = ts_value_colname
@property
def time_colname(self):
"""
Time axis column name
"""
return self.__time_colname
@time_colname.setter
def time_colname(self, val):
if val is None:
self.__time_colname = val
else:
if isinstance(val, str):
if val not in self.columns and val not in self.index.names:
raise('column not in data frame')
self.__time_colname = val
@property
def ts_value_colname(self):
"""
Column name of target value.
"""
return self.__ts_value_colname
@ts_value_colname.setter
def ts_value_colname(self, val):
if val is None:
self.__ts_value_colname = val
else:
if isinstance(val, str):
if val not in self.columns:
raise('column not in dataframe')
if self.shape[0] > 0 and \
not all(
[isinstance(v, (int, float, np.number)) for v in
self[val]]):
raise('the ts_value_colname column is not numeirc')
self.__ts_value_colname = val
data = {'date': pd.to_datetime(['2017-01-01', '2017-01-02', '2017-01-03',
'2017-01-04', '2017-01-05', '2017-01-06']),
'actual': [1., 2., 2., 3., 3., 4.]
}
tsdf = TSDF(data, time_colname='date', ts_value_colname='actual')
tsdf.to_pickle('test.pkl')
tsdf_pkl = pd.read_pickle('test.pkl')
Problem description
I'm creating a subclass of pandas.DataFrame. I have some complicated attribute setters, because I wanted to enforce certain data structure to avoid error in downstream process. I'm having trouble unpickling objects of my subclass and I think I have identified the cause.
If you could confirm that my understanding below is correct, I'm happy to create a PR for this.
- In the _setstate_ method in generic.py, in the following code,
set(self._internal_names + self._metadata)
returns an unordered set, which means self._metadata could be set before self._internal_names are set. This becomes problematic when my attribute setter needs to access the object that’s not fully set yet. I got "AttributeError: 'TimeSeriesDataFrame' object has no attribute '_data'"
# set in the order of internal names
# to avoid definitional recursion
# e.g. say fill_value needing _data to be
# defined
meta = set(self._internal_names + self._metadata)
for k in list(meta):
if k in state:
v = state[k]
object.__setattr__(self, k, v)
- In the _getstate_ method in generic.py, _item_cache is not returned as part of the state, which caused "AttributeError: 'TimeSeriesDataFrame' object has no attribute '_item_cache', when my attribute setter contains some code that need to access _item_cache.
def __getstate__(self):
meta = {k: getattr(self, k, None) for k in self._metadata}
return dict(_data=self._data, _typ=self._typ,
_metadata=self._metadata,
**meta)
Output of pd.show_versions()
INSTALLED VERSIONS
commit: None
python: 3.5.2.final.0
python-bits: 64
OS: Windows
OS-release: 10
machine: AMD64
processor: Intel64 Family 6 Model 61 Stepping 4, GenuineIntel
byteorder: little
LC_ALL: None
LANG: None
LOCALE: None.None
pandas: 0.23.0
pytest: 3.6.1
pip: 10.0.1
setuptools: 39.2.0
Cython: 0.28.3
numpy: 1.14.3
scipy: 1.1.0
pyarrow: None
xarray: None
IPython: 6.4.0
sphinx: None
patsy: 0.5.0
dateutil: 2.7.3
pytz: 2018.4
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 2.2.2
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: 0.9999999
sqlalchemy: None
pymysql: None
psycopg2: None
jinja2: None
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None