Skip to content

pandas >= 1.0.0 fails to read an old HDF5 file #33186

Open
@lschr

Description

@lschr

Code Sample, a copy-pastable example if possible

import pandas as pd
pd.read_hdf("beads1.h5", "features")

Problem description

While this works with pandas 0.25, it fails starting with pandas 1.0.0. The HDF file was created in November 2015 or even earlier and is inside the attached zip file. Other files created at the same time seem to work.

The error message is:

TypeError                                 Traceback (most recent call last)
<ipython-input-2-507ed8d14986> in <module>
----> 1 pd.read_hdf("Software/sdt-python/tests/data_chromatic/beads1.h5", "features")

~/anaconda3/lib/python3.7/site-packages/pandas/io/pytables.py in read_hdf(path_or_buf, key, mode, errors, where, start, stop, columns, iterator, chunksize, **kwargs)
    426             iterator=iterator,
    427             chunksize=chunksize,
--> 428             auto_close=auto_close,
    429         )
    430     except (ValueError, TypeError, KeyError):

~/anaconda3/lib/python3.7/site-packages/pandas/io/pytables.py in select(self, key, where, start, stop, columns, iterator, chunksize, auto_close)
    812         )
    813 
--> 814         return it.get_result()
    815 
    816     def select_as_coordinates(

~/anaconda3/lib/python3.7/site-packages/pandas/io/pytables.py in get_result(self, coordinates)
   1827 
   1828         # directly return the result
-> 1829         results = self.func(self.start, self.stop, where)
   1830         self.close()
   1831         return results

~/anaconda3/lib/python3.7/site-packages/pandas/io/pytables.py in func(_start, _stop, _where)
    796         # function to call on iteration
    797         def func(_start, _stop, _where):
--> 798             return s.read(start=_start, stop=_stop, where=_where, columns=columns)
    799 
    800         # create the iterator

~/anaconda3/lib/python3.7/site-packages/pandas/io/pytables.py in read(self, where, columns, start, stop)
   3067         for i in range(self.nblocks):
   3068 
-> 3069             blk_items = self.read_index(f"block{i}_items")
   3070             values = self.read_array(f"block{i}_values", start=_start, stop=_stop)
   3071 

~/anaconda3/lib/python3.7/site-packages/pandas/io/pytables.py in read_index(self, key, start, stop)
   2757         elif variety == "regular":
   2758             node = getattr(self.group, key)
-> 2759             index = self.read_index_node(node, start=start, stop=stop)
   2760             return index
   2761         else:  # pragma: no cover

~/anaconda3/lib/python3.7/site-packages/pandas/io/pytables.py in read_index_node(self, node, start, stop)
   2878                     data, kind, encoding=self.encoding, errors=self.errors
   2879                 ),
-> 2880                 **kwargs,
   2881             )
   2882 

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, tupleize_cols, **kwargs)
    407                 if new_dtype is not None:
    408                     return cls(
--> 409                         new_data, dtype=new_dtype, copy=False, name=name, **kwargs
    410                     )
    411 

~/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, tupleize_cols, **kwargs)
    411 
    412             if kwargs:
--> 413                 raise TypeError(f"Unexpected keyword arguments {repr(set(kwargs))}")
    414             if subarr.ndim > 1:
    415                 # GH#13601, GH#20285, GH#27125

TypeError: Unexpected keyword arguments {'freq'}

Expected Output

             x           y    z       signal          bg          mass  \
0   128.664558    2.966443  0.0  4189.596881  417.007853  27574.470714   
1   355.112881    3.865741  0.0  1061.439002  266.074445  12902.725487   
2   169.607263    6.025977  0.0  3103.641263  273.718417  22003.694114   
3   396.070422    6.681506  0.0   799.983142  227.225677   9527.816676   
4   127.806587    9.435608  0.0  4543.589962  332.126465  33712.910416   
..         ...         ...  ...          ...         ...           ...   
61  284.447272   56.068638  0.0   502.772436  221.098239   7438.331854   
62  381.438571   96.586069  0.0   287.953711  227.066782   1912.087223   
63  217.752114    3.596804  0.0   150.887499  206.701723   1118.474095   
64  258.435664   54.131181  0.0   125.772954  209.084140   2859.549002   
65  328.954684  100.785512  0.0   506.879163  224.691759   5695.560215   

        size  frame  
0   1.023476      0  
1   1.390924      0  
2   1.062239      0  
3   1.376786      0  
4   1.086697      0  
..       ...    ...  
61  1.534483      0  
62  1.028022      0  
63  1.086166      0  
64  1.902240      0  
65  1.337291      0  

[66 rows x 8 columns]

Output of pd.show_versions()

INSTALLED VERSIONS

commit : None
python : 3.7.7.final.0
python-bits : 64
OS : Linux
OS-release : 4.15.0-88-generic
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : de_AT.UTF-8
LOCALE : de_AT.UTF-8

pandas : 1.0.3
numpy : 1.18.1
pytz : 2019.3
dateutil : 2.8.1
pip : 20.0.2
setuptools : 46.1.3.post20200330
Cython : None
pytest : 5.4.1
hypothesis : None
sphinx : 2.4.4
blosc : None
feather : None
xlsxwriter : None
lxml.etree : 4.5.0
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.11.1
IPython : 7.13.0
pandas_datareader: None
bs4 : 4.8.2
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : 4.5.0
matplotlib : 3.1.3
numexpr : 2.7.1
odfpy : None
openpyxl : 3.0.3
pandas_gbq : None
pyarrow : None
pytables : None
pytest : 5.4.1
pyxlsb : None
s3fs : None
scipy : 1.4.1
sqlalchemy : None
tables : 3.4.4
tabulate : None
xarray : None
xlrd : 1.2.0
xlwt : None
xlsxwriter : None
numba : 0.48.0

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions