Closed
Description
In [1]: import pandas as pd
In [2]: pd.Series
Out[2]: pandas.core.series.Series
In [4]: s = pd.Series(pd.interval_range(0, periods=10))
In [5]: pd.util.hash_pandas_object(s)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-5-1b7247db4f16> in <module>
----> 1 pd.util.hash_pandas_object(s)
~/sandbox/pandas/pandas/core/util/hashing.py in hash_pandas_object(obj, index, encoding, hash_key, categorize)
88 elif isinstance(obj, ABCSeries):
89 h = hash_array(obj.values, encoding, hash_key,
---> 90 categorize).astype('uint64', copy=False)
91 if index:
92 index_iter = (hash_pandas_object(obj.index,
~/sandbox/pandas/pandas/core/util/hashing.py in hash_array(vals, encoding, hash_key, categorize)
269 # we'll be working with everything as 64-bit values, so handle this
270 # 128-bit value early
--> 271 elif np.issubdtype(dtype, np.complex128):
272 return hash_array(vals.real) + 23 * hash_array(vals.imag)
273
~/Envs/pandas-dev/lib/python3.7/site-packages/numpy/core/numerictypes.py in issubdtype(arg1, arg2)
712 """
713 if not issubclass_(arg1, generic):
--> 714 arg1 = dtype(arg1).type
715 if not issubclass_(arg2, generic):
716 arg2_orig = arg2
TypeError: data type not understood
In [6]: s
Out[6]:
0 (0, 1]
1 (1, 2]
2 (2, 3]
3 (3, 4]
4 (4, 5]
5 (5, 6]
6 (6, 7]
7 (7, 8]
8 (8, 9]
9 (9, 10]
dtype: interval
Options
- convert to object before hashing
- add some kind of `_hash_values` method to the interface. But how do we prevent hash collisions between similar but different extension arrays (EAs)? For example, the fastest hash for a `PeriodArray` would be to just hash the ordinals. But we wouldn't want the following two to hash identically (using my PeriodArray branch):
In [35]: pd.core.arrays.PeriodArray._from_ordinals([10, 20], freq='H')
Out[35]:
<pandas PeriodArray>
['1970-01-01 10:00', '1970-01-01 20:00']
Length: 2, dtype: period[H]
In [36]: pd.core.arrays.PeriodArray._from_ordinals([10, 20], freq='D')
Out[36]:
<pandas PeriodArray>
['1970-01-11', '1970-01-21']
Length: 2, dtype: period[D]
So we need to mix the dtype information into the hash as well.