-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: hashing datetime64 objects #50960
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
7761ecd
610b0c6
0e140ae
919383c
ae8c0bb
92a39eb
2f67805
0635f86
229ab72
6e96805
24fda82
058b666
7398991
3fdf564
6e4836e
f55337a
a97dfc9
1338ca2
9fb1987
818682c
74ab540
037ba05
7a4b1ab
47e5247
dcd09dd
32d479b
d47cfd8
c091317
1ce791e
704fb69
6d962b0
f838953
0d8500a
58a29b6
95069e0
7362f3e
dd08670
998a4cc
b75730b
5c57a5e
6b4460f
c94609b
afe9493
4fecc97
b4cc41e
c620339
3633653
c55f182
6e2bbf0
23c2826
143b3a3
ffb8365
1fdfd64
5513721
875d6af
a29a56a
40e6e17
15a701c
af25f40
9d5cb46
d5a031d
bd7d432
394d86e
1766bc3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -7,7 +7,10 @@ | |||||||||||||||||
typedef npy_complex64 khcomplex64_t; | ||||||||||||||||||
typedef npy_complex128 khcomplex128_t; | ||||||||||||||||||
|
||||||||||||||||||
// get pandas_datetime_to_datetimestruct | ||||||||||||||||||
#include <../../tslibs/src/datetime/np_datetime.h> | ||||||||||||||||||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||||
|
||||||||||||||||||
#include "datetime.h" | ||||||||||||||||||
|
||||||||||||||||||
// khash should report usage to tracemalloc | ||||||||||||||||||
#if PY_VERSION_HEX >= 0x03060000 | ||||||||||||||||||
|
@@ -330,6 +333,129 @@ Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) { | |||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
|
||||||||||||||||||
// TODO: if nanos is most common/important, might be most performant to
// make that canonical and others cast to that?

// Hash a datetime64 payload so that two values representing the same moment
// in time hash identically regardless of their stored resolution
// (e.g. np.datetime64(1, "D") must hash like np.datetime64(24, "h")).
//
// Strategy: whenever `value` is losslessly representable one resolution
// coarser, recurse with the converted value.  Year-resolution values are
// returned as-is and every finer resolution is responsible for checking
// if they match a coarser one.
//
// `dts` must already hold the datetimestruct for (value, unit); it is only
// consulted for the weekly->monthly conversion, which (unlike the other
// steps) is not a fixed integer ratio.
Py_hash_t PANDAS_INLINE hash_datetime_value_and_reso(npy_datetime value,
                                                     NPY_DATETIMEUNIT unit,
                                                     npy_datetimestruct* dts) {
    if (unit == NPY_FR_Y) {
        return value;
    }
    else if (unit == NPY_FR_M) {
        if ((value % 12) == 0) {
            // an integral number of years since 1970
            return hash_datetime_value_and_reso(value / 12, NPY_FR_Y, dts);
        }
        return value;
    }
    else if (unit == NPY_FR_W) {
        if (dts->day == 1) {
            // Falls on the first of a month, so it is representable at
            // monthly resolution.  NOTE: dts->month is 1-based, so subtract
            // 1 to get months-since-epoch (the original code omitted the
            // "- 1", making 1970-01 hash as month 1 instead of month 0).
            value = (dts->year - 1970) * 12 + dts->month - 1;
            return hash_datetime_value_and_reso(value, NPY_FR_M, dts);
        }
        return value;
    }
    else if (unit == NPY_FR_D) {
        if ((value % 7) == 0) {
            return hash_datetime_value_and_reso(value / 7, NPY_FR_W, dts);
        }
        return value;
    }
    else if (unit == NPY_FR_h) {
        if ((value % 24) == 0) {
            return hash_datetime_value_and_reso(value / 24, NPY_FR_D, dts);
        }
        return value;
    }
    else if (unit == NPY_FR_m) {
        if ((value % 60) == 0) {
            return hash_datetime_value_and_reso(value / 60, NPY_FR_h, dts);
        }
        return value;
    }
    else if (unit == NPY_FR_s) {
        if ((value % 60) == 0) {
            return hash_datetime_value_and_reso(value / 60, NPY_FR_m, dts);
        }
        return value;
    }
    else if (unit == NPY_FR_ms) {
        if ((value % 1000) == 0) {
            return hash_datetime_value_and_reso(value / 1000, NPY_FR_s, dts);
        }
        return value;
    }
    else if (unit == NPY_FR_us) {
        if ((value % 1000) == 0) {
            // BUG FIX: 1000 microseconds == 1 millisecond, so cast *up* to
            // NPY_FR_ms here (the original recursed with NPY_FR_ns, so equal
            // us/ms-resolution datetimes hashed differently).
            return hash_datetime_value_and_reso(value / 1000, NPY_FR_ms, dts);
        }
        return value;
    }
    else if (unit == NPY_FR_ns) {
        if ((value % 1000) == 0) {
            return hash_datetime_value_and_reso(value / 1000, NPY_FR_us, dts);
        }
        return value;
    }
    else if (unit == NPY_FR_ps) {
        if ((value % 1000) == 0) {
            return hash_datetime_value_and_reso(value / 1000, NPY_FR_ns, dts);
        }
        return value;
    }
    else if (unit == NPY_FR_fs) {
        if ((value % 1000) == 0) {
            return hash_datetime_value_and_reso(value / 1000, NPY_FR_ps, dts);
        }
        return value;
    }
    else if (unit == NPY_FR_as) {
        if ((value % 1000) == 0) {
            return hash_datetime_value_and_reso(value / 1000, NPY_FR_fs, dts);
        }
        return value;
    }
    else {
        // i.e. NPY_FR_GENERIC
        // we default to treating these like nanos
        return hash_datetime_value_and_reso(value, NPY_FR_ns, dts);
    }
}
|
||||||||||||||||||
|
||||||||||||||||||
// TODO: same thing for timedelta64 objects
//
// Hash a np.datetime64 scalar object.
//
// GH#50690 numpy's hash implementation does not preserve comparability
// either across resolutions or with standard library objects.
// See also Timestamp.__hash__.
//
// Returns NPY_DATETIME_NAT for NaT; otherwise, if the value fits in a
// stdlib datetime (years 1-9999, no sub-microsecond component), returns
// that datetime's hash so datetime64 == datetime implies equal hashes.
// All other values fall back to the resolution-normalizing hash.
// Per the tp_hash contract, returns -1 with an exception set on error.
Py_hash_t np_datetime64_object_hash(PyObject* key) {
    // key is assumed to be a PyDatetimeScalarObject -- TODO(review):
    // consider a runtime type check instead of a blind cast.
    NPY_DATETIMEUNIT unit =
        (NPY_DATETIMEUNIT)((PyDatetimeScalarObject*)key)->obmeta.base;
    npy_datetime value = ((PyDatetimeScalarObject*)key)->obval;
    npy_datetimestruct dts;
    PyObject* dt;
    Py_hash_t hash;

    if (value == NPY_DATETIME_NAT) {
        // np.datetime64("NaT") in any reso
        return NPY_DATETIME_NAT;
    }

    pandas_datetime_to_datetimestruct(value, unit, &dts);

    if ((dts.year > 0) && (dts.year <= 9999) && (dts.ps == 0) && (dts.as == 0)) {
        // we CAN cast to pydatetime, so use that hash to ensure we compare
        // as matching standard library datetimes (and pd.Timestamps)
        PyDateTime_IMPORT;

        dt = PyDateTime_FromDateAndTime(
            dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us
        );
        if (dt == NULL) {
            // BUG FIX: original passed a possible NULL straight to
            // PyObject_Hash; propagate the error (-1, exception set) instead.
            return -1;
        }
        hash = PyObject_Hash(dt);
        // BUG FIX: original leaked the temporary datetime object.
        Py_DECREF(dt);
        return hash;
    }

    return hash_datetime_value_and_reso(value, unit, &dts);
}
|
||||||||||||||||||
|
||||||||||||||||||
khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) { | ||||||||||||||||||
Py_hash_t hash; | ||||||||||||||||||
// For PyObject_Hash holds: | ||||||||||||||||||
|
@@ -351,6 +477,10 @@ khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) { | |||||||||||||||||
else if (PyTuple_CheckExact(key)) { | ||||||||||||||||||
hash = tupleobject_hash((PyTupleObject*)key); | ||||||||||||||||||
} | ||||||||||||||||||
else if (PyObject_TypeCheck(key, &PyDatetimeArrType_Type)) { | ||||||||||||||||||
// GH#50690 | ||||||||||||||||||
hash = np_datetime64_object_hash(key); | ||||||||||||||||||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||||
} | ||||||||||||||||||
else { | ||||||||||||||||||
hash = PyObject_Hash(key); | ||||||||||||||||||
} | ||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -442,7 +442,8 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): | |
"_libs.algos": { | ||
"pyxfile": "_libs/algos", | ||
"include": klib_include, | ||
"depends": _pxi_dep["algos"], | ||
"depends": _pxi_dep["algos"] + tseries_depends, | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So the point of the capsule is you don't specify There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I’m still struggling here with build failures I can’t reproduce. Any other thoughts? |
||
}, | ||
"_libs.arrays": {"pyxfile": "_libs/arrays"}, | ||
"_libs.groupby": {"pyxfile": "_libs/groupby"}, | ||
|
@@ -453,21 +454,30 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): | |
"depends": ( | ||
["pandas/_libs/src/klib/khash_python.h", "pandas/_libs/src/klib/khash.h"] | ||
+ _pxi_dep["hashtable"] | ||
+ tseries_depends | ||
), | ||
"sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"], | ||
}, | ||
"_libs.index": { | ||
"pyxfile": "_libs/index", | ||
"include": klib_include, | ||
"depends": _pxi_dep["index"], | ||
"depends": _pxi_dep["index"] + tseries_depends, | ||
"sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"], | ||
}, | ||
"_libs.indexing": {"pyxfile": "_libs/indexing"}, | ||
"_libs.internals": {"pyxfile": "_libs/internals"}, | ||
"_libs.interval": { | ||
"pyxfile": "_libs/interval", | ||
"include": klib_include, | ||
"depends": _pxi_dep["interval"], | ||
"depends": _pxi_dep["interval"] + tseries_depends, | ||
"sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You shouldn't add any new sources arguments to setup.py - this is what causes undefined symbols during parallel compilation with setuptools There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. reverted that and now im getting a different failure There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea I think you also need to move the implementation out of the header file into the capsule to resolve that |
||
}, | ||
"_libs.join": { | ||
"pyxfile": "_libs/join", | ||
"depends": tseries_depends, | ||
"include": klib_include, | ||
"sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"], | ||
}, | ||
"_libs.join": {"pyxfile": "_libs/join", "include": klib_include}, | ||
"_libs.lib": { | ||
"pyxfile": "_libs/lib", | ||
"depends": lib_depends + tseries_depends, | ||
|
@@ -481,10 +491,12 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): | |
"depends": [ | ||
"pandas/_libs/src/parser/tokenizer.h", | ||
"pandas/_libs/src/parser/io.h", | ||
], | ||
] | ||
+ tseries_depends, | ||
"sources": [ | ||
"pandas/_libs/src/parser/tokenizer.c", | ||
"pandas/_libs/src/parser/io.c", | ||
"pandas/_libs/tslibs/src/datetime/np_datetime.c", | ||
], | ||
}, | ||
"_libs.reduction": {"pyxfile": "_libs/reduction"}, | ||
|
Uh oh!
There was an error while loading. Please reload this page.