Skip to content

BUG: DataFrame from dict with non-nano Timedelta #48901

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Nov 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/_libs/tslibs/timedeltas.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ cdef class _Timedelta(timedelta):

cpdef timedelta to_pytimedelta(_Timedelta self)
cdef bint _has_ns(self)
cdef bint _is_in_pytimedelta_bounds(self)
cdef _ensure_components(_Timedelta self)
cdef inline bint _compare_mismatched_resos(self, _Timedelta other, op)
cdef _Timedelta _as_creso(self, NPY_DATETIMEUNIT reso, bint round_ok=*)
Expand Down
28 changes: 27 additions & 1 deletion pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1093,8 +1093,27 @@ cdef class _Timedelta(timedelta):
# non-invariant behavior.
# see GH#44504
return hash(self.value)
else:
elif self._is_in_pytimedelta_bounds() and (
self._creso == NPY_FR_ns or self._creso == NPY_DATETIMEUNIT.NPY_FR_us
):
# If we can defer to timedelta.__hash__, do so, as that
# ensures the hash is invariant to our _reso.
# We can only defer for ns and us, as for these two resos we
# call _Timedelta.__new__ with the correct input in
# _timedelta_from_value_and_reso; so timedelta.__hash__
# will be correct
return timedelta.__hash__(self)
else:
# We want to ensure that two equivalent Timedelta objects
# have the same hash. So we try downcasting to the next-lowest
# resolution.
try:
obj = (<_Timedelta>self)._as_creso(<NPY_DATETIMEUNIT>(self._creso + 1))
except OverflowError:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do the unit test hit both the except and else branch?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just reach one branch, will add test(s)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did these get added?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch, not yet

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated per request

# Doesn't fit, so we're off the hook
return hash(self.value)
else:
return hash(obj)

def __richcmp__(_Timedelta self, object other, int op):
cdef:
Expand Down Expand Up @@ -1152,6 +1171,13 @@ cdef class _Timedelta(timedelta):
else:
raise NotImplementedError(self._creso)

cdef bint _is_in_pytimedelta_bounds(self):
"""
Check if we are within the bounds of datetime.timedelta.
"""
self._ensure_components()
return -999999999 <= self._d and self._d <= 999999999

cdef _ensure_components(_Timedelta self):
"""
compute the components
Expand Down
33 changes: 14 additions & 19 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -846,30 +846,19 @@ def create_data(constructor):
tm.assert_frame_equal(result_Timestamp, expected)

@pytest.mark.parametrize(
"klass",
"klass,name",
[
pytest.param(
np.timedelta64,
marks=pytest.mark.xfail(
reason="hash mismatch (GH#44504) causes lib.fast_multiget "
"to mess up on dict lookups with equal Timedeltas with "
"mismatched resos"
),
),
timedelta,
Timedelta,
(lambda x: np.timedelta64(x, "D"), "timedelta64"),
(lambda x: timedelta(days=x), "pytimedelta"),
(lambda x: Timedelta(x, "D"), "Timedelta[ns]"),
(lambda x: Timedelta(x, "D").as_unit("s"), "Timedelta[s]"),
],
)
def test_constructor_dict_timedelta64_index(self, klass):
def test_constructor_dict_timedelta64_index(self, klass, name):
# GH 10160
td_as_int = [1, 2, 3, 4]

if klass is timedelta:
constructor = lambda x: timedelta(days=x)
else:
constructor = lambda x: klass(x, "D")

data = {i: {constructor(s): 2 * i} for i, s in enumerate(td_as_int)}
data = {i: {klass(s): 2 * i} for i, s in enumerate(td_as_int)}

expected = DataFrame(
[
Expand All @@ -881,7 +870,13 @@ def test_constructor_dict_timedelta64_index(self, klass):
index=[Timedelta(td, "D") for td in td_as_int],
)

result = DataFrame(data)
if name == "Timedelta[s]":
# TODO(2.0): passing index here shouldn't be necessary, is for now
# otherwise we raise in _extract_index
result = DataFrame(data, index=expected.index)
else:
result = DataFrame(data)

tm.assert_frame_equal(result, expected)

def test_constructor_period_dict(self):
Expand Down
29 changes: 29 additions & 0 deletions pandas/tests/libs/test_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pytest

from pandas._libs import (
Timedelta,
lib,
writers as libwriters,
)
Expand Down Expand Up @@ -42,6 +43,34 @@ def test_fast_unique_multiple_list_gen_sort(self):
out = lib.fast_unique_multiple_list_gen(gen, sort=False)
tm.assert_numpy_array_equal(np.array(out), expected)

def test_fast_multiget_timedelta_resos(self):
# This will become relevant for test_constructor_dict_timedelta64_index
# once Timedelta constructor preserves reso when passed a
# np.timedelta64 object
td = Timedelta(days=1)

mapping1 = {td: 1}
mapping2 = {td.as_unit("s"): 1}

oindex = Index([td * n for n in range(3)])._values.astype(object)

expected = lib.fast_multiget(mapping1, oindex)
result = lib.fast_multiget(mapping2, oindex)
tm.assert_numpy_array_equal(result, expected)

# case that can't be cast to td64ns
td = Timedelta(np.timedelta64(400, "Y"))
assert hash(td) == hash(td.as_unit("ms"))
assert hash(td) == hash(td.as_unit("us"))
mapping1 = {td: 1}
mapping2 = {td.as_unit("ms"): 1}

oindex = Index([td * n for n in range(3)])._values.astype(object)

expected = lib.fast_multiget(mapping1, oindex)
result = lib.fast_multiget(mapping2, oindex)
tm.assert_numpy_array_equal(result, expected)


class TestIndexing:
def test_maybe_indices_to_slice_left_edge(self):
Expand Down