Skip to content

Commit d9f70b3

Browse files
BUG: to_datetime with floats and unit not matching Timestamp (#56037)
* BUG: to_datetime with floats and unit not matching Timestamp * mypy fixup * Update doc/source/whatsnew/v2.2.0.rst Co-authored-by: Matthew Roeschke <[email protected]> * update for CoW * xfail float32 case * xfail on 32bit --------- Co-authored-by: Matthew Roeschke <[email protected]>
1 parent e37ff77 commit d9f70b3

File tree

9 files changed

+126
-55
lines changed

9 files changed

+126
-55
lines changed

doc/source/whatsnew/v2.2.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,7 @@ Datetimelike
461461
- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`)
462462
- Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`)
463463
- Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`)
464+
- Bug in the results of :func:`pd.to_datetime` with a floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`)
464465
-
465466

466467
Timedelta

pandas/_libs/tslibs/conversion.pxd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1
4545

4646
cpdef datetime localize_pydatetime(datetime dt, tzinfo tz)
4747
cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1
48-
cpdef (int64_t, int) precision_from_unit(
48+
cdef (int64_t, int) precision_from_unit(
4949
NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=*
5050
)
5151

pandas/_libs/tslibs/conversion.pyi

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,5 @@ import numpy as np
88
DT64NS_DTYPE: np.dtype
99
TD64NS_DTYPE: np.dtype
1010

11-
def precision_from_unit(
12-
in_reso: int,
13-
out_reso: int = ...,
14-
) -> tuple[int, int]: ... # (int64_t, _)
1511
def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ...
12+
def cast_from_unit_vectorized(values: np.ndarray, unit: str) -> np.ndarray: ...

pandas/_libs/tslibs/conversion.pyx

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1+
cimport cython
2+
13
import numpy as np
24

35
cimport numpy as cnp
46
from libc.math cimport log10
57
from numpy cimport (
8+
float64_t,
69
int32_t,
710
int64_t,
811
)
@@ -37,6 +40,7 @@ from pandas._libs.tslibs.np_datetime cimport (
3740
NPY_DATETIMEUNIT,
3841
NPY_FR_ns,
3942
NPY_FR_us,
43+
astype_overflowsafe,
4044
check_dts_bounds,
4145
convert_reso,
4246
dts_to_iso_string,
@@ -74,6 +78,7 @@ from pandas._libs.tslibs.tzconversion cimport (
7478
from pandas._libs.tslibs.util cimport (
7579
is_float_object,
7680
is_integer_object,
81+
is_nan,
7782
)
7883

7984
# ----------------------------------------------------------------------
@@ -86,6 +91,78 @@ TD64NS_DTYPE = np.dtype("m8[ns]")
8691
# ----------------------------------------------------------------------
8792
# Unit Conversion Helpers
8893

94+
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.overflowcheck(True)
def cast_from_unit_vectorized(
    ndarray values,
    str unit,
):
    """
    Vectorized analogue to cast_from_unit.

    Convert a float array expressed in ``unit`` to int64 values at "ns"
    resolution, splitting each element into an integer part and a
    fractional part so that the integer part is scaled with exact int64
    arithmetic.

    Parameters
    ----------
    values : ndarray
        Array with a floating dtype (asserted). NaN entries map to NPY_NAT.
    unit : str
        Unit abbreviation of the input values, e.g. "s", "ms", "Y", "M".

    Returns
    -------
    ndarray[int64]
        Same shape as ``values``; NPY_NAT marks missing entries.

    Raises
    ------
    ValueError
        If ``unit`` is "Y" or "M" and any non-NaN value is not a whole number.
    OutOfBoundsDatetime
        If scaling a value overflows (OverflowError / FloatingPointError
        raised inside the conversion loop).
    """
    cdef:
        int64_t m
        int p
        NPY_DATETIMEUNIT in_reso, out_reso
        Py_ssize_t i

    assert values.dtype.kind == "f"

    # Exact membership test: ``unit in "YM"`` is a substring check and would
    # also accept "" and "YM", sending them down the year/month branch.
    if unit in ("Y", "M"):
        if not (((values % 1) == 0) | np.isnan(values)).all():
            # GH#47267 it is clear that 2 "M" corresponds to 1970-02-01,
            #  but not clear what 2.5 "M" corresponds to, so we will
            #  disallow that case.
            raise ValueError(
                f"Conversion of non-round float with unit={unit} "
                "is ambiguous"
            )

        # GH#47266 go through np.datetime64 to avoid weird results e.g. with "Y"
        #  and 150 we'd get 2120-01-01 09:00:00
        values = values.astype(f"M8[{unit}]")
        dtype = np.dtype("M8[ns]")
        return astype_overflowsafe(values, dtype=dtype, copy=False).view("i8")

    in_reso = abbrev_to_npy_unit(unit)
    out_reso = abbrev_to_npy_unit("ns")
    # m is the integer multiplier applied below; p (when non-zero) is the
    #  number of decimals the fractional part is rounded to before scaling.
    m, p = precision_from_unit(in_reso, out_reso)

    cdef:
        ndarray[int64_t] base, out
        ndarray[float64_t] frac
        tuple shape = (<object>values).shape

    out = np.empty(shape, dtype="i8")
    base = np.empty(shape, dtype="i8")
    frac = np.empty(shape, dtype="f8")

    # First pass: split each value into truncated-integer and fractional parts.
    for i in range(len(values)):
        if is_nan(values[i]):
            # frac[i] is left unset here; it is never read for NaT entries.
            base[i] = NPY_NAT
        else:
            base[i] = <int64_t>values[i]
            frac[i] = values[i] - base[i]

    if p:
        frac = np.round(frac, p)

    # Second pass: scale both parts and recombine; overflow checking is
    #  enabled via the ``overflowcheck`` directive on this function.
    try:
        for i in range(len(values)):
            if base[i] == NPY_NAT:
                out[i] = NPY_NAT
            else:
                out[i] = <int64_t>(base[i] * m) + <int64_t>(frac[i] * m)
    except (OverflowError, FloatingPointError) as err:
        # FloatingPointError can be issued if we have float dtype and have
        #  set np.errstate(over="raise")
        raise OutOfBoundsDatetime(
            f"cannot convert input {values[i]} with the unit '{unit}'"
        ) from err
    return out
164+
165+
89166
cdef int64_t cast_from_unit(
90167
object ts,
91168
str unit,
@@ -155,7 +232,7 @@ cdef int64_t cast_from_unit(
155232
) from err
156233

157234

158-
cpdef (int64_t, int) precision_from_unit(
235+
cdef (int64_t, int) precision_from_unit(
159236
NPY_DATETIMEUNIT in_reso,
160237
NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns,
161238
):

pandas/core/arrays/timedeltas.py

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
TYPE_CHECKING,
77
cast,
88
)
9-
import warnings
109

1110
import numpy as np
1211

@@ -27,8 +26,7 @@
2726
npy_unit_to_abbrev,
2827
periods_per_second,
2928
)
30-
from pandas._libs.tslibs.conversion import precision_from_unit
31-
from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
29+
from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
3230
from pandas._libs.tslibs.fields import (
3331
get_timedelta_days,
3432
get_timedelta_field,
@@ -1059,23 +1057,10 @@ def sequence_to_td64ns(
10591057
data = data._data
10601058
else:
10611059
mask = np.isnan(data)
1062-
# The next few lines are effectively a vectorized 'cast_from_unit'
1063-
m, p = precision_from_unit(abbrev_to_npy_unit(unit or "ns"))
1064-
with warnings.catch_warnings():
1065-
# Suppress RuntimeWarning about All-NaN slice
1066-
warnings.filterwarnings(
1067-
"ignore", "invalid value encountered in cast", RuntimeWarning
1068-
)
1069-
base = data.astype(np.int64)
1070-
frac = data - base
1071-
if p:
1072-
frac = np.round(frac, p)
1073-
with warnings.catch_warnings():
1074-
warnings.filterwarnings(
1075-
"ignore", "invalid value encountered in cast", RuntimeWarning
1076-
)
1077-
data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
1060+
1061+
data = cast_from_unit_vectorized(data, unit or "ns")
10781062
data[mask] = iNaT
1063+
data = data.view("m8[ns]")
10791064
copy = False
10801065

10811066
elif lib.is_np_dtype(data.dtype, "m"):

pandas/core/tools/datetimes.py

Lines changed: 14 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,10 @@
2626
Timestamp,
2727
astype_overflowsafe,
2828
get_unit_from_dtype,
29-
iNaT,
3029
is_supported_unit,
3130
timezones as libtimezones,
3231
)
33-
from pandas._libs.tslibs.conversion import precision_from_unit
34-
from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
32+
from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
3533
from pandas._libs.tslibs.parsing import (
3634
DateParseError,
3735
guess_datetime_format,
@@ -551,23 +549,19 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
551549
tz_parsed = None
552550

553551
elif arg.dtype.kind == "f":
554-
mult, _ = precision_from_unit(abbrev_to_npy_unit(unit))
555-
556-
mask = np.isnan(arg) | (arg == iNaT)
557-
fvalues = (arg * mult).astype("f8", copy=False)
558-
fvalues[mask] = 0
559-
560-
if (fvalues < Timestamp.min._value).any() or (
561-
fvalues > Timestamp.max._value
562-
).any():
563-
if errors != "raise":
564-
arg = arg.astype(object)
565-
return _to_datetime_with_unit(arg, unit, name, utc, errors)
566-
raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")
567-
568-
arr = fvalues.astype("M8[ns]", copy=False)
569-
arr[mask] = np.datetime64("NaT", "ns")
570-
552+
with np.errstate(over="raise"):
553+
try:
554+
arr = cast_from_unit_vectorized(arg, unit=unit)
555+
except OutOfBoundsDatetime:
556+
if errors != "raise":
557+
return _to_datetime_with_unit(
558+
arg.astype(object), unit, name, utc, errors
559+
)
560+
raise OutOfBoundsDatetime(
561+
f"cannot convert input with unit '{unit}'"
562+
)
563+
564+
arr = arr.view("M8[ns]")
571565
tz_parsed = None
572566
else:
573567
arg = arg.astype(object, copy=False)

pandas/tests/io/sas/test_sas7bdat.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,18 @@ def test_date_time(datapath):
187187
fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"]
188188
)
189189
# GH 19732: Timestamps imported from sas will incur floating point errors
190-
df[df.columns[3]] = df.iloc[:, 3].dt.round("us")
190+
# 2023-11-16 we don't know the correct "expected" result bc we do not have
191+
# access to SAS to read the sas7bdat file. We are really just testing
192+
# that we are "close". This only seems to be an issue near the
193+
# implementation bounds.
194+
res = df.iloc[:, 3].dt.round("us").copy()
195+
196+
# the first and last elements are near the implementation bounds, where we
197+
# would expect floating point error to occur.
198+
res.iloc[0] -= pd.Timedelta(microseconds=1)
199+
res.iloc[-1] += pd.Timedelta(microseconds=1)
200+
201+
df["DateTimeHi"] = res
191202
tm.assert_frame_equal(df, df0)
192203

193204

pandas/tests/tools/test_to_datetime.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1864,16 +1864,14 @@ def test_to_datetime_month_or_year_unit_int(self, cache, unit, item, request):
18641864
result = to_datetime(np.array([item], dtype=object), unit=unit, cache=cache)
18651865
tm.assert_index_equal(result, expected)
18661866

1867-
# TODO: this should also work
1868-
if isinstance(item, float):
1869-
request.applymarker(
1870-
pytest.mark.xfail(
1871-
reason=f"{type(item).__name__} in np.array should work"
1872-
)
1873-
)
18741867
result = to_datetime(np.array([item]), unit=unit, cache=cache)
18751868
tm.assert_index_equal(result, expected)
18761869

1870+
# with a nan!
1871+
result = to_datetime(np.array([item, np.nan]), unit=unit, cache=cache)
1872+
assert result.isna()[1]
1873+
tm.assert_index_equal(result[:1], expected)
1874+
18771875
@pytest.mark.parametrize("unit", ["Y", "M"])
18781876
def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
18791877
# GH#50301
@@ -1883,6 +1881,8 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
18831881
msg = f"Conversion of non-round float with unit={unit} is ambiguous"
18841882
with pytest.raises(ValueError, match=msg):
18851883
to_datetime([1.5], unit=unit, errors="raise")
1884+
with pytest.raises(ValueError, match=msg):
1885+
to_datetime(np.array([1.5]), unit=unit, errors="raise")
18861886
with pytest.raises(ValueError, match=msg):
18871887
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
18881888
to_datetime(["1.5"], unit=unit, errors="raise")
@@ -2030,10 +2030,14 @@ def test_unit_mixed(self, cache, arr):
20302030
def test_unit_rounding(self, cache):
20312031
# GH 14156 & GH 20445: argument will incur floating point errors
20322032
# but no premature rounding
2033-
result = to_datetime(1434743731.8770001, unit="s", cache=cache)
2034-
expected = Timestamp("2015-06-19 19:55:31.877000192")
2033+
value = 1434743731.8770001
2034+
result = to_datetime(value, unit="s", cache=cache)
2035+
expected = Timestamp("2015-06-19 19:55:31.877000093")
20352036
assert result == expected
20362037

2038+
alt = Timestamp(value, unit="s")
2039+
assert alt == result
2040+
20372041
def test_unit_ignore_keeps_name(self, cache):
20382042
# GH 21697
20392043
expected = Index([15e9] * 2, name="name")

pandas/tests/tools/test_to_timedelta.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import numpy as np
77
import pytest
88

9+
from pandas.compat import IS64
910
from pandas.errors import OutOfBoundsTimedelta
1011

1112
import pandas as pd
@@ -232,6 +233,7 @@ def test_to_timedelta_on_missing_values_list(self, val):
232233
actual = to_timedelta([val])
233234
assert actual[0]._value == np.timedelta64("NaT").astype("int64")
234235

236+
@pytest.mark.xfail(not IS64, reason="Floating point error")
235237
def test_to_timedelta_float(self):
236238
# https://github.com/pandas-dev/pandas/issues/25077
237239
arr = np.arange(0, 1, 1e-6)[-10:]

0 commit comments

Comments
 (0)