BUG: to_datetime with floats and unit not matching Timestamp (#56037)

jbrockmendel · mroeschke · web-flow · commit d9f70b397a01 · 2023-11-22T11:11:53.000-08:00
* BUG: to_datetime with floats and unit not matching Timestamp

* mypy fixup

* Update doc/source/whatsnew/v2.2.0.rst

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

* update for CoW

* xfail float32 case

* xfail on 32bit

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -461,6 +461,7 @@ Datetimelike
 - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`)
 - Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`)
 - Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`)
+- Bug in the results of :func:`pd.to_datetime` with a floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`)
 -
 
 Timedelta
diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd
@@ -45,7 +45,7 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1
 
 cpdef datetime localize_pydatetime(datetime dt, tzinfo tz)
 cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1
-cpdef (int64_t, int) precision_from_unit(
+cdef (int64_t, int) precision_from_unit(
     NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=*
 )
 
diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi
@@ -8,8 +8,5 @@ import numpy as np
 DT64NS_DTYPE: np.dtype
 TD64NS_DTYPE: np.dtype
 
-def precision_from_unit(
-    in_reso: int,
-    out_reso: int = ...,
-) -> tuple[int, int]: ...  # (int64_t, _)
 def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ...
+def cast_from_unit_vectorized(values: np.ndarray, unit: str) -> np.ndarray: ...
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
@@ -1,8 +1,11 @@
+cimport cython
+
 import numpy as np
 
 cimport numpy as cnp
 from libc.math cimport log10
 from numpy cimport (
+    float64_t,
     int32_t,
     int64_t,
 )
@@ -37,6 +40,7 @@ from pandas._libs.tslibs.np_datetime cimport (
     NPY_DATETIMEUNIT,
     NPY_FR_ns,
     NPY_FR_us,
+    astype_overflowsafe,
     check_dts_bounds,
     convert_reso,
     dts_to_iso_string,
@@ -74,6 +78,7 @@ from pandas._libs.tslibs.tzconversion cimport (
 from pandas._libs.tslibs.util cimport (
     is_float_object,
     is_integer_object,
+    is_nan,
 )
 
 # ----------------------------------------------------------------------
@@ -86,6 +91,78 @@ TD64NS_DTYPE = np.dtype("m8[ns]")
 # ----------------------------------------------------------------------
 # Unit Conversion Helpers
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.overflowcheck(True)
+def cast_from_unit_vectorized(
+    ndarray values,
+    str unit,
+):
+    """
+    Vectorized analogue to cast_from_unit.
+    """
+    cdef:
+        int64_t m
+        int p
+        NPY_DATETIMEUNIT in_reso, out_reso
+        Py_ssize_t i
+
+    assert values.dtype.kind == "f"
+
+    if unit in "YM":
+        if not (((values % 1) == 0) | np.isnan(values)).all():
+            # GH#47267 it is clear that 2 "M" corresponds to 1970-02-01,
+            #  but not clear what 2.5 "M" corresponds to, so we will
+            #  disallow that case.
+            raise ValueError(
+                f"Conversion of non-round float with unit={unit} "
+                "is ambiguous"
+            )
+
+        # GH#47266 go through np.datetime64 to avoid weird results e.g. with "Y"
+        #  and 150 we'd get 2120-01-01 09:00:00
+        values = values.astype(f"M8[{unit}]")
+        dtype = np.dtype("M8[ns]")
+        return astype_overflowsafe(values, dtype=dtype, copy=False).view("i8")
+
+    in_reso = abbrev_to_npy_unit(unit)
+    out_reso = abbrev_to_npy_unit("ns")
+    m, p = precision_from_unit(in_reso, out_reso)
+
+    cdef:
+        ndarray[int64_t] base, out
+        ndarray[float64_t] frac
+        tuple shape = (<object>values).shape
+
+    out = np.empty(shape, dtype="i8")
+    base = np.empty(shape, dtype="i8")
+    frac = np.empty(shape, dtype="f8")
+
+    for i in range(len(values)):
+        if is_nan(values[i]):
+            base[i] = NPY_NAT
+        else:
+            base[i] = <int64_t>values[i]
+            frac[i] = values[i] - base[i]
+
+    if p:
+        frac = np.round(frac, p)
+
+    try:
+        for i in range(len(values)):
+            if base[i] == NPY_NAT:
+                out[i] = NPY_NAT
+            else:
+                out[i] = <int64_t>(base[i] * m) + <int64_t>(frac[i] * m)
+    except (OverflowError, FloatingPointError) as err:
+        # FloatingPointError can be issued if we have float dtype and have
+        #  set np.errstate(over="raise")
+        raise OutOfBoundsDatetime(
+            f"cannot convert input {values[i]} with the unit '{unit}'"
+        ) from err
+    return out
+
+
 cdef int64_t cast_from_unit(
     object ts,
     str unit,
@@ -155,7 +232,7 @@ cdef int64_t cast_from_unit(
         ) from err
 
 
-cpdef (int64_t, int) precision_from_unit(
+cdef (int64_t, int) precision_from_unit(
     NPY_DATETIMEUNIT in_reso,
     NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns,
 ):
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -6,7 +6,6 @@
     TYPE_CHECKING,
     cast,
 )
-import warnings
 
 import numpy as np
 
@@ -27,8 +26,7 @@
     npy_unit_to_abbrev,
     periods_per_second,
 )
-from pandas._libs.tslibs.conversion import precision_from_unit
-from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
+from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
 from pandas._libs.tslibs.fields import (
     get_timedelta_days,
     get_timedelta_field,
@@ -1059,23 +1057,10 @@ def sequence_to_td64ns(
             data = data._data
         else:
             mask = np.isnan(data)
-        # The next few lines are effectively a vectorized 'cast_from_unit'
-        m, p = precision_from_unit(abbrev_to_npy_unit(unit or "ns"))
-        with warnings.catch_warnings():
-            # Suppress RuntimeWarning about All-NaN slice
-            warnings.filterwarnings(
-                "ignore", "invalid value encountered in cast", RuntimeWarning
-            )
-            base = data.astype(np.int64)
-        frac = data - base
-        if p:
-            frac = np.round(frac, p)
-        with warnings.catch_warnings():
-            warnings.filterwarnings(
-                "ignore", "invalid value encountered in cast", RuntimeWarning
-            )
-            data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
+
+        data = cast_from_unit_vectorized(data, unit or "ns")
         data[mask] = iNaT
+        data = data.view("m8[ns]")
         copy = False
 
     elif lib.is_np_dtype(data.dtype, "m"):
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -26,12 +26,10 @@
     Timestamp,
     astype_overflowsafe,
     get_unit_from_dtype,
-    iNaT,
     is_supported_unit,
     timezones as libtimezones,
 )
-from pandas._libs.tslibs.conversion import precision_from_unit
-from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
+from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
 from pandas._libs.tslibs.parsing import (
     DateParseError,
     guess_datetime_format,
@@ -551,23 +549,19 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
             tz_parsed = None
 
         elif arg.dtype.kind == "f":
-            mult, _ = precision_from_unit(abbrev_to_npy_unit(unit))
-
-            mask = np.isnan(arg) | (arg == iNaT)
-            fvalues = (arg * mult).astype("f8", copy=False)
-            fvalues[mask] = 0
-
-            if (fvalues < Timestamp.min._value).any() or (
-                fvalues > Timestamp.max._value
-            ).any():
-                if errors != "raise":
-                    arg = arg.astype(object)
-                    return _to_datetime_with_unit(arg, unit, name, utc, errors)
-                raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")
-
-            arr = fvalues.astype("M8[ns]", copy=False)
-            arr[mask] = np.datetime64("NaT", "ns")
-
+            with np.errstate(over="raise"):
+                try:
+                    arr = cast_from_unit_vectorized(arg, unit=unit)
+                except OutOfBoundsDatetime:
+                    if errors != "raise":
+                        return _to_datetime_with_unit(
+                            arg.astype(object), unit, name, utc, errors
+                        )
+                    raise OutOfBoundsDatetime(
+                        f"cannot convert input with unit '{unit}'"
+                    )
+
+            arr = arr.view("M8[ns]")
             tz_parsed = None
         else:
             arg = arg.astype(object, copy=False)
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
@@ -187,7 +187,18 @@ def test_date_time(datapath):
         fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"]
     )
     # GH 19732: Timestamps imported from sas will incur floating point errors
-    df[df.columns[3]] = df.iloc[:, 3].dt.round("us")
+    # 2023-11-16 we don't know the correct "expected" result bc we do not have
+    #  access to SAS to read the sas7bdat file. We are really just testing
+    #  that we are "close". This only seems to be an issue near the
+    #  implementation bounds.
+    res = df.iloc[:, 3].dt.round("us").copy()
+
+    # the first and last elements are near the implementation bounds, where we
+    #  would expect floating point error to occur.
+    res.iloc[0] -= pd.Timedelta(microseconds=1)
+    res.iloc[-1] += pd.Timedelta(microseconds=1)
+
+    df["DateTimeHi"] = res
     tm.assert_frame_equal(df, df0)
 
 
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
@@ -1864,16 +1864,14 @@ def test_to_datetime_month_or_year_unit_int(self, cache, unit, item, request):
         result = to_datetime(np.array([item], dtype=object), unit=unit, cache=cache)
         tm.assert_index_equal(result, expected)
 
-        # TODO: this should also work
-        if isinstance(item, float):
-            request.applymarker(
-                pytest.mark.xfail(
-                    reason=f"{type(item).__name__} in np.array should work"
-                )
-            )
         result = to_datetime(np.array([item]), unit=unit, cache=cache)
         tm.assert_index_equal(result, expected)
 
+        # with a nan!
+        result = to_datetime(np.array([item, np.nan]), unit=unit, cache=cache)
+        assert result.isna()[1]
+        tm.assert_index_equal(result[:1], expected)
+
     @pytest.mark.parametrize("unit", ["Y", "M"])
     def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
         # GH#50301
@@ -1883,6 +1881,8 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
         msg = f"Conversion of non-round float with unit={unit} is ambiguous"
         with pytest.raises(ValueError, match=msg):
             to_datetime([1.5], unit=unit, errors="raise")
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(np.array([1.5]), unit=unit, errors="raise")
         with pytest.raises(ValueError, match=msg):
             with tm.assert_produces_warning(FutureWarning, match=warn_msg):
                 to_datetime(["1.5"], unit=unit, errors="raise")
@@ -2030,10 +2030,14 @@ def test_unit_mixed(self, cache, arr):
     def test_unit_rounding(self, cache):
         # GH 14156 & GH 20445: argument will incur floating point errors
         # but no premature rounding
-        result = to_datetime(1434743731.8770001, unit="s", cache=cache)
-        expected = Timestamp("2015-06-19 19:55:31.877000192")
+        value = 1434743731.8770001
+        result = to_datetime(value, unit="s", cache=cache)
+        expected = Timestamp("2015-06-19 19:55:31.877000093")
         assert result == expected
 
+        alt = Timestamp(value, unit="s")
+        assert alt == result
+
     def test_unit_ignore_keeps_name(self, cache):
         # GH 21697
         expected = Index([15e9] * 2, name="name")
diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py
@@ -6,6 +6,7 @@
 import numpy as np
 import pytest
 
+from pandas.compat import IS64
 from pandas.errors import OutOfBoundsTimedelta
 
 import pandas as pd
@@ -232,6 +233,7 @@ def test_to_timedelta_on_missing_values_list(self, val):
         actual = to_timedelta([val])
         assert actual[0]._value == np.timedelta64("NaT").astype("int64")
 
+    @pytest.mark.xfail(not IS64, reason="Floating point error")
     def test_to_timedelta_float(self):
         # https://github.com/pandas-dev/pandas/issues/25077
         arr = np.arange(0, 1, 1e-6)[-10:]

Original file line number	Diff line number	Diff line change
`@@ -461,6 +461,7 @@ Datetimelike`
`461`	`461`	- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`)
`462`	`462`	- Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`)
`463`	`463`	- Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`)
	`464`	+- Bug in the results of :func:`pd.to_datetime` with a floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`)
`464`	`465`	`-`
`465`	`466`
`466`	`467`	`Timedelta`
Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1`
`45`	`45`
`46`	`46`	`cpdef datetime localize_pydatetime(datetime dt, tzinfo tz)`
`47`	`47`	`cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1`
`48`		`-cpdef (int64_t, int) precision_from_unit(`
	`48`	`+cdef (int64_t, int) precision_from_unit(`
`49`	`49`	`NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=*`
`50`	`50`	`)`
`51`	`51`