BUG: DatetimeIndex with non-nano dtype and mixed numeric inputs (#56004)

jbrockmendel · mroeschke · web-flow · commit be81303bc301 · 2023-11-17T10:34:37.000-08:00
* BUG: DatetimeIndex with non-nano dtype and mixed numeric inputs

* revert

* update astype test

* fix test on 32bit

* Update doc/source/whatsnew/v2.2.0.rst

Co-authored-by: Matthew Roeschke &lt;10647082+mroeschke@users.noreply.github.com&gt;

* Update doc/source/whatsnew/v2.2.0.rst

Co-authored-by: Matthew Roeschke &lt;10647082+mroeschke@users.noreply.github.com&gt;

---------

Co-authored-by: Matthew Roeschke &lt;10647082+mroeschke@users.noreply.github.com&gt;
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -356,6 +356,7 @@ Datetimelike
 - Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`)
 - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`)
 - Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`)
+- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`)
 -
 
 Timedelta
diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
@@ -530,7 +530,9 @@ cpdef array_to_datetime(
                     state.update_creso(item_reso)
                     if infer_reso:
                         creso = state.creso
-                    iresult[i] = cast_from_unit(val, "ns", out_reso=creso)
+
+                    # we now need to parse this as if unit=abbrev
+                    iresult[i] = cast_from_unit(val, abbrev, out_reso=creso)
                     state.found_other = True
 
             elif isinstance(val, str):
@@ -779,6 +781,13 @@ def array_to_datetime_with_tz(
         _TSObject tsobj
         bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC
         DatetimeParseState state = DatetimeParseState(creso)
+        str abbrev
+
+    if infer_reso:
+        # We treat ints/floats as nanoseconds
+        abbrev = "ns"
+    else:
+        abbrev = npy_unit_to_abbrev(creso)
 
     for i in range(n):
         # Analogous to `item = values[i]`
@@ -790,7 +799,12 @@ def array_to_datetime_with_tz(
 
         else:
             tsobj = convert_to_tsobject(
-                item, tz=tz, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst, nanos=0
+                item,
+                tz=tz,
+                unit=abbrev,
+                dayfirst=dayfirst,
+                yearfirst=yearfirst,
+                nanos=0,
             )
             if tsobj.value != NPY_NAT:
                 state.update_creso(tsobj.creso)
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -2220,6 +2220,7 @@ def _sequence_to_dt64(
         data = cast(np.ndarray, data)
         copy = False
         if lib.infer_dtype(data, skipna=False) == "integer":
+            # Much more performant than going through array_to_datetime
             data = data.astype(np.int64)
         elif tz is not None and ambiguous == "raise":
             obj_data = np.asarray(data, dtype=object)
diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py
@@ -1012,11 +1012,17 @@ def test_dti_constructor_with_non_nano_dtype(self, tz):
         dtype = "M8[us]"
         if tz is not None:
             dtype = f"M8[us, {tz}]"
-        # NB: the 2500 is interpreted as nanoseconds and rounded *down*
-        # to 2 microseconds
         vals = [ts, "2999-01-02 03:04:05.678910", 2500]
         result = DatetimeIndex(vals, dtype=dtype)
-        exp_vals = [Timestamp(x, tz=tz).as_unit("us").asm8 for x in vals]
+        # The 2500 is interpreted as microseconds, consistent with what
+        #  we would get if we created DatetimeIndexes from vals[:2] and vals[2:]
+        #  and concated the results.
+        pointwise = [
+            vals[0].tz_localize(tz),
+            Timestamp(vals[1], tz=tz),
+            to_datetime(vals[2], unit="us", utc=True).tz_convert(tz),
+        ]
+        exp_vals = [x.as_unit("us").asm8 for x in pointwise]
         exp_arr = np.array(exp_vals, dtype="M8[us]")
         expected = DatetimeIndex(exp_arr, dtype="M8[us]")
         if tz is not None:
@@ -1054,6 +1060,36 @@ def test_dti_constructor_object_float_matches_float_dtype(self):
         dti2 = DatetimeIndex(arr2, tz="CET")
         tm.assert_index_equal(dti1, dti2)
 
+    @pytest.mark.parametrize("dtype", ["M8[us]", "M8[us, US/Pacific]"])
+    def test_dti_constructor_with_dtype_object_int_matches_int_dtype(self, dtype):
+        # Going through the object path should match the non-object path
+
+        vals1 = np.arange(5, dtype="i8") * 1000
+        vals1[0] = pd.NaT.value
+
+        vals2 = vals1.astype(np.float64)
+        vals2[0] = np.nan
+
+        vals3 = vals1.astype(object)
+        # change lib.infer_dtype(vals3) from "integer" so we go through
+        #  array_to_datetime in _sequence_to_dt64
+        vals3[0] = pd.NaT
+
+        vals4 = vals2.astype(object)
+
+        res1 = DatetimeIndex(vals1, dtype=dtype)
+        res2 = DatetimeIndex(vals2, dtype=dtype)
+        res3 = DatetimeIndex(vals3, dtype=dtype)
+        res4 = DatetimeIndex(vals4, dtype=dtype)
+
+        expected = DatetimeIndex(vals1.view("M8[us]"))
+        if res1.tz is not None:
+            expected = expected.tz_localize("UTC").tz_convert(res1.tz)
+        tm.assert_index_equal(res1, expected)
+        tm.assert_index_equal(res2, expected)
+        tm.assert_index_equal(res3, expected)
+        tm.assert_index_equal(res4, expected)
+
 
 class TestTimeSeries:
     def test_dti_constructor_preserve_dti_freq(self):
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
@@ -25,6 +25,7 @@
     Timestamp,
     cut,
     date_range,
+    to_datetime,
 )
 import pandas._testing as tm
 
@@ -114,13 +115,19 @@ def test_astype_object_to_dt64_non_nano(self, tz):
         dtype = "M8[us]"
         if tz is not None:
             dtype = f"M8[us, {tz}]"
-        # NB: the 2500 is interpreted as nanoseconds and rounded *down*
-        # to 2 microseconds
         vals = [ts, "2999-01-02 03:04:05.678910", 2500]
         ser = Series(vals, dtype=object)
         result = ser.astype(dtype)
 
-        exp_vals = [Timestamp(x, tz=tz).as_unit("us").asm8 for x in vals]
+        # The 2500 is interpreted as microseconds, consistent with what
+        #  we would get if we created DatetimeIndexes from vals[:2] and vals[2:]
+        #  and concated the results.
+        pointwise = [
+            vals[0].tz_localize(tz),
+            Timestamp(vals[1], tz=tz),
+            to_datetime(vals[2], unit="us", utc=True).tz_convert(tz),
+        ]
+        exp_vals = [x.as_unit("us").asm8 for x in pointwise]
         exp_arr = np.array(exp_vals, dtype="M8[us]")
         expected = Series(exp_arr, dtype="M8[us]")
         if tz is not None:

Original file line number	Diff line number	Diff line change
`@@ -356,6 +356,7 @@ Datetimelike`
`356`	`356`	- Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`)
`357`	`357`	- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`)
`358`	`358`	- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`)
	`359`	+- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`)
`359`	`360`	`-`
`360`	`361`
`361`	`362`	`Timedelta`