Skip to content

Commit be81303

Browse files
BUG: DatetimeIndex with non-nano dtype and mixed numeric inputs (#56004)
* BUG: DatetimeIndex with non-nano dtype and mixed numeric inputs
* revert
* update astype test
* fix test on 32bit
* Update doc/source/whatsnew/v2.2.0.rst

  Co-authored-by: Matthew Roeschke <[email protected]>
* Update doc/source/whatsnew/v2.2.0.rst

  Co-authored-by: Matthew Roeschke <[email protected]>

---------

Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 87d3fe4 commit be81303

File tree

5 files changed

+67
-8
lines changed

5 files changed

+67
-8
lines changed

doc/source/whatsnew/v2.2.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,7 @@ Datetimelike
356356
- Bug in addition or subtraction of very large :class:`Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`)
357357
- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`)
358358
- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`)
359+
- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype (or :class:`DatetimeTZDtype`) from mixed-numeric inputs, where the numeric values were treated as nanoseconds instead of as multiples of the dtype's unit (as already happens with non-mixed numeric inputs) (:issue:`56004`)
359360
-
360361

361362
Timedelta

pandas/_libs/tslib.pyx

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -530,7 +530,9 @@ cpdef array_to_datetime(
530530
state.update_creso(item_reso)
531531
if infer_reso:
532532
creso = state.creso
533-
iresult[i] = cast_from_unit(val, "ns", out_reso=creso)
533+
534+
# we now need to parse this as if unit=abbrev
535+
iresult[i] = cast_from_unit(val, abbrev, out_reso=creso)
534536
state.found_other = True
535537

536538
elif isinstance(val, str):
@@ -779,6 +781,13 @@ def array_to_datetime_with_tz(
779781
_TSObject tsobj
780782
bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC
781783
DatetimeParseState state = DatetimeParseState(creso)
784+
str abbrev
785+
786+
if infer_reso:
787+
# We treat ints/floats as nanoseconds
788+
abbrev = "ns"
789+
else:
790+
abbrev = npy_unit_to_abbrev(creso)
782791

783792
for i in range(n):
784793
# Analogous to `item = values[i]`
@@ -790,7 +799,12 @@ def array_to_datetime_with_tz(
790799

791800
else:
792801
tsobj = convert_to_tsobject(
793-
item, tz=tz, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst, nanos=0
802+
item,
803+
tz=tz,
804+
unit=abbrev,
805+
dayfirst=dayfirst,
806+
yearfirst=yearfirst,
807+
nanos=0,
794808
)
795809
if tsobj.value != NPY_NAT:
796810
state.update_creso(tsobj.creso)

pandas/core/arrays/datetimes.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2220,6 +2220,7 @@ def _sequence_to_dt64(
22202220
data = cast(np.ndarray, data)
22212221
copy = False
22222222
if lib.infer_dtype(data, skipna=False) == "integer":
2223+
# Much more performant than going through array_to_datetime
22232224
data = data.astype(np.int64)
22242225
elif tz is not None and ambiguous == "raise":
22252226
obj_data = np.asarray(data, dtype=object)

pandas/tests/indexes/datetimes/test_constructors.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,11 +1012,17 @@ def test_dti_constructor_with_non_nano_dtype(self, tz):
10121012
dtype = "M8[us]"
10131013
if tz is not None:
10141014
dtype = f"M8[us, {tz}]"
1015-
# NB: the 2500 is interpreted as nanoseconds and rounded *down*
1016-
# to 2 microseconds
10171015
vals = [ts, "2999-01-02 03:04:05.678910", 2500]
10181016
result = DatetimeIndex(vals, dtype=dtype)
1019-
exp_vals = [Timestamp(x, tz=tz).as_unit("us").asm8 for x in vals]
1017+
# The 2500 is interpreted as microseconds, consistent with what
1018+
# we would get if we created DatetimeIndexes from vals[:2] and vals[2:]
1019+
# and concatenated the results.
1020+
pointwise = [
1021+
vals[0].tz_localize(tz),
1022+
Timestamp(vals[1], tz=tz),
1023+
to_datetime(vals[2], unit="us", utc=True).tz_convert(tz),
1024+
]
1025+
exp_vals = [x.as_unit("us").asm8 for x in pointwise]
10201026
exp_arr = np.array(exp_vals, dtype="M8[us]")
10211027
expected = DatetimeIndex(exp_arr, dtype="M8[us]")
10221028
if tz is not None:
@@ -1054,6 +1060,36 @@ def test_dti_constructor_object_float_matches_float_dtype(self):
10541060
dti2 = DatetimeIndex(arr2, tz="CET")
10551061
tm.assert_index_equal(dti1, dti2)
10561062

1063+
@pytest.mark.parametrize("dtype", ["M8[us]", "M8[us, US/Pacific]"])
1064+
def test_dti_constructor_with_dtype_object_int_matches_int_dtype(self, dtype):
1065+
# Going through the object path should match the non-object path
1066+
1067+
vals1 = np.arange(5, dtype="i8") * 1000
1068+
vals1[0] = pd.NaT.value
1069+
1070+
vals2 = vals1.astype(np.float64)
1071+
vals2[0] = np.nan
1072+
1073+
vals3 = vals1.astype(object)
1074+
# change lib.infer_dtype(vals3) from "integer" so we go through
1075+
# array_to_datetime in _sequence_to_dt64
1076+
vals3[0] = pd.NaT
1077+
1078+
vals4 = vals2.astype(object)
1079+
1080+
res1 = DatetimeIndex(vals1, dtype=dtype)
1081+
res2 = DatetimeIndex(vals2, dtype=dtype)
1082+
res3 = DatetimeIndex(vals3, dtype=dtype)
1083+
res4 = DatetimeIndex(vals4, dtype=dtype)
1084+
1085+
expected = DatetimeIndex(vals1.view("M8[us]"))
1086+
if res1.tz is not None:
1087+
expected = expected.tz_localize("UTC").tz_convert(res1.tz)
1088+
tm.assert_index_equal(res1, expected)
1089+
tm.assert_index_equal(res2, expected)
1090+
tm.assert_index_equal(res3, expected)
1091+
tm.assert_index_equal(res4, expected)
1092+
10571093

10581094
class TestTimeSeries:
10591095
def test_dti_constructor_preserve_dti_freq(self):

pandas/tests/series/methods/test_astype.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
Timestamp,
2626
cut,
2727
date_range,
28+
to_datetime,
2829
)
2930
import pandas._testing as tm
3031

@@ -114,13 +115,19 @@ def test_astype_object_to_dt64_non_nano(self, tz):
114115
dtype = "M8[us]"
115116
if tz is not None:
116117
dtype = f"M8[us, {tz}]"
117-
# NB: the 2500 is interpreted as nanoseconds and rounded *down*
118-
# to 2 microseconds
119118
vals = [ts, "2999-01-02 03:04:05.678910", 2500]
120119
ser = Series(vals, dtype=object)
121120
result = ser.astype(dtype)
122121

123-
exp_vals = [Timestamp(x, tz=tz).as_unit("us").asm8 for x in vals]
122+
# The 2500 is interpreted as microseconds, consistent with what
123+
# we would get if we created DatetimeIndexes from vals[:2] and vals[2:]
124+
# and concatenated the results.
125+
pointwise = [
126+
vals[0].tz_localize(tz),
127+
Timestamp(vals[1], tz=tz),
128+
to_datetime(vals[2], unit="us", utc=True).tz_convert(tz),
129+
]
130+
exp_vals = [x.as_unit("us").asm8 for x in pointwise]
124131
exp_arr = np.array(exp_vals, dtype="M8[us]")
125132
expected = Series(exp_arr, dtype="M8[us]")
126133
if tz is not None:

0 commit comments

Comments
 (0)