Skip to content

API: retain non-nano timedelta64 dtype in DataFrame/Series/Index constructors #49014

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Oct 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ Other API changes
- Default value of ``dtype`` in :func:`get_dummies` is changed to ``bool`` from ``uint8`` (:issue:`45848`)
- :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting datetime64 data to any of "datetime64[s]", "datetime64[ms]", "datetime64[us]" will return an object with the given resolution instead of coercing back to "datetime64[ns]" (:issue:`48928`)
- :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting timedelta64 data to any of "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]" will return an object with the given resolution instead of coercing to "float64" dtype (:issue:`48963`)
- Passing data with dtype of "timedelta64[s]", "timedelta64[ms]", or "timedelta64[us]" to :class:`TimedeltaIndex`, :class:`Series`, or :class:`DataFrame` constructors will now retain that dtype instead of casting to "timedelta64[ns]"; timedelta64 data with lower resolution will be cast to the lowest supported resolution "timedelta64[s]" (:issue:`49014`)
- Passing ``dtype`` of "timedelta64[s]", "timedelta64[ms]", or "timedelta64[us]" to :class:`TimedeltaIndex`, :class:`Series`, or :class:`DataFrame` constructors will now retain that dtype instead of casting to "timedelta64[ns]"; a ``dtype`` with lower resolution passed to :class:`Series` or :class:`DataFrame` will be cast to the lowest supported resolution "timedelta64[s]" (:issue:`49014`)
- Passing a ``np.datetime64`` object with non-nanosecond resolution to :class:`Timestamp` will retain the input resolution if it is "s", "ms", "us", or "ns"; otherwise it will be cast to the closest supported resolution (:issue:`49008`)
-

Expand Down
4 changes: 4 additions & 0 deletions pandas/_libs/tslibs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,17 @@
"periods_per_day",
"periods_per_second",
"is_supported_unit",
"npy_unit_to_abbrev",
"get_supported_reso",
]

from pandas._libs.tslibs import dtypes
from pandas._libs.tslibs.conversion import localize_pydatetime
from pandas._libs.tslibs.dtypes import (
Resolution,
get_supported_reso,
is_supported_unit,
npy_unit_to_abbrev,
periods_per_day,
periods_per_second,
)
Expand Down
38 changes: 27 additions & 11 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@
Tick,
Timedelta,
astype_overflowsafe,
get_supported_reso,
get_unit_from_dtype,
iNaT,
is_supported_unit,
npy_unit_to_abbrev,
periods_per_second,
to_offset,
)
Expand Down Expand Up @@ -197,28 +199,29 @@ def _simple_new( # type: ignore[override]
return result

@classmethod
def _from_sequence(
cls, data, *, dtype=TD64NS_DTYPE, copy: bool = False
) -> TimedeltaArray:
def _from_sequence(cls, data, *, dtype=None, copy: bool = False) -> TimedeltaArray:
if dtype:
_validate_td64_dtype(dtype)
dtype = _validate_td64_dtype(dtype)

data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=None)
freq, _ = dtl.validate_inferred_freq(None, inferred_freq, False)

if dtype is not None:
data = astype_overflowsafe(data, dtype=dtype, copy=False)

return cls._simple_new(data, dtype=data.dtype, freq=freq)

@classmethod
def _from_sequence_not_strict(
cls,
data,
dtype=TD64NS_DTYPE,
dtype=None,
copy: bool = False,
freq=lib.no_default,
unit=None,
) -> TimedeltaArray:
if dtype:
_validate_td64_dtype(dtype)
dtype = _validate_td64_dtype(dtype)

assert unit not in ["Y", "y", "M"] # caller is responsible for checking

Expand All @@ -232,6 +235,9 @@ def _from_sequence_not_strict(
if explicit_none:
freq = None

if dtype is not None:
data = astype_overflowsafe(data, dtype=dtype, copy=False)

result = cls._simple_new(data, dtype=data.dtype, freq=freq)

if inferred_freq is None and freq is not None:
Expand Down Expand Up @@ -944,9 +950,13 @@ def sequence_to_td64ns(
copy = False

elif is_timedelta64_dtype(data.dtype):
if data.dtype != TD64NS_DTYPE:
# non-nano unit
data = astype_overflowsafe(data, dtype=TD64NS_DTYPE)
data_unit = get_unit_from_dtype(data.dtype)
if not is_supported_unit(data_unit):
# cast to closest supported unit, i.e. s or ns
new_reso = get_supported_reso(data_unit)
new_unit = npy_unit_to_abbrev(new_reso)
new_dtype = np.dtype(f"m8[{new_unit}]")
data = astype_overflowsafe(data, dtype=new_dtype, copy=False)
copy = False

else:
Expand All @@ -955,7 +965,9 @@ def sequence_to_td64ns(

data = np.array(data, copy=copy)

assert data.dtype == "m8[ns]", data
assert data.dtype.kind == "m"
assert data.dtype != "m8" # i.e. not unit-less

return data, inferred_freq


Expand Down Expand Up @@ -1045,7 +1057,11 @@ def _validate_td64_dtype(dtype) -> DtypeObj:
)
raise ValueError(msg)

if not is_dtype_equal(dtype, TD64NS_DTYPE):
if (
not isinstance(dtype, np.dtype)
or dtype.kind != "m"
or not is_supported_unit(get_unit_from_dtype(dtype))
):
raise ValueError(f"dtype {dtype} cannot be converted to timedelta64[ns]")

return dtype
21 changes: 16 additions & 5 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@
Timedelta,
Timestamp,
astype_overflowsafe,
get_supported_reso,
get_unit_from_dtype,
is_supported_unit,
npy_unit_to_abbrev,
)
from pandas._libs.tslibs.timedeltas import array_to_timedelta64
from pandas._typing import (
Expand Down Expand Up @@ -1456,8 +1460,11 @@ def _ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
"""
Convert dtypes with granularity less than nanosecond to nanosecond

>>> _ensure_nanosecond_dtype(np.dtype("M8[s]"))
dtype('<M8[ns]')
>>> _ensure_nanosecond_dtype(np.dtype("M8[D]"))
dtype('<M8[s]')

>>> _ensure_nanosecond_dtype(np.dtype("M8[us]"))
dtype('<M8[us]')

>>> _ensure_nanosecond_dtype(np.dtype("m8[ps]"))
Traceback (most recent call last):
Expand All @@ -1476,13 +1483,15 @@ def _ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
# i.e. datetime64tz
pass

elif dtype.kind == "M" and dtype != DT64NS_DTYPE:
elif dtype.kind == "M" and not is_supported_unit(get_unit_from_dtype(dtype)):
# pandas supports dtype whose granularity is less than [ns]
# e.g., [ps], [fs], [as]
if dtype <= np.dtype("M8[ns]"):
if dtype.name == "datetime64":
raise ValueError(msg)
dtype = DT64NS_DTYPE
reso = get_supported_reso(get_unit_from_dtype(dtype))
unit = npy_unit_to_abbrev(reso)
dtype = np.dtype(f"M8[{unit}]")
else:
raise TypeError(f"cannot convert datetimelike to dtype [{dtype}]")

Expand All @@ -1492,7 +1501,9 @@ def _ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj:
if dtype <= np.dtype("m8[ns]"):
if dtype.name == "timedelta64":
raise ValueError(msg)
dtype = TD64NS_DTYPE
reso = get_supported_reso(get_unit_from_dtype(dtype))
unit = npy_unit_to_abbrev(reso)
dtype = np.dtype(f"m8[{unit}]")
else:
raise TypeError(f"cannot convert timedeltalike to dtype [{dtype}]")
return dtype
Expand Down
3 changes: 1 addition & 2 deletions pandas/core/indexes/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from pandas._typing import DtypeObj

from pandas.core.dtypes.common import (
TD64NS_DTYPE,
is_scalar,
is_timedelta64_dtype,
)
Expand Down Expand Up @@ -121,7 +120,7 @@ def __new__(
unit=None,
freq=lib.no_default,
closed=None,
dtype=TD64NS_DTYPE,
dtype=None,
copy: bool = False,
name=None,
):
Expand Down
17 changes: 10 additions & 7 deletions pandas/tests/arithmetic/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,11 @@ def test_mul_td64arr(self, left, box_cls):
right = np.array([1, 2, 3], dtype="m8[s]")
right = box_cls(right)

expected = TimedeltaIndex(["10s", "40s", "90s"])
expected = TimedeltaIndex(["10s", "40s", "90s"], dtype=right.dtype)

if isinstance(left, Series) or box_cls is Series:
expected = Series(expected)
assert expected.dtype == right.dtype

result = left * right
tm.assert_equal(result, expected)
Expand All @@ -171,9 +173,10 @@ def test_div_td64arr(self, left, box_cls):
right = np.array([10, 40, 90], dtype="m8[s]")
right = box_cls(right)

expected = TimedeltaIndex(["1s", "2s", "3s"])
expected = TimedeltaIndex(["1s", "2s", "3s"], dtype=right.dtype)
if isinstance(left, Series) or box_cls is Series:
expected = Series(expected)
assert expected.dtype == right.dtype

result = right / left
tm.assert_equal(result, expected)
Expand Down Expand Up @@ -206,12 +209,12 @@ def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box_with_array):
box = box_with_array
index = numeric_idx
expected = TimedeltaIndex([Timedelta(days=n) for n in range(len(index))])
if isinstance(scalar_td, np.timedelta64) and box not in [Index, Series]:
if isinstance(scalar_td, np.timedelta64):
# TODO(2.0): once TDA.astype converts to m8, just do expected.astype
tda = expected._data
dtype = scalar_td.dtype
expected = type(tda)._simple_new(tda._ndarray.astype(dtype), dtype=dtype)
elif type(scalar_td) is timedelta and box not in [Index, Series]:
elif type(scalar_td) is timedelta:
# TODO(2.0): once TDA.astype converts to m8, just do expected.astype
tda = expected._data
dtype = np.dtype("m8[us]")
Expand Down Expand Up @@ -247,7 +250,7 @@ def test_numeric_arr_mul_tdscalar_numexpr_path(
obj = tm.box_expected(arr, box, transpose=False)

expected = arr_i8.view("timedelta64[D]").astype("timedelta64[ns]")
if type(scalar_td) is timedelta and box is array:
if type(scalar_td) is timedelta:
# TODO(2.0): this shouldn't depend on 'box'
expected = expected.astype("timedelta64[us]")
# TODO(2.0): won't be necessary to construct TimedeltaArray
Expand All @@ -268,15 +271,15 @@ def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box_with_array
index = numeric_idx[1:3]

expected = TimedeltaIndex(["3 Days", "36 Hours"])
if isinstance(three_days, np.timedelta64) and box not in [Index, Series]:
if isinstance(three_days, np.timedelta64):
# TODO(2.0): just use expected.astype
tda = expected._data
dtype = three_days.dtype
if dtype < np.dtype("m8[s]"):
# i.e. resolution is lower -> use lowest supported resolution
dtype = np.dtype("m8[s]")
expected = type(tda)._simple_new(tda._ndarray.astype(dtype), dtype=dtype)
elif type(three_days) is timedelta and box not in [Index, Series]:
elif type(three_days) is timedelta:
# TODO(2.0): just use expected.astype
tda = expected._data
dtype = np.dtype("m8[us]")
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def test_array_copy():
),
(
np.array([1, 2], dtype="m8[us]"),
TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")),
TimedeltaArray(np.array([1, 2], dtype="m8[us]")),
),
# integer
([1, 2], IntegerArray._from_sequence([1, 2])),
Expand Down
16 changes: 11 additions & 5 deletions pandas/tests/frame/methods/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,15 +484,21 @@ def test_astype_to_timedelta_unit(self, unit):
dtype = f"m8[{unit}]"
arr = np.array([[1, 2, 3]], dtype=dtype)
df = DataFrame(arr)
result = df.astype(dtype)
if unit in ["us", "ms", "s"]:
assert (df.dtypes == dtype).all()
else:
# We get the nearest supported unit, i.e. "s"
assert (df.dtypes == "m8[s]").all()

result = df.astype(dtype)
if unit in ["m", "h", "D"]:
# We don't support these, so we use the old logic to convert to float
# We don't support these, so we use the pre-2.0 logic to convert to float
# (xref GH#48979)

expected = DataFrame(df.values.astype(dtype).astype(float))
else:
tda = pd.core.arrays.TimedeltaArray._simple_new(arr, dtype=arr.dtype)
expected = DataFrame(tda)
assert (expected.dtypes == dtype).all()
# The conversion is a no-op, so we just get a copy
expected = df

tm.assert_frame_equal(result, expected)

Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/frame/test_block_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,14 +215,15 @@ def test_construction_with_mixed(self, float_string_frame):

def test_construction_with_conversions(self):

# convert from a numpy array of non-ns timedelta64
# convert from a numpy array of non-ns timedelta64; as of 2.0 this does
# *not* convert
arr = np.array([1, 2, 3], dtype="timedelta64[s]")
df = DataFrame(index=range(3))
df["A"] = arr
expected = DataFrame(
{"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3)
)
tm.assert_frame_equal(df, expected)
tm.assert_numpy_array_equal(df["A"].to_numpy(), arr)

expected = DataFrame(
{
Expand Down
30 changes: 20 additions & 10 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2075,18 +2075,19 @@ def test_constructor_datetimes_non_ns(self, order, dtype):

@pytest.mark.parametrize("order", ["K", "A", "C", "F"])
@pytest.mark.parametrize(
"dtype",
"unit",
[
"timedelta64[D]",
"timedelta64[h]",
"timedelta64[m]",
"timedelta64[s]",
"timedelta64[ms]",
"timedelta64[us]",
"timedelta64[ns]",
"D",
"h",
"m",
"s",
"ms",
"us",
"ns",
],
)
def test_constructor_timedelta_non_ns(self, order, dtype):
def test_constructor_timedelta_non_ns(self, order, unit):
dtype = f"timedelta64[{unit}]"
na = np.array(
[
[np.timedelta64(1, "D"), np.timedelta64(2, "D")],
Expand All @@ -2095,13 +2096,22 @@ def test_constructor_timedelta_non_ns(self, order, dtype):
dtype=dtype,
order=order,
)
df = DataFrame(na).astype("timedelta64[ns]")
df = DataFrame(na)
if unit in ["D", "h", "m"]:
# we get the nearest supported unit, i.e. "s"
exp_unit = "s"
else:
exp_unit = unit
exp_dtype = np.dtype(f"m8[{exp_unit}]")
expected = DataFrame(
[
[Timedelta(1, "D"), Timedelta(2, "D")],
[Timedelta(4, "D"), Timedelta(5, "D")],
],
dtype=exp_dtype,
)
# TODO(2.0): ideally we should get the same 'expected' without passing
# dtype=exp_dtype.
tm.assert_frame_equal(df, expected)

def test_constructor_for_list_with_dtypes(self):
Expand Down
Loading