Implement DatetimeArray._from_sequence #24074

Merged (9 commits, Dec 5, 2018)
120 changes: 89 additions & 31 deletions pandas/core/arrays/datetimes.py
@@ -14,9 +14,9 @@
from pandas.util._decorators import Appender, cache_readonly

from pandas.core.dtypes.common import (
- _NS_DTYPE, is_datetime64_dtype, is_datetime64tz_dtype, is_extension_type,
- is_float_dtype, is_int64_dtype, is_object_dtype, is_period_dtype,
- is_timedelta64_dtype)
_INT64_DTYPE, _NS_DTYPE, is_datetime64_dtype, is_datetime64tz_dtype,
is_dtype_equal, is_extension_type, is_float_dtype, is_int64_dtype,
is_object_dtype, is_period_dtype, is_string_dtype, is_timedelta64_dtype)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna
@@ -206,45 +206,103 @@ def _simple_new(cls, values, freq=None, tz=None):
result._tz = timezones.tz_standardize(tz)
return result

- def __new__(cls, values, freq=None, tz=None, dtype=None):
def __new__(cls, values, freq=None, tz=None, dtype=None, copy=False,
dayfirst=False, yearfirst=False, ambiguous='raise'):
return cls._from_sequence(
values, freq=freq, tz=tz, dtype=dtype, copy=copy,
dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous)

- if freq is None and hasattr(values, "freq"):
-     # i.e. DatetimeArray, DatetimeIndex
-     freq = values.freq
@classmethod
Contributor: is there a reason you don't want to add verify_integrity here (as maybe _verify_integrity=True)?

Member Author: We've deprecated the kwarg in the DatetimeIndex constructor to get rid of it. In cases where verify_integrity is not needed, a different constructor (e.g. _simple_new) should be used.
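A minimal sketch of the constructor split discussed in this thread (an illustration against the pandas 0.24 development internals, not part of the diff):

import numpy as np

from pandas.core.arrays.datetimes import DatetimeArrayMixin as DatetimeArray

# Validating path: parses strings, resolves tz, and verifies that the
# data actually conform to the requested freq.
arr = DatetimeArray._from_sequence(["2018-01-01", "2018-01-02"], freq="D")

# Trusted path: the caller guarantees M8[ns] values and a consistent freq,
# so no integrity verification is performed.
raw = np.array(["2018-01-01", "2018-01-02"], dtype="M8[ns]")
fast = DatetimeArray._simple_new(raw, freq=arr.freq)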

def _from_sequence(cls, data, dtype=None, copy=False,
tz=None, freq=None,
dayfirst=False, yearfirst=False, ambiguous='raise'):

verify_integrity = True

freq, freq_infer = dtl.maybe_infer_freq(freq)
if freq is None and hasattr(data, "freq"):
# i.e. DatetimeArray/Index
freq = data.freq
verify_integrity = False

# if dtype has an embedded tz, capture it
tz = dtl.validate_tz_from_dtype(dtype, tz)

- if is_object_dtype(values):
-     # kludge; dispatch until the DatetimeArray constructor is complete
-     from pandas import DatetimeIndex
-     values = DatetimeIndex(values, freq=freq, tz=tz)

- if isinstance(values, ABCSeries):
-     # extract to ndarray or DatetimeIndex
-     values = values._values

- if isinstance(values, DatetimeArrayMixin):
-     # extract nanosecond unix timestamps
-     if tz is None:
-         tz = values.tz
-     values = values.asi8

- if values.dtype == 'i8':
-     values = values.view('M8[ns]')
if not hasattr(data, "dtype"):
# e.g. list, tuple
if np.ndim(data) == 0:
# i.e. generator
data = list(data)
data = np.asarray(data)
copy = False
elif isinstance(data, ABCSeries):
data = data._values

# By this point we are assured to have either a numpy array or Index
data, copy = maybe_convert_dtype(data, copy)

if is_object_dtype(data) or is_string_dtype(data):
# TODO: We do not have tests specific to string-dtypes,
# also complex or categorical or other extension
copy = False
if lib.infer_dtype(data) == 'integer':
data = data.astype(np.int64)
else:
# data comes back here as either i8 to denote UTC timestamps
# or M8[ns] to denote wall times
data, inferred_tz = objects_to_datetime64ns(
data, dayfirst=dayfirst, yearfirst=yearfirst)
tz = maybe_infer_tz(tz, inferred_tz)

if is_datetime64tz_dtype(data):
tz = maybe_infer_tz(tz, data.tz)
subarr = data._data

elif is_datetime64_dtype(data):
# tz-naive DatetimeArray/Index or ndarray[datetime64]
data = getattr(data, "_data", data)
if data.dtype != _NS_DTYPE:
data = conversion.ensure_datetime64ns(data)

if tz is not None:
# Convert tz-naive to UTC
tz = timezones.maybe_get_tz(tz)
data = conversion.tz_localize_to_utc(data.view('i8'), tz,
ambiguous=ambiguous)
data = data.view(_NS_DTYPE)

assert data.dtype == _NS_DTYPE, data.dtype
subarr = data

- assert isinstance(values, np.ndarray), type(values)
- assert is_datetime64_dtype(values)  # not yet assured nanosecond
- values = conversion.ensure_datetime64ns(values, copy=False)
else:
# must be integer dtype otherwise
# assume this data are epoch timestamps
if data.dtype != _INT64_DTYPE:
data = data.astype(np.int64, copy=False)
subarr = data.view(_NS_DTYPE)
Contributor: might be able to pull subarr = data.view(_NS_DTYPE) out of the if/else and just do it always (or maybe just do it in _simple_new), but this is for later

Member Author: That's what I tried originally, but it breaks the recently-implemented tests.arrays.test_datetimelike.test_from_array_keeps_base (#23956)

Contributor: Hrm, sorry. FWIW, that was a hackish thing that I introduced to avoid segfaults. If the reduction code was doing the right thing, we may be able to remove that test / requirement. But I'm not sure what the right thing is.

Member Author: For the purposes of this PR it sounds like the options are to either remove that test and un-indent this line, or keep this line how it is. It sounds like the first option would cause problems in your DTA branch. I don't have a strong preference here.
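For context on the identity concern behind test_from_array_keeps_base (an illustration, not from the PR): ndarray.view always allocates a new wrapper object, even when the dtype is unchanged, so hoisting the view out of the integer branch would replace an input that is already M8[ns] instead of passing it through untouched.

import numpy as np

data = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]")
view = data.view("M8[ns]")
assert view is not data   # a view is always a new array object...
assert view.base is data  # ...sharing the original buffer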


if copy:
# TODO: should this be deepcopy?
subarr = subarr.copy()

assert isinstance(subarr, np.ndarray), type(subarr)
assert subarr.dtype == 'M8[ns]', subarr.dtype

result = cls._simple_new(subarr, freq=freq, tz=tz)
if dtype is not None:
# TODO: can we work this into validate_tz_from_dtype?
if not is_dtype_equal(result.dtype, dtype):
# dtype must be coerced to DatetimeTZDtype above
if result.tz is not None:
raise ValueError("cannot localize from non-UTC data")

if verify_integrity and len(result) > 0:
if freq is not None and not freq_infer:
cls._validate_frequency(result, freq, ambiguous=ambiguous)

- result = cls._simple_new(values, freq=freq, tz=tz)
if freq_infer:
result.freq = to_offset(result.inferred_freq)

- # NB: Among other things not yet ported from the DatetimeIndex
- # constructor, this does not call _deepcopy_if_needed
return result

@classmethod
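To make the branches above concrete, a few of the input shapes the new _from_sequence normalizes to M8[ns] (hypothetical usage of the internal API, not from the PR):

import numpy as np

from pandas.core.arrays.datetimes import DatetimeArrayMixin as DatetimeArray

DatetimeArray._from_sequence(["2018-12-05", "2018-12-06"])       # object strings
DatetimeArray._from_sequence(np.array([0, 10], dtype=np.int64))  # epoch nanoseconds
DatetimeArray._from_sequence(iter(["2018-12-05"]))               # generator/iterator
DatetimeArray._from_sequence(["2018-12-05"], tz="US/Eastern")    # wall times, localized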
Expand Down Expand Up @@ -1494,7 +1552,7 @@ def maybe_convert_dtype(data, copy):
elif is_timedelta64_dtype(data):
warnings.warn("Passing timedelta64-dtype data is deprecated, will "
"raise a TypeError in a future version",
- FutureWarning, stacklevel=3)
FutureWarning, stacklevel=4)
data = data.view(_NS_DTYPE)

elif is_period_dtype(data):
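The stacklevel bump presumably accounts for the extra _from_sequence frame that now sits between the DatetimeIndex constructor and this helper, keeping the FutureWarning pointed at user code. A sketch of the deprecated path it guards (an illustration, not from the PR):

import warnings

import numpy as np
import pandas as pd

td64 = np.array([0, 86400 * 10 ** 9], dtype="m8[ns]")
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")
    pd.DatetimeIndex(td64)  # timedelta64 input is deprecated; viewed as M8[ns]
assert any(issubclass(x.category, FutureWarning) for x in w)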
111 changes: 21 additions & 90 deletions pandas/core/indexes/datetimes.py
@@ -9,24 +9,19 @@

from pandas._libs import (
Timestamp, index as libindex, join as libjoin, lib, tslib as libts)
- from pandas._libs.tslibs import (
-     ccalendar, conversion, fields, parsing, timezones)
from pandas._libs.tslibs import ccalendar, fields, parsing, timezones
import pandas.compat as compat
from pandas.util._decorators import Appender, Substitution, cache_readonly

from pandas.core.dtypes.common import (
- _INT64_DTYPE, _NS_DTYPE, ensure_int64, is_datetime64_dtype,
- is_datetime64_ns_dtype, is_datetime64tz_dtype, is_dtype_equal, is_float,
- is_integer, is_list_like, is_object_dtype, is_period_dtype, is_scalar,
- is_string_dtype, is_string_like, pandas_dtype)
_NS_DTYPE, ensure_int64, is_datetime64_ns_dtype, is_dtype_equal, is_float,
is_integer, is_list_like, is_period_dtype, is_scalar, is_string_like,
pandas_dtype)
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.generic import ABCSeries
from pandas.core.dtypes.missing import isna

from pandas.core.arrays import datetimelike as dtl
from pandas.core.arrays.datetimes import (
- DatetimeArrayMixin as DatetimeArray, _to_m8, maybe_convert_dtype,
- maybe_infer_tz, objects_to_datetime64ns)
DatetimeArrayMixin as DatetimeArray, _to_m8)
from pandas.core.base import _shared_docs
import pandas.core.common as com
from pandas.core.indexes.base import Index, _index_shared_docs
@@ -49,12 +44,17 @@ def _new_DatetimeIndex(cls, d):
# so need to localize
tz = d.pop('tz', None)

- with warnings.catch_warnings():
-     # we ignore warnings from passing verify_integrity=False
-     # TODO: If we knew what was going in to **d, we might be able to
-     # go through _simple_new instead
-     warnings.simplefilter("ignore")
-     result = cls.__new__(cls, verify_integrity=False, **d)
if "data" in d and not isinstance(d["data"], DatetimeIndex):
Contributor: Could you explain the reasoning behind this change? I don't see why just DatetimeIndex would need to have the integrity verified.

Member Author: Without this we get a couple of failing pickle-based tests: tests.frame.test_block_internals.test_pickle and tests.series.test_timeseries.test_pickle

Contributor: Gotcha, I've been fighting those errors too. I suspect they were working before because DatetimeIndex accepts data=None for range-based constructors, which we don't want for the arrays.

Contributor: this is because you need to define __reduce__ on ExtensionArray. This should never be hit by a non-DTI.

Member Author: It isn't. The issue is that this function calls DatetimeIndex.__new__ with verify_integrity=False (since it is unpickling a previously-constructed DTI, integrity has presumably already been verified, so we can skip that somewhat-costly step), and the pickle-tested cases raise ValueError because when we try to verify their integrity they fail.

Member Author: This is fixed by #24096

Contributor: let's fix that one first then. this needs to be changed here.

Contributor: The changes over in #24096 seem to be... different? I don't know how to explain it, but doesn't the fact that we're having to copy data over in #24096 seem disconnected from pickling?

Member Author: Pickling turns out to be only tangentially related to the "real" problem. In the status quo, altering a datetime64tz column alters the DatetimeIndex that backs it, but doesn't set its freq to None. When that DataFrame is pickled and then unpickled, it tries to reconstruct that DatetimeIndex, but is passing arguments that should raise ValueError. ATM that gets side-stepped by passing verify_integrity=False.

So the goal of #24096 is to not corrupt the DatetimeIndex in the first place, making verify_integrity=False unnecessary.

That's still pretty roundabout. Any clearer?

Contributor: Yeah, that does help.

# Avoid need to verify integrity by calling simple_new directly
data = d.pop("data")
result = cls._simple_new(data, **d)
else:
with warnings.catch_warnings():
# we ignore warnings from passing verify_integrity=False
# TODO: If we knew what was going in to **d, we might be able to
# go through _simple_new instead
warnings.simplefilter("ignore")
result = cls.__new__(cls, verify_integrity=False, **d)

if tz is not None:
result = result.tz_localize('UTC').tz_convert(tz)
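For reference, the round-trip _new_DatetimeIndex exists to support: pickling stores the index's data and metadata, and unpickling rebuilds it through this function without re-running the costly validation.

import pickle

import pandas as pd

dti = pd.date_range("2018-01-01", periods=3, freq="D", tz="US/Eastern")
restored = pickle.loads(pickle.dumps(dti))
assert restored.equals(dti)
assert restored.freq == dti.freq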
@@ -260,81 +260,12 @@ def __new__(cls, data=None,
if name is None and hasattr(data, 'name'):
name = data.name

- freq, freq_infer = dtl.maybe_infer_freq(freq)
- if freq is None and hasattr(data, "freq"):
-     # i.e. DatetimeArray/Index
-     freq = data.freq
-     verify_integrity = False

- # if dtype has an embedded tz, capture it
- tz = dtl.validate_tz_from_dtype(dtype, tz)

- if not hasattr(data, "dtype"):
-     # e.g. list, tuple
-     if np.ndim(data) == 0:
-         # i.e. generator
-         data = list(data)
-     data = np.asarray(data)
-     copy = False
- elif isinstance(data, ABCSeries):
-     data = data._values

- # By this point we are assured to have either a numpy array or Index
- data, copy = maybe_convert_dtype(data, copy)

- if is_object_dtype(data) or is_string_dtype(data):
-     # TODO: We do not have tests specific to string-dtypes,
-     # also complex or categorical or other extension
-     copy = False
-     if lib.infer_dtype(data) == 'integer':
-         data = data.astype(np.int64)
-     else:
-         # data comes back here as either i8 to denote UTC timestamps
-         # or M8[ns] to denote wall times
-         data, inferred_tz = objects_to_datetime64ns(
-             data, dayfirst=dayfirst, yearfirst=yearfirst)
-         tz = maybe_infer_tz(tz, inferred_tz)

- if is_datetime64tz_dtype(data):
-     tz = maybe_infer_tz(tz, data.tz)
-     subarr = data._data

- elif is_datetime64_dtype(data):
-     # tz-naive DatetimeArray/Index or ndarray[datetime64]
-     data = getattr(data, "_data", data)
-     if data.dtype != _NS_DTYPE:
-         data = conversion.ensure_datetime64ns(data)

-     if tz is not None:
-         # Convert tz-naive to UTC
-         tz = timezones.maybe_get_tz(tz)
-         data = conversion.tz_localize_to_utc(data.view('i8'), tz,
-                                              ambiguous=ambiguous)
-     subarr = data.view(_NS_DTYPE)
dtarr = DatetimeArray._from_sequence(
data, dtype=dtype, copy=copy, tz=tz, freq=freq,
dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous)

- else:
-     # must be integer dtype otherwise
-     # assume this data are epoch timestamps
-     if data.dtype != _INT64_DTYPE:
-         data = data.astype(np.int64, copy=False)
-     subarr = data.view(_NS_DTYPE)

- assert isinstance(subarr, np.ndarray), type(subarr)
- assert subarr.dtype == 'M8[ns]', subarr.dtype

- subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz)
- if dtype is not None:
-     if not is_dtype_equal(subarr.dtype, dtype):
-         # dtype must be coerced to DatetimeTZDtype above
-         if subarr.tz is not None:
-             raise ValueError("cannot localize from non-UTC data")

- if verify_integrity and len(subarr) > 0:
-     if freq is not None and not freq_infer:
-         cls._validate_frequency(subarr, freq, ambiguous=ambiguous)

- if freq_infer:
-     subarr.freq = to_offset(subarr.inferred_freq)
subarr = cls._simple_new(dtarr._data, name=name,
freq=dtarr.freq, tz=dtarr.tz)

return subarr._deepcopy_if_needed(ref_to_data, copy)
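With __new__ reduced to this delegation, the index and array constructors share a single coercion path; a quick equivalence check (an illustration using the internal import aliased above, not from the PR):

import pandas as pd

from pandas.core.arrays.datetimes import DatetimeArrayMixin as DatetimeArray

data = ["2018-11-30", "2018-12-01"]
dti = pd.DatetimeIndex(data, tz="UTC")
dta = DatetimeArray._from_sequence(data, tz="UTC")
assert (dti.asi8 == dta.asi8).all()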

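And the copy semantics that the final _deepcopy_if_needed call preserves: with copy=True the resulting index must not share memory with the caller's buffer (a sketch, not from the PR):

import numpy as np
import pandas as pd

base = np.array(["2018-01-01", "2018-01-02"], dtype="M8[ns]")
dti = pd.DatetimeIndex(base, copy=True)
assert not np.shares_memory(dti.values, base)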