-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Implement DatetimeArray._from_sequence #24074
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
be745aa
d85aa7a
ad88a79
d8f8d85
e94cfff
96c8119
dbb9677
6ce2528
9d7cb39
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,9 +14,9 @@ | |
from pandas.util._decorators import Appender, cache_readonly | ||
|
||
from pandas.core.dtypes.common import ( | ||
_NS_DTYPE, is_datetime64_dtype, is_datetime64tz_dtype, is_extension_type, | ||
is_float_dtype, is_int64_dtype, is_object_dtype, is_period_dtype, | ||
is_timedelta64_dtype) | ||
_INT64_DTYPE, _NS_DTYPE, is_datetime64_dtype, is_datetime64tz_dtype, | ||
is_dtype_equal, is_extension_type, is_float_dtype, is_int64_dtype, | ||
is_object_dtype, is_period_dtype, is_string_dtype, is_timedelta64_dtype) | ||
from pandas.core.dtypes.dtypes import DatetimeTZDtype | ||
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries | ||
from pandas.core.dtypes.missing import isna | ||
|
@@ -206,45 +206,103 @@ def _simple_new(cls, values, freq=None, tz=None): | |
result._tz = timezones.tz_standardize(tz) | ||
return result | ||
|
||
def __new__(cls, values, freq=None, tz=None, dtype=None): | ||
def __new__(cls, values, freq=None, tz=None, dtype=None, copy=False, | ||
dayfirst=False, yearfirst=False, ambiguous='raise'): | ||
return cls._from_sequence( | ||
values, freq=freq, tz=tz, dtype=dtype, copy=copy, | ||
dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous) | ||
|
||
if freq is None and hasattr(values, "freq"): | ||
# i.e. DatetimeArray, DatetimeIndex | ||
freq = values.freq | ||
@classmethod | ||
def _from_sequence(cls, data, dtype=None, copy=False, | ||
tz=None, freq=None, | ||
dayfirst=False, yearfirst=False, ambiguous='raise'): | ||
|
||
verify_integrity = True | ||
|
||
freq, freq_infer = dtl.maybe_infer_freq(freq) | ||
if freq is None and hasattr(data, "freq"): | ||
# i.e. DatetimeArray/Index | ||
freq = data.freq | ||
verify_integrity = False | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# if dtype has an embedded tz, capture it | ||
tz = dtl.validate_tz_from_dtype(dtype, tz) | ||
|
||
if is_object_dtype(values): | ||
# kludge; dispatch until the DatetimeArray constructor is complete | ||
from pandas import DatetimeIndex | ||
values = DatetimeIndex(values, freq=freq, tz=tz) | ||
|
||
if isinstance(values, ABCSeries): | ||
# extract to ndarray or DatetimeIndex | ||
values = values._values | ||
|
||
if isinstance(values, DatetimeArrayMixin): | ||
# extract nanosecond unix timestamps | ||
if tz is None: | ||
tz = values.tz | ||
values = values.asi8 | ||
|
||
if values.dtype == 'i8': | ||
values = values.view('M8[ns]') | ||
if not hasattr(data, "dtype"): | ||
# e.g. list, tuple | ||
if np.ndim(data) == 0: | ||
# i.e. generator | ||
data = list(data) | ||
data = np.asarray(data) | ||
copy = False | ||
elif isinstance(data, ABCSeries): | ||
data = data._values | ||
|
||
# By this point we are assured to have either a numpy array or Index | ||
data, copy = maybe_convert_dtype(data, copy) | ||
|
||
if is_object_dtype(data) or is_string_dtype(data): | ||
# TODO: We do not have tests specific to string-dtypes, | ||
# also complex or categorical or other extension | ||
copy = False | ||
if lib.infer_dtype(data) == 'integer': | ||
data = data.astype(np.int64) | ||
else: | ||
# data comes back here as either i8 to denote UTC timestamps | ||
# or M8[ns] to denote wall times | ||
data, inferred_tz = objects_to_datetime64ns( | ||
data, dayfirst=dayfirst, yearfirst=yearfirst) | ||
tz = maybe_infer_tz(tz, inferred_tz) | ||
|
||
if is_datetime64tz_dtype(data): | ||
tz = maybe_infer_tz(tz, data.tz) | ||
subarr = data._data | ||
|
||
elif is_datetime64_dtype(data): | ||
# tz-naive DatetimeArray/Index or ndarray[datetime64] | ||
data = getattr(data, "_data", data) | ||
if data.dtype != _NS_DTYPE: | ||
data = conversion.ensure_datetime64ns(data) | ||
|
||
if tz is not None: | ||
# Convert tz-naive to UTC | ||
tz = timezones.maybe_get_tz(tz) | ||
data = conversion.tz_localize_to_utc(data.view('i8'), tz, | ||
ambiguous=ambiguous) | ||
data = data.view(_NS_DTYPE) | ||
|
||
assert data.dtype == _NS_DTYPE, data.dtype | ||
subarr = data | ||
|
||
assert isinstance(values, np.ndarray), type(values) | ||
assert is_datetime64_dtype(values) # not yet assured nanosecond | ||
values = conversion.ensure_datetime64ns(values, copy=False) | ||
else: | ||
# must be integer dtype otherwise | ||
# assume this data are epoch timestamps | ||
if data.dtype != _INT64_DTYPE: | ||
data = data.astype(np.int64, copy=False) | ||
subarr = data.view(_NS_DTYPE) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. might be able to pull out There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's what I tried originally, but it breaks the recently-implemented tests.arrays.test_datetimelike.test_from_array_keeps_base (#23956) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hrm sorry. FWIW, that was a hackish thing that I introduced to avoid segfaults. If the reduction code was doing the right thing, we may be able to remove that test / requirement. But I'm not sure what the right thing is. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For the purposes of this PR it sounds like the options are to either remove that test and un-indent this line, or keep this line how it is. It sounds like the first option would cause problems in your DTA branch. I don't have a strong preference here. |
||
|
||
if copy: | ||
# TODO: should this be deepcopy? | ||
subarr = subarr.copy() | ||
|
||
assert isinstance(subarr, np.ndarray), type(subarr) | ||
assert subarr.dtype == 'M8[ns]', subarr.dtype | ||
|
||
result = cls._simple_new(subarr, freq=freq, tz=tz) | ||
if dtype is not None: | ||
# TODO: can we work this into validate_tz_from_dtype? | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if not is_dtype_equal(result.dtype, dtype): | ||
# dtype must be coerced to DatetimeTZDtype above | ||
if result.tz is not None: | ||
raise ValueError("cannot localize from non-UTC data") | ||
|
||
if verify_integrity and len(result) > 0: | ||
if freq is not None and not freq_infer: | ||
cls._validate_frequency(result, freq, ambiguous=ambiguous) | ||
|
||
result = cls._simple_new(values, freq=freq, tz=tz) | ||
if freq_infer: | ||
result.freq = to_offset(result.inferred_freq) | ||
|
||
# NB: Among other things not yet ported from the DatetimeIndex | ||
# constructor, this does not call _deepcopy_if_needed | ||
return result | ||
|
||
@classmethod | ||
|
@@ -1494,7 +1552,7 @@ def maybe_convert_dtype(data, copy): | |
elif is_timedelta64_dtype(data): | ||
warnings.warn("Passing timedelta64-dtype data is deprecated, will " | ||
"raise a TypeError in a future version", | ||
FutureWarning, stacklevel=3) | ||
FutureWarning, stacklevel=4) | ||
data = data.view(_NS_DTYPE) | ||
|
||
elif is_period_dtype(data): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,24 +9,19 @@ | |
|
||
from pandas._libs import ( | ||
Timestamp, index as libindex, join as libjoin, lib, tslib as libts) | ||
from pandas._libs.tslibs import ( | ||
ccalendar, conversion, fields, parsing, timezones) | ||
from pandas._libs.tslibs import ccalendar, fields, parsing, timezones | ||
import pandas.compat as compat | ||
from pandas.util._decorators import Appender, Substitution, cache_readonly | ||
|
||
from pandas.core.dtypes.common import ( | ||
_INT64_DTYPE, _NS_DTYPE, ensure_int64, is_datetime64_dtype, | ||
is_datetime64_ns_dtype, is_datetime64tz_dtype, is_dtype_equal, is_float, | ||
is_integer, is_list_like, is_object_dtype, is_period_dtype, is_scalar, | ||
is_string_dtype, is_string_like, pandas_dtype) | ||
_NS_DTYPE, ensure_int64, is_datetime64_ns_dtype, is_dtype_equal, is_float, | ||
is_integer, is_list_like, is_period_dtype, is_scalar, is_string_like, | ||
pandas_dtype) | ||
import pandas.core.dtypes.concat as _concat | ||
from pandas.core.dtypes.generic import ABCSeries | ||
from pandas.core.dtypes.missing import isna | ||
|
||
from pandas.core.arrays import datetimelike as dtl | ||
from pandas.core.arrays.datetimes import ( | ||
DatetimeArrayMixin as DatetimeArray, _to_m8, maybe_convert_dtype, | ||
maybe_infer_tz, objects_to_datetime64ns) | ||
DatetimeArrayMixin as DatetimeArray, _to_m8) | ||
from pandas.core.base import _shared_docs | ||
import pandas.core.common as com | ||
from pandas.core.indexes.base import Index, _index_shared_docs | ||
|
@@ -49,12 +44,17 @@ def _new_DatetimeIndex(cls, d): | |
# so need to localize | ||
tz = d.pop('tz', None) | ||
|
||
with warnings.catch_warnings(): | ||
# we ignore warnings from passing verify_integrity=False | ||
# TODO: If we knew what was going in to **d, we might be able to | ||
# go through _simple_new instead | ||
warnings.simplefilter("ignore") | ||
result = cls.__new__(cls, verify_integrity=False, **d) | ||
if "data" in d and not isinstance(d["data"], DatetimeIndex): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you explain the reasoning behind this change? I don't see why just DatetimeIndex would need to have the integrity verified. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Without this we get a couple of failing pickle-based tests. tests.frame.test_block_internals.test_pickle and tests.series.test_timeseries.test_pickle There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Gotcha, I've been fighting those errors too. I suspect they were working before because DatetimeIndex accepts There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is because you need to define There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It isn't. The issue is that this function calls There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is fixed by #24096 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's fix that one first then. This needs to be changed here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Pickling turns out to be only tangentially related to the "real" problem. In the status quo, altering a datetime64tz column alters the DatetimeIndex that backs it, but doesn't set its So the goal of #24096 is to not corrupt the DatetimeIndex in the first place, making That's still pretty roundabout. Any clearer? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, that does help. |
||
# Avoid need to verify integrity by calling simple_new directly | ||
data = d.pop("data") | ||
result = cls._simple_new(data, **d) | ||
else: | ||
with warnings.catch_warnings(): | ||
# we ignore warnings from passing verify_integrity=False | ||
# TODO: If we knew what was going in to **d, we might be able to | ||
# go through _simple_new instead | ||
warnings.simplefilter("ignore") | ||
result = cls.__new__(cls, verify_integrity=False, **d) | ||
|
||
if tz is not None: | ||
result = result.tz_localize('UTC').tz_convert(tz) | ||
|
@@ -260,81 +260,12 @@ def __new__(cls, data=None, | |
if name is None and hasattr(data, 'name'): | ||
name = data.name | ||
|
||
freq, freq_infer = dtl.maybe_infer_freq(freq) | ||
if freq is None and hasattr(data, "freq"): | ||
# i.e. DatetimeArray/Index | ||
freq = data.freq | ||
verify_integrity = False | ||
|
||
# if dtype has an embedded tz, capture it | ||
tz = dtl.validate_tz_from_dtype(dtype, tz) | ||
|
||
if not hasattr(data, "dtype"): | ||
# e.g. list, tuple | ||
if np.ndim(data) == 0: | ||
# i.e. generator | ||
data = list(data) | ||
data = np.asarray(data) | ||
copy = False | ||
elif isinstance(data, ABCSeries): | ||
data = data._values | ||
|
||
# By this point we are assured to have either a numpy array or Index | ||
data, copy = maybe_convert_dtype(data, copy) | ||
|
||
if is_object_dtype(data) or is_string_dtype(data): | ||
# TODO: We do not have tests specific to string-dtypes, | ||
# also complex or categorical or other extension | ||
copy = False | ||
if lib.infer_dtype(data) == 'integer': | ||
data = data.astype(np.int64) | ||
else: | ||
# data comes back here as either i8 to denote UTC timestamps | ||
# or M8[ns] to denote wall times | ||
data, inferred_tz = objects_to_datetime64ns( | ||
data, dayfirst=dayfirst, yearfirst=yearfirst) | ||
tz = maybe_infer_tz(tz, inferred_tz) | ||
|
||
if is_datetime64tz_dtype(data): | ||
tz = maybe_infer_tz(tz, data.tz) | ||
subarr = data._data | ||
|
||
elif is_datetime64_dtype(data): | ||
# tz-naive DatetimeArray/Index or ndarray[datetime64] | ||
data = getattr(data, "_data", data) | ||
if data.dtype != _NS_DTYPE: | ||
data = conversion.ensure_datetime64ns(data) | ||
|
||
if tz is not None: | ||
# Convert tz-naive to UTC | ||
tz = timezones.maybe_get_tz(tz) | ||
data = conversion.tz_localize_to_utc(data.view('i8'), tz, | ||
ambiguous=ambiguous) | ||
subarr = data.view(_NS_DTYPE) | ||
dtarr = DatetimeArray._from_sequence( | ||
data, dtype=dtype, copy=copy, tz=tz, freq=freq, | ||
dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous) | ||
|
||
else: | ||
# must be integer dtype otherwise | ||
# assume this data are epoch timestamps | ||
if data.dtype != _INT64_DTYPE: | ||
data = data.astype(np.int64, copy=False) | ||
subarr = data.view(_NS_DTYPE) | ||
|
||
assert isinstance(subarr, np.ndarray), type(subarr) | ||
assert subarr.dtype == 'M8[ns]', subarr.dtype | ||
|
||
subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz) | ||
if dtype is not None: | ||
if not is_dtype_equal(subarr.dtype, dtype): | ||
# dtype must be coerced to DatetimeTZDtype above | ||
if subarr.tz is not None: | ||
raise ValueError("cannot localize from non-UTC data") | ||
|
||
if verify_integrity and len(subarr) > 0: | ||
if freq is not None and not freq_infer: | ||
cls._validate_frequency(subarr, freq, ambiguous=ambiguous) | ||
|
||
if freq_infer: | ||
subarr.freq = to_offset(subarr.inferred_freq) | ||
subarr = cls._simple_new(dtarr._data, name=name, | ||
freq=dtarr.freq, tz=dtarr.tz) | ||
|
||
return subarr._deepcopy_if_needed(ref_to_data, copy) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is there a reason you don't want to add
verify_integrity
here (as maybe_verify_integrity=True
)?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We've deprecated the kwarg in the DatetimeIndex constructor to get rid of it. In cases where verify_integrity is not needed, a different constructor (e.g. simple_new) should be used.