Skip to content

BUG: Issues with DatetimeTZ values in where and combine_first (#21469 + #21546) #21660

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jul 2, 2018
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 1 addition & 14 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pandas.compat import long, zip, iteritems, PY36, OrderedDict
from pandas.core.config import get_option
from pandas.core.dtypes.generic import ABCSeries, ABCIndex
from pandas.core.dtypes.common import _NS_DTYPE, is_integer
from pandas.core.dtypes.common import is_integer
from pandas.core.dtypes.inference import _iterable_not_string
from pandas.core.dtypes.missing import isna, isnull, notnull # noqa
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
Expand Down Expand Up @@ -410,19 +410,6 @@ def _apply_if_callable(maybe_callable, obj, **kwargs):
return maybe_callable


def _where_compat(mask, arr1, arr2):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice!

if arr1.dtype == _NS_DTYPE and arr2.dtype == _NS_DTYPE:
new_vals = np.where(mask, arr1.view('i8'), arr2.view('i8'))
return new_vals.view(_NS_DTYPE)

if arr1.dtype == _NS_DTYPE:
arr1 = tslib.ints_to_pydatetime(arr1.view('i8'))
if arr2.dtype == _NS_DTYPE:
arr2 = tslib.ints_to_pydatetime(arr2.view('i8'))

return np.where(mask, arr1, arr2)


def _dict_compat(d):
"""
Helper function to convert datetimelike-keyed dicts to Timestamp-keyed dict
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1476,14 +1476,16 @@ def where(self, other, cond, align=True, errors='raise',
if transpose:
values = values.T

other = getattr(other, 'values', other)
other = getattr(other, '_values', getattr(other, 'values', other))
cond = getattr(cond, 'values', cond)

# If the default broadcasting would go in the wrong direction, then
# explicitly reshape other instead
if getattr(other, 'ndim', 0) >= 1:
if values.ndim - 1 == other.ndim and axis == 1:
other = other.reshape(tuple(other.shape + (1, )))
elif transpose and values.ndim == self.ndim - 1:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we really need this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without lines 1487-1488 cond and values have opposite dimension, which is the root cause of (the DataFrame part of) #21544.

The issue is DatetimeTZBlock._try_coerce_args returns values as a 2-D array. I've updated _try_coerce_args so that other also is a 2-D array for consistency, which fixes part of the issue.

However, we don't run _try_coerce_args until the interior of func at 1499. Thus lines 1476-1477 (values = values.T) is a no-op given a DatetimeTZBlock (since at that point values is 1-D). other._values is also a 1-D DatetimeIndex. So to make cond consistent with the values and other we get out of _try_coerce_args it needs to be transposed.

An alternative would be changing `DatetimeTZBlock._try_coerce_args to return 1-D arrays for values and values_mask (changing 2875-2877). But if I do that it breaks setitem.

cond = cond.T

if not hasattr(cond, 'shape'):
raise ValueError("where must have a condition that is ndarray "
Expand Down Expand Up @@ -2888,8 +2890,8 @@ def _try_coerce_args(self, values, other):
elif isinstance(other, self._holder):
if other.tz != self.values.tz:
raise ValueError("incompatible or non tz-aware value")
other = other.asi8
other_mask = isna(other)
other_mask = _block_shape(isna(other), ndim=self.ndim)
other = _block_shape(other.asi8, ndim=self.ndim)
elif isinstance(other, (np.datetime64, datetime, date)):
other = tslib.Timestamp(other)
tz = getattr(other, 'tz', None)
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
is_float_dtype,
is_extension_type,
is_extension_array_dtype,
is_datetimelike,
is_datetime64tz_dtype,
is_timedelta64_dtype,
is_object_dtype,
Expand Down Expand Up @@ -78,6 +79,7 @@
from pandas._libs import index as libindex, tslib as libts, lib, iNaT
from pandas.core.config import get_option
from pandas.core.strings import StringMethods
from pandas.core.tools.datetimes import to_datetime

import pandas.plotting._core as gfx

Expand Down Expand Up @@ -2303,10 +2305,12 @@ def combine_first(self, other):
new_index = self.index.union(other.index)
this = self.reindex(new_index, copy=False)
other = other.reindex(new_index, copy=False)
if is_datetimelike(this) and not is_datetimelike(other):
other = to_datetime(other)

# TODO: do we need name?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this comment still relevant?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment no longer relevant, I'll remove.

What do you think of datetime coercion of other? As I noted my (too long) comment above, using to_datetime coerces ints as well as strings. Old code used the Series constructor which only coerced strings but not ints. Alternatively, we could remove the datetimelike coercion entirely.

name = ops.get_op_result_name(self, other) # noqa
rs_vals = com._where_compat(isna(this), other._values, this._values)
return self._constructor(rs_vals, index=new_index).__finalize__(self)
return this.where(notna(this), other)

def update(self, other):
"""
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/frame/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2936,6 +2936,20 @@ def test_where_callable(self):
tm.assert_frame_equal(result,
(df + 2).where((df + 2) > 8, (df + 2) + 10))

def test_where_tz_values(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you use the tz_naive_fixture here

dts1 = date_range('20150101', '20150105', tz='America/New_York')
df1 = DataFrame({'date': dts1})
dts2 = date_range('20150103', '20150107', tz='America/New_York')
df2 = DataFrame({'date': dts2})
mask = DataFrame(True, index=df1.index, columns=df2.columns)
mask.iloc[3:] = False
result = df1.where(mask, df2)
dts3 = DatetimeIndex(['20150101', '20150102', '20150103',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ideally just contruct what you can w/o using temporaries.

ideally call this expected

'20150106', '20150107'],
tz='America/New_York')
exp = DataFrame({'date': dts3})
assert_frame_equal(exp, result)

def test_mask(self):
df = DataFrame(np.random.randn(5, 3))
cond = df > 0
Expand Down
9 changes: 4 additions & 5 deletions pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,12 +580,11 @@ def test_where_series_datetime64(self, fill_val, exp_dtype):
values = pd.Series(pd.date_range(fill_val, periods=4))
if fill_val.tz:
exp = pd.Series([pd.Timestamp('2011-01-01'),
pd.Timestamp('2012-01-02 05:00'),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what changed here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previous behavior was was treating other for DatetimeTZ as ints (see the writeup for #21546). As a result was getting incorrectly merged with naive datetime Series as naive UTC timestamps. Fixing the behavior of tz-aware other also fixed this and so now the result of combine_first is a promotion of the whole Series to object (as was desired per the deleted xfail/TODO).

pd.Timestamp('2012-01-02 00:00', tz='US/Eastern'),
pd.Timestamp('2011-01-03'),
pd.Timestamp('2012-01-04 05:00')])
self._assert_where_conversion(obj, cond, values, exp,
'datetime64[ns]')
pytest.xfail("ToDo: do not coerce to UTC, must be object")
pd.Timestamp('2012-01-04 00:00',
tz='US/Eastern')])
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)

exp = pd.Series([pd.Timestamp('2011-01-01'), values[1],
pd.Timestamp('2011-01-03'), values[3]])
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/series/indexing/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,19 @@ def test_where_datetime_conversion():
assert_series_equal(rs, expected)


def test_where_dt_tz_values():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you use the tz_naive_fixture here

dts1 = pd.date_range('20150101', '20150105', tz='America/New_York')
df1 = pd.DataFrame({'date': dts1})
dts2 = pd.date_range('20150103', '20150107', tz='America/New_York')
df2 = pd.DataFrame({'date': dts2})
result = df1.date.where(df1.date < df1.date[3], df2.date)
exp_vals = pd.DatetimeIndex(['20150101', '20150102', '20150103',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same comment as above

'20150106', '20150107'],
tz='America/New_York')
exp = Series(exp_vals, name='date')
assert_series_equal(exp, result)


def test_mask():
# compare with tested results in test_where
s = Series(np.random.randn(5))
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/series/test_combine_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,19 @@ def get_result_type(dtype, dtype2):
]).dtype
assert result.kind == expected

def test_combine_first_dt_tz_values(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you use the tz_naive_fixture here

dts1 = pd.date_range('20150101', '20150105', tz='America/New_York')
df1 = pd.DataFrame({'date': dts1})
dts2 = pd.date_range('20160514', '20160518', tz='America/New_York')
df2 = pd.DataFrame({'date': dts2}, index=range(3, 8))
result = df1.date.combine_first(df2.date)
exp_vals = pd.DatetimeIndex(['20150101', '20150102', '20150103',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

'20150104', '20150105', '20160516',
'20160517', '20160518'],
tz='America/New_York')
exp = pd.Series(exp_vals, name='date')
assert_series_equal(exp, result)

def test_concat_empty_series_dtypes(self):

# booleans
Expand Down