-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: Retain timezone dtype with cut and qcut #19890
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
""" | ||
Quantilization functions and related stuff | ||
""" | ||
from functools import partial | ||
|
||
from pandas.core.dtypes.missing import isna | ||
from pandas.core.dtypes.common import ( | ||
|
@@ -9,6 +10,7 @@ | |
is_categorical_dtype, | ||
is_datetime64_dtype, | ||
is_timedelta64_dtype, | ||
is_datetime64tz_dtype, | ||
_ensure_int64) | ||
|
||
import pandas.core.algorithms as algos | ||
|
@@ -239,7 +241,8 @@ def _bins_to_cuts(x, bins, right=True, labels=None, | |
ids = _ensure_int64(bins.searchsorted(x, side=side)) | ||
|
||
if include_lowest: | ||
ids[x == bins[0]] = 1 | ||
# Numpy 1.9 support: ensure this mask is a Numpy array | ||
ids[np.asarray(x == bins[0])] = 1 | ||
|
||
na_mask = isna(x) | (ids == len(bins)) | (ids == 0) | ||
has_nas = na_mask.any() | ||
|
@@ -284,7 +287,9 @@ def _coerce_to_type(x): | |
""" | ||
dtype = None | ||
|
||
if is_timedelta64_dtype(x): | ||
if is_datetime64tz_dtype(x): | ||
dtype = x.dtype | ||
elif is_timedelta64_dtype(x): | ||
x = to_timedelta(x) | ||
dtype = np.timedelta64 | ||
elif is_datetime64_dtype(x): | ||
|
@@ -305,7 +310,7 @@ def _convert_bin_to_numeric_type(bins, dtype): | |
|
||
Parameters | ||
---------- | ||
bins : list-liek of bins | ||
bins : list-like of bins | ||
dtype : dtype of data | ||
|
||
Raises | ||
|
@@ -333,7 +338,10 @@ def _format_labels(bins, precision, right=True, | |
|
||
closed = 'right' if right else 'left' | ||
|
||
if is_datetime64_dtype(dtype): | ||
if is_datetime64tz_dtype(dtype): | ||
formatter = partial(Timestamp, tz=dtype.tz) | ||
adjust = lambda x: x - Timedelta('1ns') | ||
elif is_datetime64_dtype(dtype): | ||
formatter = Timestamp | ||
adjust = lambda x: x - Timedelta('1ns') | ||
elif is_timedelta64_dtype(dtype): | ||
|
@@ -372,7 +380,11 @@ def _preprocess_for_cut(x): | |
series_index = x.index | ||
name = x.name | ||
|
||
x = np.asarray(x) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can convert it and then simply check that ndim <= 1, no? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So in that case, prefer not to do this here; rather, move any logic needed to There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Well, the main issue is that
|
||
ndim = getattr(x, 'ndim', None) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a comment here explaining what you are checking. |
||
if ndim is None: | ||
x = np.asarray(x) | ||
if x.ndim != 1: | ||
raise ValueError("Input array must be 1 dimensional") | ||
|
||
return x_is_series, series_index, name, x | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,7 @@ | |
import numpy as np | ||
from pandas.compat import zip | ||
|
||
from pandas import (Series, isna, to_datetime, DatetimeIndex, | ||
from pandas import (DataFrame, Series, isna, to_datetime, DatetimeIndex, Index, | ||
Timestamp, Interval, IntervalIndex, Categorical, | ||
cut, qcut, date_range, NaT, TimedeltaIndex) | ||
from pandas.tseries.offsets import Nano, Day | ||
|
@@ -104,6 +104,12 @@ def test_cut_corner(self): | |
|
||
pytest.raises(ValueError, cut, [1, 2, 3], 0.5) | ||
|
||
@pytest.mark.parametrize('arg', [2, np.eye(2), DataFrame(np.eye(2))]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For an existing test, make sure bins are tested (it should work, but to cover all bases) as a scalar, 0-dim ndarray, 1-d ndarray, Series, list, and Index. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Did you mean to test various bins in this specific test ( There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I mean: find a good test (or two) and make sure that we are exercising all array-likes as input. |
||
@pytest.mark.parametrize('cut_func', [cut, qcut]) | ||
def test_cut_not_1d_arg(self, arg, cut_func): | ||
with pytest.raises(ValueError): | ||
cut_func(arg, 2) | ||
|
||
def test_cut_out_of_range_more(self): | ||
# #1511 | ||
s = Series([0, -1, 0, 1, -3], name='x') | ||
|
@@ -251,18 +257,6 @@ def test_qcut_nas(self): | |
result = qcut(arr, 4) | ||
assert isna(result[:20]).all() | ||
|
||
@pytest.mark.parametrize('s', [ | ||
Series(DatetimeIndex(['20180101', NaT, '20180103'])), | ||
Series(TimedeltaIndex(['0 days', NaT, '2 days']))], | ||
ids=lambda x: str(x.dtype)) | ||
def test_qcut_nat(self, s): | ||
# GH 19768 | ||
intervals = IntervalIndex.from_tuples( | ||
[(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])]) | ||
expected = Series(Categorical(intervals, ordered=True)) | ||
result = qcut(s, 2) | ||
tm.assert_series_equal(result, expected) | ||
|
||
def test_qcut_index(self): | ||
result = qcut([0, 2], 2) | ||
intervals = [Interval(-0.001, 1), Interval(1, 2)] | ||
|
@@ -452,6 +446,37 @@ def test_single_bin(self): | |
result = cut(s, 1, labels=False) | ||
tm.assert_series_equal(result, expected) | ||
|
||
@pytest.mark.parametrize( | ||
"array_1_writeable, array_2_writeable", | ||
[(True, True), (True, False), (False, False)]) | ||
def test_cut_read_only(self, array_1_writeable, array_2_writeable): | ||
# issue 18773 | ||
array_1 = np.arange(0, 100, 10) | ||
array_1.flags.writeable = array_1_writeable | ||
|
||
array_2 = np.arange(0, 100, 10) | ||
array_2.flags.writeable = array_2_writeable | ||
|
||
hundred_elements = np.arange(100) | ||
|
||
tm.assert_categorical_equal(cut(hundred_elements, array_1), | ||
cut(hundred_elements, array_2)) | ||
|
||
|
||
class TestDatelike(object): | ||
|
||
@pytest.mark.parametrize('s', [ | ||
Series(DatetimeIndex(['20180101', NaT, '20180103'])), | ||
Series(TimedeltaIndex(['0 days', NaT, '2 days']))], | ||
ids=lambda x: str(x.dtype)) | ||
def test_qcut_nat(self, s): | ||
# GH 19768 | ||
intervals = IntervalIndex.from_tuples( | ||
[(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])]) | ||
expected = Series(Categorical(intervals, ordered=True)) | ||
result = qcut(s, 2) | ||
tm.assert_series_equal(result, expected) | ||
|
||
def test_datetime_cut(self): | ||
# GH 14714 | ||
# testing for time data to be present as series | ||
|
@@ -488,6 +513,47 @@ def test_datetime_cut(self): | |
result, bins = cut(data, 3, retbins=True) | ||
tm.assert_series_equal(Series(result), expected) | ||
|
||
@pytest.mark.parametrize('bins', [ | ||
3, [Timestamp('2013-01-01 04:57:07.200000').value, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are you using .value here? The user won't be doing that. |
||
Timestamp('2013-01-01 21:00:00').value, | ||
Timestamp('2013-01-02 13:00:00').value, | ||
Timestamp('2013-01-03 05:00:00').value]]) | ||
@pytest.mark.parametrize('const', [list, np.array, Index, Series]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Call this `box` (here and anywhere else you use it). |
||
def test_datetimetz_cut(self, bins, const): | ||
# GH 19872 | ||
tz = 'US/Eastern' | ||
s = Series(date_range('20130101', periods=3, tz=tz)) | ||
if not isinstance(bins, int): | ||
bins = const(bins) | ||
result = cut(s, bins) | ||
expected = ( | ||
Series(IntervalIndex([ | ||
Interval(Timestamp('2012-12-31 23:57:07.200000', tz=tz), | ||
Timestamp('2013-01-01 16:00:00', tz=tz)), | ||
Interval(Timestamp('2013-01-01 16:00:00', tz=tz), | ||
Timestamp('2013-01-02 08:00:00', tz=tz)), | ||
Interval(Timestamp('2013-01-02 08:00:00', tz=tz), | ||
Timestamp('2013-01-03 00:00:00', tz=tz))])) | ||
.astype(CDT(ordered=True))) | ||
tm.assert_series_equal(result, expected) | ||
|
||
@pytest.mark.parametrize('bins', [3, np.linspace(0, 1, 4)]) | ||
def test_datetimetz_qcut(self, bins): | ||
# GH 19872 | ||
tz = 'US/Eastern' | ||
s = Series(date_range('20130101', periods=3, tz=tz)) | ||
result = qcut(s, bins) | ||
expected = ( | ||
Series(IntervalIndex([ | ||
Interval(Timestamp('2012-12-31 23:59:59.999999999', tz=tz), | ||
Timestamp('2013-01-01 16:00:00', tz=tz)), | ||
Interval(Timestamp('2013-01-01 16:00:00', tz=tz), | ||
Timestamp('2013-01-02 08:00:00', tz=tz)), | ||
Interval(Timestamp('2013-01-02 08:00:00', tz=tz), | ||
Timestamp('2013-01-03 00:00:00', tz=tz))])) | ||
.astype(CDT(ordered=True))) | ||
tm.assert_series_equal(result, expected) | ||
|
||
def test_datetime_bin(self): | ||
data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')] | ||
bin_data = ['2012-12-12', '2012-12-14', '2012-12-16'] | ||
|
@@ -523,19 +589,3 @@ def f(): | |
mask = result.isna() | ||
tm.assert_numpy_array_equal( | ||
mask, np.array([False, True, True, True, True])) | ||
|
||
@pytest.mark.parametrize( | ||
"array_1_writeable, array_2_writeable", | ||
[(True, True), (True, False), (False, False)]) | ||
def test_cut_read_only(self, array_1_writeable, array_2_writeable): | ||
# issue 18773 | ||
array_1 = np.arange(0, 100, 10) | ||
array_1.flags.writeable = array_1_writeable | ||
|
||
array_2 = np.arange(0, 100, 10) | ||
array_2.flags.writeable = array_2_writeable | ||
|
||
hundred_elements = np.arange(100) | ||
|
||
tm.assert_categorical_equal(cut(hundred_elements, array_1), | ||
cut(hundred_elements, array_2)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Put the is_datetime after this case (it's below now).