Skip to content

PERF: improve DTI string parse #13692

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,7 @@ Performance Improvements
- Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`)
- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
- Improved performance of ``Index.difference`` (:issue:`12044`)
- Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`)

.. _whatsnew_0190.bug_fixes:

Expand Down Expand Up @@ -631,6 +632,7 @@ Bug Fixes
- Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`)



- Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`)


Expand All @@ -654,6 +656,8 @@ Bug Fixes

- Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`)
- Bug in invalid ``Timedelta`` arithmetic and comparison may raise ``ValueError`` rather than ``TypeError`` (:issue:`13624`)
- Bug in invalid datetime parsing in ``to_datetime`` and ``DatetimeIndex`` may raise ``TypeError`` rather than ``ValueError`` (:issue:`11169`, :issue:`11287`)
- Bug in ``Index`` created with tz-aware ``Timestamp`` and mismatched ``tz`` option incorrectly coerces timezone (:issue:`13692`)

- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
- Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`)
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2440,7 +2440,7 @@ def converter(*date_cols):
strs = _concat_date_cols(date_cols)

try:
return tools._to_datetime(
return tools.to_datetime(
_ensure_object(strs),
utc=None,
box=False,
Expand Down
31 changes: 9 additions & 22 deletions pandas/tests/indexes/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,16 +170,6 @@ def test_construction_index_with_mixed_timezones(self):
self.assert_index_equal(result, exp, exact=True)
self.assertFalse(isinstance(result, DatetimeIndex))

# passing tz results in DatetimeIndex
result = Index([Timestamp('2011-01-01 10:00'),
Timestamp('2011-01-02 10:00', tz='US/Eastern')],
tz='Asia/Tokyo', name='idx')
exp = DatetimeIndex([Timestamp('2011-01-01 19:00'),
Timestamp('2011-01-03 00:00')],
tz='Asia/Tokyo', name='idx')
self.assert_index_equal(result, exp, exact=True)
self.assertTrue(isinstance(result, DatetimeIndex))

# length = 1
result = Index([Timestamp('2011-01-01')], name='idx')
exp = DatetimeIndex([Timestamp('2011-01-01')], name='idx')
Expand Down Expand Up @@ -253,17 +243,6 @@ def test_construction_index_with_mixed_timezones_with_NaT(self):
self.assert_index_equal(result, exp, exact=True)
self.assertFalse(isinstance(result, DatetimeIndex))

# passing tz results in DatetimeIndex
result = Index([pd.NaT, Timestamp('2011-01-01 10:00'),
pd.NaT, Timestamp('2011-01-02 10:00',
tz='US/Eastern')],
tz='Asia/Tokyo', name='idx')
exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 19:00'),
pd.NaT, Timestamp('2011-01-03 00:00')],
tz='Asia/Tokyo', name='idx')
self.assert_index_equal(result, exp, exact=True)
self.assertTrue(isinstance(result, DatetimeIndex))

# all NaT
result = Index([pd.NaT, pd.NaT], name='idx')
exp = DatetimeIndex([pd.NaT, pd.NaT], name='idx')
Expand Down Expand Up @@ -323,12 +302,13 @@ def test_construction_dti_with_mixed_timezones(self):
self.assertTrue(isinstance(result, DatetimeIndex))

# tz mismatch affecting tz-aware data raises TypeError/ValueError

with tm.assertRaises(ValueError):
DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'),
Timestamp('2011-01-02 10:00', tz='US/Eastern')],
name='idx')

with tm.assertRaises(TypeError):
with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'):
DatetimeIndex([Timestamp('2011-01-01 10:00'),
Timestamp('2011-01-02 10:00', tz='US/Eastern')],
tz='Asia/Tokyo', name='idx')
Expand All @@ -338,6 +318,13 @@ def test_construction_dti_with_mixed_timezones(self):
Timestamp('2011-01-02 10:00', tz='US/Eastern')],
tz='US/Eastern', name='idx')

with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'):
# passing tz should result in DatetimeIndex, then mismatch raises
# TypeError
Index([pd.NaT, Timestamp('2011-01-01 10:00'),
pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')],
tz='Asia/Tokyo', name='idx')

def test_construction_base_constructor(self):
arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')]
tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr))
Expand Down
93 changes: 17 additions & 76 deletions pandas/tseries/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,55 +292,32 @@ def __new__(cls, data=None,
raise ValueError('DatetimeIndex() must be called with a '
'collection of some kind, %s was passed'
% repr(data))

# other iterable of some kind
if not isinstance(data, (list, tuple)):
data = list(data)

data = np.asarray(data, dtype='O')
elif isinstance(data, ABCSeries):
data = data._values

# try a few ways to make it datetime64
if lib.is_string_array(data):
data = tslib.parse_str_array_to_datetime(data, freq=freq,
dayfirst=dayfirst,
yearfirst=yearfirst)
else:
data = tools.to_datetime(data, errors='raise')
data.offset = freq
if isinstance(data, DatetimeIndex):
if name is not None:
data.name = name

if tz is not None:

# we might already be localized to this tz
# so passing the same tz is ok
# however any other tz is a no-no
if data.tz is None:
return data.tz_localize(tz, ambiguous=ambiguous)
elif str(tz) != str(data.tz):
raise TypeError("Already tz-aware, use tz_convert "
"to convert.")

return data._deepcopy_if_needed(ref_to_data, copy)

if issubclass(data.dtype.type, compat.string_types):
data = tslib.parse_str_array_to_datetime(data, freq=freq,
dayfirst=dayfirst,
yearfirst=yearfirst)
# data must be Index or np.ndarray here
if not (is_datetime64_dtype(data) or is_datetimetz(data) or
is_integer_dtype(data)):
data = tools.to_datetime(data, dayfirst=dayfirst,
yearfirst=yearfirst)

if issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

any reason not to use is_datetime64_dtype here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In addition to normal datetime64, datetime64tz (DatetimeIndex) and int can be directly converted to DatetimeIndex.

if isinstance(data, ABCSeries):
data = data._values

if isinstance(data, DatetimeIndex):
if tz is None:
tz = data.tz

elif data.tz is None:
data = data.tz_localize(tz, ambiguous=ambiguous)
else:
# the tz's must match
if str(tz) != str(data.tz):
raise TypeError("Already tz-aware, use tz_convert "
"to convert.")
msg = ('data is already tz-aware {0}, unable to '
'set specified tz: {1}')
raise TypeError(msg.format(data.tz, tz))

subarr = data.values

Expand All @@ -356,35 +333,6 @@ def __new__(cls, data=None,
if isinstance(data, Int64Index):
raise TypeError('cannot convert Int64Index->DatetimeIndex')
subarr = data.view(_NS_DTYPE)
else:
if isinstance(data, (ABCSeries, Index)):
values = data._values
else:
values = data

if lib.is_string_array(values):
subarr = tslib.parse_str_array_to_datetime(
values, freq=freq, dayfirst=dayfirst, yearfirst=yearfirst)
else:
try:
subarr = tools.to_datetime(data, box=False)

# make sure that we have a index/ndarray like (and not a
# Series)
if isinstance(subarr, ABCSeries):
subarr = subarr._values
if subarr.dtype == np.object_:
subarr = tools._to_datetime(subarr, box=False)

except ValueError:
# tz aware
subarr = tools._to_datetime(data, box=False, utc=True)

# we may not have been able to convert
if not (is_datetimetz(subarr) or
np.issubdtype(subarr.dtype, np.datetime64)):
raise ValueError('Unable to convert %s to datetime dtype'
% str(data))

if isinstance(subarr, DatetimeIndex):
if tz is None:
Expand All @@ -399,27 +347,21 @@ def __new__(cls, data=None,
ints = subarr.view('i8')
subarr = tslib.tz_localize_to_utc(ints, tz,
ambiguous=ambiguous)

subarr = subarr.view(_NS_DTYPE)

subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz)

# if dtype is provided, coerce here
if dtype is not None:

if not is_dtype_equal(subarr.dtype, dtype):

# dtype must be coerced to DatetimeTZDtype above
if subarr.tz is not None:
raise ValueError("cannot localize from non-UTC data")
dtype = DatetimeTZDtype.construct_from_string(dtype)
subarr = subarr.tz_localize(dtype.tz)

if verify_integrity and len(subarr) > 0:
if freq is not None and not freq_infer:
inferred = subarr.inferred_freq
if inferred != freq.freqstr:
on_freq = cls._generate(subarr[0], None, len(
subarr), None, freq, tz=tz, ambiguous=ambiguous)
on_freq = cls._generate(subarr[0], None, len(subarr), None,
freq, tz=tz, ambiguous=ambiguous)
if not np.array_equal(subarr.asi8, on_freq.asi8):
raise ValueError('Inferred frequency {0} from passed '
'dates does not conform to passed '
Expand Down Expand Up @@ -563,7 +505,6 @@ def _generate(cls, start, end, periods, name, offset,
index = index[1:]
if not right_closed and len(index) and index[-1] == end:
index = index[:-1]

index = cls._simple_new(index, name=name, freq=offset, tz=tz)
return index

Expand Down Expand Up @@ -669,7 +610,7 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None,
xdr = generate_range(offset=offset, start=_CACHE_START,
end=_CACHE_END)

arr = tools._to_datetime(list(xdr), box=False)
arr = tools.to_datetime(list(xdr), box=False)

cachedRange = DatetimeIndex._simple_new(arr)
cachedRange.offset = offset
Expand Down
7 changes: 6 additions & 1 deletion pandas/tseries/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -1046,7 +1046,12 @@ def _get_binner_for_grouping(self, obj):
l = []
for key, group in grouper.get_iterator(self.ax):
l.extend([key] * len(group))
grouper = binner.__class__(l, freq=binner.freq, name=binner.name)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe isolate this a bit with some functions like _get_binner_for_resample does (maybe we should abstract this out even a bit more and have some PeriodIndexGrouper, DatetimeinexGrouper, which subclass TimeGrouper, but this might require some effort).

if isinstance(self.ax, PeriodIndex):
grouper = binner.__class__(l, freq=binner.freq, name=binner.name)
else:
# resampling causes duplicated values, specifying freq is invalid
grouper = binner.__class__(l, name=binner.name)

# since we may have had to sort
# may need to reorder groups here
Expand Down
5 changes: 3 additions & 2 deletions pandas/tseries/tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4087,8 +4087,9 @@ def test_dti_set_index_reindex(self):

# 11314
# with tz
index = date_range(datetime(2015, 10, 1), datetime(
2015, 10, 1, 23), freq='H', tz='US/Eastern')
index = date_range(datetime(2015, 10, 1),
datetime(2015, 10, 1, 23),
freq='H', tz='US/Eastern')
df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index)
new_index = date_range(datetime(2015, 10, 2),
datetime(2015, 10, 2, 23),
Expand Down
22 changes: 13 additions & 9 deletions pandas/tseries/tests/test_tslib.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
import datetime

import pandas as pd
from pandas.core.api import Timestamp, Series, Timedelta, Period, to_datetime
from pandas.core.api import (Timestamp, Index, Series, Timedelta, Period,
to_datetime)
from pandas.tslib import get_timezone
from pandas._period import period_asfreq, period_ordinal
from pandas.tseries.index import date_range, DatetimeIndex
Expand Down Expand Up @@ -698,14 +699,19 @@ def test_parsers(self):
yearfirst=yearfirst)
result2 = to_datetime(date_str, yearfirst=yearfirst)
result3 = to_datetime([date_str], yearfirst=yearfirst)
# result5 is used below
result4 = to_datetime(np.array([date_str], dtype=object),
yearfirst=yearfirst)
result6 = DatetimeIndex([date_str], yearfirst=yearfirst)[0]
self.assertEqual(result1, expected)
self.assertEqual(result2, expected)
self.assertEqual(result3, expected)
self.assertEqual(result4, expected)
self.assertEqual(result6, expected)
result6 = DatetimeIndex([date_str], yearfirst=yearfirst)
# result7 is used below
result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst)
result9 = DatetimeIndex(Series([date_str]), yearfirst=yearfirst)

for res in [result1, result2]:
self.assertEqual(res, expected)
for res in [result3, result4, result6, result8, result9]:
exp = DatetimeIndex([pd.Timestamp(expected)])
tm.assert_index_equal(res, exp)

# these really need to have yearfirst, but we don't support
if not yearfirst:
Expand Down Expand Up @@ -893,9 +899,7 @@ def test_parsers_monthfreq(self):

for date_str, expected in compat.iteritems(cases):
result1, _, _ = tools.parse_time_string(date_str, freq='M')
result2 = tools._to_datetime(date_str, freq='M')
self.assertEqual(result1, expected)
self.assertEqual(result2, expected)

def test_parsers_quarterly_with_freq(self):
msg = ('Incorrect quarterly string is given, quarter '
Expand Down
Loading