Skip to content

[ENH] Move intersection functions for DatetimeIndex and TimedeltaIndex to Datetimelike and added new tests #25913

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Apr 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Other Enhancements
- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
- :meth:`RangeIndex.union` now supports the ``sort`` argument. If ``sort=False`` an unsorted ``Int64Index`` is always returned. ``sort=None`` is the default and returns a monotonically increasing ``RangeIndex`` if possible or a sorted ``Int64Index`` if not (:issue:`24471`)
- :meth:`TimedeltaIndex.intersection` now also supports the ``sort`` keyword (:issue:`24471`)
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
Expand Down Expand Up @@ -275,9 +276,9 @@ Datetimelike
Timedelta
^^^^^^^^^

- Bug in :meth:`TimedeltaIndex.intersection` where for non-monotonic indices in some cases an empty ``Index`` was returned when in fact an intersection existed (:issue:`25913`)
- Bug with comparisons between :class:`Timedelta` and ``NaT`` raising ``TypeError`` (:issue:`26039`)
-
-

Timezones
^^^^^^^^^
Expand Down
57 changes: 57 additions & 0 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from pandas.core.tools.timedeltas import to_timedelta

import pandas.io.formats.printing as printing
from pandas.tseries.frequencies import to_offset

_index_doc_kwargs = dict(ibase._index_doc_kwargs)

Expand Down Expand Up @@ -529,6 +530,62 @@ def isin(self, values):

return algorithms.isin(self.asi8, values.asi8)

def intersection(self, other, sort=False):
    """
    Form the intersection of this index with ``other``.

    Parameters
    ----------
    other : index of the same type as self, or array-like
    sort : False or None, default False
        Validated here and forwarded to ``Index.intersection`` whenever
        the generic implementation is used.

    Returns
    -------
    Index
        Either the result of ``Index.intersection`` (with ``freq``
        re-inferred where possible) or, when both operands share an
        anchored frequency and are monotonic, a slice of one operand.
    """
    self._validate_sort_keyword(sort)
    self._assert_can_do_setop(other)

    if self.equals(other):
        return self._get_reconciled_name_object(other)

    # An empty operand means an empty intersection; hand back a copy so
    # the caller never receives one of the input objects itself.
    if len(self) == 0:
        return self.copy()
    if len(other) == 0:
        return other.copy()

    if not isinstance(other, type(self)):
        result = Index.intersection(self, other, sort=sort)
        if isinstance(result, type(self)) and result.freq is None:
            result.freq = to_offset(result.inferred_freq)
        return result

    use_generic_path = (
        other.freq is None or self.freq is None or
        other.freq != self.freq or
        not other.freq.isAnchored() or
        not (self.is_monotonic and other.is_monotonic))
    if use_generic_path:
        generic = Index.intersection(self, other, sort=sort)

        # Whatever freq the generic result carries may not match its
        # values, so drop it before rebuilding the index.
        generic.freq = None
        copy_kwargs = {'name': generic.name, 'freq': None}
        if hasattr(self, 'tz'):
            copy_kwargs['tz'] = generic.tz
        result = self._shallow_copy(generic._values, **copy_kwargs)
        if result.freq is None:
            result.freq = to_offset(result.inferred_freq)
        return result

    # Order the operands by their first element so that ``left`` is the
    # one that starts earliest.
    if self[0] <= other[0]:
        left, right = self, other
    else:
        left, right = other, self

    # With that ordering the overlap begins at the first element of
    # ``right`` and stops at whichever last element is smaller.
    start = right[0]
    end = min(left[-1], right[-1])

    if end < start:
        return type(self)(data=[])

    bounds = slice(*left.slice_locs(start, end))
    return self._shallow_copy(left.values[bounds])

@Appender(_index_shared_docs['repeat'] % _index_doc_kwargs)
def repeat(self, repeats, axis=None):
nv.validate_repeat(tuple(), dict(axis=axis))
Expand Down
63 changes: 7 additions & 56 deletions pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,14 +607,10 @@ def _fast_union(self, other, sort=None):
else:
return left

def _wrap_setop_result(self, other, result):
name = get_op_result_name(self, other)
return self._shallow_copy(result, name=name, freq=None, tz=self.tz)

def intersection(self, other, sort=False):
"""
Specialized intersection for DatetimeIndex objects. May be much faster
than Index.intersection
Specialized intersection for DatetimeIndex objects.
May be much faster than Index.intersection

Parameters
----------
Expand All @@ -631,58 +627,13 @@ def intersection(self, other, sort=False):

Returns
-------
y : Index or DatetimeIndex
y : Index or DatetimeIndex or TimedeltaIndex
"""
self._validate_sort_keyword(sort)
self._assert_can_do_setop(other)

if self.equals(other):
return self._get_reconciled_name_object(other)

if not isinstance(other, DatetimeIndex):
try:
other = DatetimeIndex(other)
except (TypeError, ValueError):
pass
result = Index.intersection(self, other, sort=sort)
if isinstance(result, DatetimeIndex):
if result.freq is None:
result.freq = to_offset(result.inferred_freq)
return result

elif (other.freq is None or self.freq is None or
other.freq != self.freq or
not other.freq.isAnchored() or
(not self.is_monotonic or not other.is_monotonic)):
result = Index.intersection(self, other, sort=sort)
# Invalidate the freq of `result`, which may not be correct at
# this point, depending on the values.
result.freq = None
result = self._shallow_copy(result._values, name=result.name,
tz=result.tz, freq=None)
if result.freq is None:
result.freq = to_offset(result.inferred_freq)
return result

if len(self) == 0:
return self
if len(other) == 0:
return other
# to make our life easier, "sort" the two ranges
if self[0] <= other[0]:
left, right = self, other
else:
left, right = other, self

end = min(left[-1], right[-1])
start = right[0]
return super(DatetimeIndex, self).intersection(other, sort=sort)

if end < start:
return type(self)(data=[])
else:
lslice = slice(*left.slice_locs(start, end))
left_chunk = left.values[lslice]
return self._shallow_copy(left_chunk)
def _wrap_setop_result(self, other, result):
name = get_op_result_name(self, other)
return self._shallow_copy(result, name=name, freq=None, tz=self.tz)

# --------------------------------------------------------------------

Expand Down
4 changes: 4 additions & 0 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -800,6 +800,10 @@ def join(self, other, how='left', level=None, return_indexers=False,
return self._apply_meta(result), lidx, ridx
return self._apply_meta(result)

@Appender(Index.intersection.__doc__)
def intersection(self, other, sort=False):
    # Call the generic Index implementation explicitly.
    # NOTE(review): presumably this keeps the class from picking up a
    # specialised datetime-like intersection via inheritance; confirm
    # against the class hierarchy.
    result = Index.intersection(self, other, sort=sort)
    return result

def _assert_can_do_setop(self, other):
super()._assert_can_do_setop(other)

Expand Down
74 changes: 28 additions & 46 deletions pandas/core/indexes/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,34 @@ def join(self, other, how='left', level=None, return_indexers=False,
return_indexers=return_indexers,
sort=sort)

def intersection(self, other, sort=False):
    """
    Specialized intersection for TimedeltaIndex objects.
    May be much faster than Index.intersection

    Parameters
    ----------
    other : TimedeltaIndex or array-like
    sort : False or None, default False
        Sort the resulting index if possible.

        .. versionadded:: 0.25.0

    Returns
    -------
    y : Index or TimedeltaIndex
    """
    # All of the work happens in the parent class; this override only
    # supplies the TimedeltaIndex-specific documentation.
    return super(TimedeltaIndex, self).intersection(other, sort=sort)

def _wrap_joined_index(self, joined, other):
name = get_op_result_name(self, other)
if (isinstance(other, TimedeltaIndex) and self.freq == other.freq and
Expand Down Expand Up @@ -440,52 +468,6 @@ def _fast_union(self, other):
else:
return left

def intersection(self, other):
    """
    Specialized intersection for TimedeltaIndex objects.

    Can be considerably faster than ``Index.intersection``.

    Parameters
    ----------
    other : TimedeltaIndex or array-like

    Returns
    -------
    y : Index or TimedeltaIndex
    """
    self._assert_can_do_setop(other)

    if self.equals(other):
        return self._get_reconciled_name_object(other)

    if not isinstance(other, TimedeltaIndex):
        # Best-effort coercion: if ``other`` cannot be converted it is
        # passed to the generic implementation unchanged.
        try:
            other = TimedeltaIndex(other)
        except (TypeError, ValueError):
            pass
        return Index.intersection(self, other)

    if len(self) == 0:
        return self
    if len(other) == 0:
        return other

    # Order the operands by their first element so that ``left`` is the
    # one that starts earliest.
    # NOTE(review): the first/last elements are used as range bounds
    # below, which looks like it assumes sorted inputs; confirm.
    left, right = (self, other) if self[0] <= other[0] else (other, self)

    start = right[0]
    end = min(left[-1], right[-1])

    if end < start:
        return type(self)(data=[])

    bounds = slice(*left.slice_locs(start, end))
    return self._shallow_copy(left.values[bounds])

def _maybe_promote(self, other):
if other.inferred_type == 'timedelta':
other = TimedeltaIndex(other)
Expand Down
94 changes: 94 additions & 0 deletions pandas/tests/indexes/timedeltas/test_setops.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import numpy as np
import pytest

import pandas as pd
from pandas import Int64Index, TimedeltaIndex, timedelta_range
import pandas.util.testing as tm

from pandas.tseries.offsets import Hour


class TestTimedeltaIndex:

Expand Down Expand Up @@ -73,3 +76,94 @@ def test_intersection_bug_1708(self):
result = index_1 & index_2
expected = timedelta_range('1 day 01:00:00', periods=3, freq='h')
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize("sort", [None, False])
def test_intersection_equal(self, sort):
    # GH 24471: check the intersection of two equal indexes for each
    # value of the sort keyword; the contents must match the input
    left = timedelta_range('1 day', periods=4, freq='h')
    right = timedelta_range('1 day', periods=4, freq='h')
    result = left.intersection(right, sort=sort)
    if sort is None:
        tm.assert_index_equal(result, right.sort_values())
    assert tm.equalContents(result, right)

    # Corner case: intersecting an index with itself returns that object
    self_result = left.intersection(left, sort=sort)
    assert self_result is left

@pytest.mark.parametrize("period_1, period_2", [(0, 4), (4, 0)])
@pytest.mark.parametrize("sort", [None, False])
def test_intersection_zero_length(self, period_1, period_2, sort):
    # GH 24471: if either operand has zero length, the intersection
    # must have zero length as well
    empty = timedelta_range('1 day', periods=0, freq='h')
    lhs = timedelta_range('1 day', periods=period_1, freq='h')
    rhs = timedelta_range('1 day', periods=period_2, freq='h')
    tm.assert_index_equal(lhs.intersection(rhs, sort=sort), empty)

@pytest.mark.parametrize('sort', [None, False])
def test_zero_length_input_index(self, sort):
    # GH 24966: a zero-length intersection is returned as a copy and is
    # never one of the input objects
    empty = timedelta_range('1 day', periods=0, freq='h')
    non_empty = timedelta_range('1 day', periods=3, freq='h')
    result = empty.intersection(non_empty, sort=sort)
    for operand in (empty, non_empty):
        assert operand is not result
    tm.assert_copy(result, empty)

@pytest.mark.parametrize(
    "rng, expected",
    [
        # same name on both sides: the name is kept
        (timedelta_range('1 day', periods=5, freq='h', name='idx'),
         timedelta_range('1 day', periods=4, freq='h', name='idx')),
        # different names: the result name is reset to None
        (timedelta_range('1 day', periods=5, freq='h', name='other'),
         timedelta_range('1 day', periods=4, freq='h', name=None)),
        # no overlap at all: the result is an empty index
        (timedelta_range('1 day', periods=10, freq='h', name='idx')[5:],
         TimedeltaIndex([], name='idx'))])
@pytest.mark.parametrize("sort", [None, False])
def test_intersection(self, rng, expected, sort):
    # GH 4690
    base = timedelta_range('1 day', periods=4, freq='h', name='idx')
    result = base.intersection(rng, sort=sort)
    if sort is None:
        expected = expected.sort_values()
    tm.assert_index_equal(result, expected)
    # name and freq are asserted explicitly, in addition to the index
    # equality check above
    assert result.name == expected.name
    assert result.freq == expected.freq

@pytest.mark.parametrize(
    "rng, expected",
    [
        # partial overlap
        (TimedeltaIndex(['5 hour', '2 hour', '4 hour', '9 hour'],
                        name='idx'),
         TimedeltaIndex(['2 hour', '4 hour'], name='idx')),
        # partial overlap, reordered and with a duplicate
        (TimedeltaIndex(['2 hour', '5 hour', '5 hour', '1 hour'],
                        name='other'),
         TimedeltaIndex(['1 hour', '2 hour'], name=None)),
        # the base index in reversed order
        (TimedeltaIndex(['1 hour', '2 hour', '4 hour', '3 hour'],
                        name='idx')[::-1],
         TimedeltaIndex(['1 hour', '2 hour', '4 hour', '3 hour'],
                        name='idx'))])
@pytest.mark.parametrize("sort", [None, False])
def test_intersection_non_monotonic(self, rng, expected, sort):
    # GH 24471: intersection with non-monotonic operands
    base = TimedeltaIndex(['1 hour', '2 hour', '4 hour', '3 hour'],
                          name='idx')
    result = base.intersection(rng, sort=sort)
    if sort is None:
        expected = expected.sort_values()
    tm.assert_index_equal(result, expected)
    assert result.name == expected.name

    # When ``rng`` is ``base`` reversed and the result is sorted, the
    # hourly frequency is expected; otherwise no freq is expected.
    rng_is_reversed_base = all(base == rng[::-1])
    if rng_is_reversed_base and sort is None:
        assert isinstance(result.freq, Hour)
    else:
        assert result.freq is None