Skip to content

BUG: Fix overflow bugs in date_Range #24255

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Dec 23, 2018
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 95 additions & 2 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1747,7 +1747,8 @@ def _generate_regular_range(cls, start, end, periods, freq):
return data


def _generate_range_overflow_safe(endpoint, periods, stride, side='start'):
def _generate_range_overflow_safe(endpoint, periods, stride,
side='start'):
"""
Calculate the second endpoint for passing to np.arange, checking
to avoid an integer overflow. Catch OverflowError and re-raise
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you doc-string this a bit more (say what the parameters mean)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure

Expand All @@ -1770,12 +1771,78 @@ def _generate_range_overflow_safe(endpoint, periods, stride, side='start'):
"""
# GH#14187 raise instead of incorrectly wrapping around
assert side in ['start', 'end']

i64max = np.uint64(np.iinfo(np.int64).max)
msg = ('Cannot generate range with {side}={endpoint} and '
'periods={periods}'
.format(side=side, endpoint=endpoint, periods=periods))

with np.errstate(over="raise"):
# if periods * strides cannot be multiplied within the *uint64* bounds,
# we cannot salvage the operation by recursing, so raise
try:
addend = np.uint64(periods) * np.uint64(np.abs(stride))
except FloatingPointError:
raise tslib.OutOfBoundsDatetime(msg)

if np.abs(addend) <= i64max:
# relatively easy case without casting concerns
return _generate_range_overflow_safe_signed(
endpoint, periods, stride, side)

elif ((endpoint > 0 and side == 'start') or
(endpoint < 0 and side == 'end')):
# no chance of not-overflowing
raise tslib.OutOfBoundsDatetime(msg)

elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max):
# in _generate_regular_range we added `stride` thereby overflowing
# the bounds. Adjust to fix this.
return _generate_range_overflow_safe(endpoint - stride,
periods - 1, stride, side)

# split into smaller pieces
return _generate_range_recurse(endpoint, periods, stride, side)


def _generate_range_overflow_safe_signed(endpoint, periods, stride, side):
"""
A special case for _generate_range_overflow_safe where `periods * stride`
can be calculated without overflowing int64 bounds.
"""
assert side in ['start', 'end']
if side == 'end':
stride *= -1

with np.errstate(over="raise"):
addend = np.int64(periods) * np.int64(stride)
try:
# easy case with no overflows
return np.int64(endpoint) + addend
except (FloatingPointError, OverflowError):
# with endpoint negative and addend positive we risk
# FloatingPointError; with reversed signed we risk OverflowError
pass

if stride > 0:
# watch out for very special case in which we just slightly
# exceed implementation bounds, but when passing the result to
# np.arange will get a result slightly within the bounds
if endpoint >= 0:
result = np.uint64(endpoint) + np.uint64(addend)
i64max = np.uint64(np.iinfo(np.int64).max)
if result <= i64max + np.uint64(stride):
return result
else:
return _generate_range_recurse(endpoint, periods,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what are these 2 cases?

why is this recursion? why can't you do this iteratively?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this recursion? why can't you do this iteratively?

generate_range_recurse breaks the problem down into two smaller pieces and calls _generate_range_overflow_safe twice. It is iterative in the sense that it makes two calls, as opposed to repeated calls to itself. It is recursive inasmuch as it is called via _generate_range_overflow_safe, which it also calls. I'm open to other ideas for names.

what are these 2 cases?

Which two cases are you asking about?

np.abs(stride), side)
elif stride < 0 and endpoint > 0:
return _generate_range_recurse(np.uint64(endpoint), periods,
np.abs(stride), side)

try:
other_end = checked_add_with_arr(np.int64(endpoint),
np.int64(periods) * stride)
addend)
except OverflowError:
raise tslib.OutOfBoundsDatetime('Cannot generate range with '
'{side}={endpoint} and '
Expand All @@ -1785,6 +1852,32 @@ def _generate_range_overflow_safe(endpoint, periods, stride, side='start'):
return other_end


def _generate_range_recurse(endpoint, periods, stride, side):
"""
Avoid problems in int64/uint64 mismatch by splitting range generation into
smaller pieces.

Parameters
----------
endpoint : int
periods : int
stride : int
side : {'start', 'end'}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so pls move all of this new code to the same place as checked_add_with_arr

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

algos seems like a weird place to put datetime-specific code, no? I guess in principle this code could be used for generating int64 ranges...

Returns
-------
other_end : int
"""
# split into smaller pieces
mid_periods = periods // 2
remaining = periods - mid_periods
assert 0 < remaining < periods, (remaining, periods, endpoint, stride)

midpoint = _generate_range_overflow_safe(endpoint, mid_periods,
stride, side)
return _generate_range_overflow_safe(midpoint, remaining, stride, side)


# -------------------------------------------------------------------
# Validation and Inference

Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/indexes/datetimes/test_date_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,33 @@ def test_date_range_nat(self):
with pytest.raises(ValueError, match=msg):
date_range(start=pd.NaT, end='2016-01-01', freq='D')

def test_date_range_multiplication_overflow(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so 2 test cases fully exercise all of this code???

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call, I'm tracking down untested cases now.

# GH#24255
# check that overflows in calculating `addend = periods * stride`
# are caught
with tm.assert_produces_warning(None):
# we should _not_ be seeing a overflow RuntimeWarning
dti = date_range(start='1677-09-22', periods=213503, freq='D')

assert dti[0] == Timestamp('1677-09-22')
assert len(dti) == 213503

msg = "Cannot generate range with"
with pytest.raises(OutOfBoundsDatetime, match=msg):
date_range('1969-05-04', periods=200000000, freq='30000D')

def test_date_range_unsigned_overflow_handling(self):
# GH#24255
# case where `addend = periods * stride` overflows int64 bounds
# but not uint64 bounds
dti = date_range(start='1677-09-22', end='2262-04-11', freq='D')

dti2 = date_range(start=dti[0], periods=len(dti), freq='D')
assert dti2.equals(dti)

dti3 = date_range(end=dti[-1], periods=len(dti), freq='D')
assert dti3.equals(dti)

def test_date_range_out_of_bounds(self):
# GH#14187
with pytest.raises(OutOfBoundsDatetime):
Expand Down