BUG: floats cannot be ranked with tolerance #8379

Merged 1 commit on Mar 3, 2015
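For context, the user-visible effect of the fix, using the values from the updated test in pandas/tests/test_series.py below (a minimal sketch; the printed ranks reflect the patched behaviour):

import pandas as pd

s = pd.Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])

# Old behaviour: the tolerance check tied 1e-20 and 1e-20 + 1e-30,
# giving ranks [2, 1, 3.5, 5, 3.5, 6].
# With the exact comparison they rank separately as 3 and 4.
print(s.rank().tolist())  # [2.0, 1.0, 3.0, 5.0, 4.0, 6.0]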
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.0.txt
@@ -240,6 +240,7 @@ Bug Fixes
- Bug in ``MultiIndex`` where inserting new keys would fail (:issue:`9250`).
- Bug in ``groupby`` when key space exceeds ``int64`` bounds (:issue:`9096`).
- Bug in ``unstack`` with ``TimedeltaIndex`` or ``DatetimeIndex`` and nulls (:issue:`9491`).
- Bug in ``rank`` where comparing floats with tolerance will cause inconsistent behaviour (:issue:`8365`).


- Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`).
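To make the "inconsistent behaviour" in the entry above concrete: with a relative tolerance, whether two values tie can change when a constant is added to the whole series, which is exactly what the new tests below exercise with xs, xs + 1e6 and xs * 1e-6. A minimal plain-Python sketch of the removed comparison (are_diff is a simplified stand-in for the deleted float64_are_diff; the infinity handling is omitted):

REL_TOL = 1e-07  # the module-level tolerance removed from algos.pyx

def are_diff(left, right):
    # simplified version of the deleted float64_are_diff
    return abs(left - right) > REL_TOL * abs(right)

a, b = 0.1, 0.1000001
print(are_diff(a, b))              # True: ranked as two distinct values
print(are_diff(a + 1e6, b + 1e6))  # False: after a constant shift they would tie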
21 changes: 4 additions & 17 deletions pandas/algos.pyx
@@ -7,7 +7,6 @@ cimport cython
import_array()

cdef float64_t FP_ERR = 1e-13
cdef float64_t REL_TOL = 1e-07

cimport util

@@ -136,18 +135,6 @@ cdef _take_2d_object(ndarray[object, ndim=2] values,
    return result


cdef inline bint float64_are_diff(float64_t left, float64_t right):
    cdef double abs_diff, allowed
    if right == MAXfloat64 or right == -MAXfloat64:
        if left == right:
            return False
        else:
            return True
    else:
        abs_diff = fabs(left - right)
        allowed = REL_TOL * fabs(right)
        return abs_diff > allowed
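The helper removed above is also non-transitive: a can be "within tolerance" of b, and b of c, while a and c are not, so the run-of-ties scan in rank_1d_float64 below could group values differently depending on their neighbours in the sorted order. A small plain-Python illustration mirroring the deleted logic (infinity handling omitted):

REL_TOL = 1e-07

def are_diff(left, right):
    return abs(left - right) > REL_TOL * abs(right)

x, y, z = 1.0, 1.0 + 8e-8, 1.0 + 1.6e-7
print(are_diff(x, y))  # False: x "ties" y
print(are_diff(y, z))  # False: y "ties" z
print(are_diff(x, z))  # True:  yet x does not "tie" z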

def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
                    na_option='keep', pct=False):
    """
@@ -202,7 +189,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True,
            ranks[argsorted[i]] = nan
            continue
        count += 1.0
        if i == n - 1 or float64_are_diff(sorted_data[i + 1], val):
        if i == n - 1 or sorted_data[i + 1] != val:
            if tiebreak == TIEBREAK_AVERAGE:
                for j in range(i - dups + 1, i + 1):
                    ranks[argsorted[j]] = sum_ranks / dups
@@ -361,7 +348,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average',
                ranks[i, argsorted[i, j]] = nan
                continue
            count += 1.0
            if j == k - 1 or float64_are_diff(values[i, j + 1], val):
            if j == k - 1 or values[i, j + 1] != val:
                if tiebreak == TIEBREAK_AVERAGE:
                    for z in range(j - dups + 1, j + 1):
                        ranks[i, argsorted[i, z]] = sum_ranks / dups
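For readers who do not want to trace the Cython, here is a plain-Python sketch of the run-of-ties logic that the two changed comparisons drive, restricted to method='average' (rank_average is a made-up name for illustration, not a pandas function):

def rank_average(values):
    # Scan the values in sorted order; a run of exact ties accumulates in
    # sum_ranks/dups, and when the next value differs (or the data ends)
    # every member of the run gets the average of the positions it spans.
    n = len(values)
    order = sorted(range(n), key=lambda k: values[k])
    ranks = [0.0] * n
    sum_ranks = dups = 0
    for i in range(n):
        sum_ranks += i + 1
        dups += 1
        val = values[order[i]]
        if i == n - 1 or values[order[i + 1]] != val:  # exact comparison, as in the patch
            for j in range(i - dups + 1, i + 1):
                ranks[order[j]] = sum_ranks / dups
            sum_ranks = dups = 0
    return ranks

print(rank_average([1.0, 2.0, 2.0, 3.0]))
# [1.0, 2.5, 2.5, 4.0]
print(rank_average([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1]))
# [2.0, 1.0, 3.0, 5.0, 4.0, 6.0]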
@@ -1087,7 +1074,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
    sum_wt = 1.
    sum_wt2 = 1.
    old_wt = 1.

    for i from 1 <= i < N:
        cur_x = input_x[i]
        cur_y = input_y[i]
@@ -1117,7 +1104,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
        elif is_observation:
            mean_x = cur_x
            mean_y = cur_y

        if nobs >= minp:
            if not bias:
                numerator = sum_wt * sum_wt
2 changes: 1 addition & 1 deletion pandas/tests/test_series.py
@@ -4707,7 +4707,7 @@ def test_rank(self):
        assert_series_equal(iranks, exp)

        iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20+1e-30, 1e-1])
        exp = Series([2, 1, 3.5, 5, 3.5, 6])
        exp = Series([2, 1, 3, 5, 4, 6.0])
        iranks = iseries.rank()
        assert_series_equal(iranks, exp)
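Why the expectation changes: 1e-20 and 1e-20 + 1e-30 are genuinely different float64 values, but their difference (about 1e-30) falls well inside the old relative tolerance (1e-07 * 1e-20 = 1e-27), so the previous code tied them at rank 3.5. A quick check:

a, b = 1e-20, 1e-20 + 1e-30
print(a == b)                        # False: distinct float64 values
print(abs(a - b) <= 1e-07 * abs(b))  # True: within the removed REL_TOL, hence the old tie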

37 changes: 37 additions & 0 deletions pandas/tests/test_stats.py
@@ -44,6 +44,43 @@ def _check(s, expected, method='average'):
            series = s if dtype is None else s.astype(dtype)
            _check(series, results[method], method=method)

    def test_rank_methods_series(self):
        tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
        from scipy.stats import rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord('a') + i) for i in range(len(xs))]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            ts = Series(vals, index=index)

            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = ts.rank(m)
                sprank = rankdata(vals, m if m != 'first' else 'ordinal')
                tm.assert_series_equal(result, Series(sprank, index=index))

    def test_rank_methods_frame(self):
        tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
        from scipy.stats import rankdata

        xs = np.random.randint(0, 21, (100, 26))
        xs = (xs - 10.0) / 10.0
        cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            df = DataFrame(vals, columns=cols)

            for ax in [0, 1]:
                for m in ['average', 'min', 'max', 'first', 'dense']:
                    result = df.rank(axis=ax, method=m)
                    sprank = np.apply_along_axis(rankdata, ax, vals,
                                                 m if m != 'first' else 'ordinal')
                    expected = DataFrame(sprank, columns=cols)
                    tm.assert_frame_equal(result, expected)

    def test_rank_dense_method(self):
        dtypes = ['O', 'f8', 'i8']
        in_out = [([1], [1]),
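The new tests cross-check pandas against scipy.stats.rankdata, mapping pandas' method='first' to scipy's 'ordinal' (both break ties by order of appearance). A minimal standalone sketch of that correspondence (requires scipy; the values are illustrative):

import numpy as np
import pandas as pd
from scipy.stats import rankdata

vals = np.array([0.5, 0.5, 0.1])
print(pd.Series(vals).rank(method='first').tolist())  # [2.0, 3.0, 1.0]
print(list(rankdata(vals, 'ordinal')))                # ranks 2, 3, 1 as well (dtype may differ)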