bug in groupby when key space exceeds int64 bounds #9380

Merged · 1 commit · Jan 31, 2015
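The underlying bug (#9096): when the product of the grouping levels' sizes exceeds what int64 can hold, the flat key-space offsets computed by get_group_index wrap around, so distinct key tuples can collide into one group. A minimal sketch of the kind of frame that lands on the overflow path (the sizes and column names here are illustrative, not taken from the issue):

    import numpy as np
    import pandas as pd

    # ten keys with up to 1000 observed levels each: the cartesian key
    # space is ~1000**10, far beyond int64 (~9.2e18), so a single flat
    # offset cannot represent every possible key combination
    n = 1000
    df = pd.DataFrame({'k%d' % i: np.random.randint(0, n, n)
                       for i in range(10)})
    df['v'] = 1.0

    # before this fix, code paths that multiplied out the full key space
    # could silently overflow; the fix compresses ids level by level
    result = df.groupby(['k%d' % i for i in range(10)])['v'].sum()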
3 changes: 2 additions & 1 deletion bench/bench_groupby.py
@@ -47,7 +47,8 @@ def g():
 from pandas.core.groupby import get_group_index


-group_index = get_group_index(label_list, shape).astype('i4')
+group_index = get_group_index(label_list, shape,
+                              sort=True, xnull=True).astype('i4')

 ngroups = np.prod(shape)
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.0.txt
@@ -156,6 +156,7 @@ Bug Fixes
 - Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`)
 - Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).
 - Bug in ``MultiIndex`` where inserting new keys would fail (:issue:`9250`).
+- Bug in ``groupby`` when key space exceeds ``int64`` bounds (:issue:`9096`).


 - Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`).
114 changes: 56 additions & 58 deletions pandas/core/groupby.py
@@ -1367,30 +1367,16 @@ def group_info(self):

     def _get_compressed_labels(self):
         all_labels = [ping.labels for ping in self.groupings]
-        if self._overflow_possible:
-            tups = lib.fast_zip(all_labels)
-            labs, uniques = algos.factorize(tups)
-
-            if self.sort:
-                uniques, labs = _reorder_by_uniques(uniques, labs)
-
-            return labs, uniques
-        else:
-            if len(all_labels) > 1:
-                group_index = get_group_index(all_labels, self.shape)
-                comp_ids, obs_group_ids = _compress_group_index(group_index)
-            else:
-                ping = self.groupings[0]
-                comp_ids = ping.labels
-                obs_group_ids = np.arange(len(ping.group_index))
-                self.compressed = False
-                self._filter_empty_groups = False
-
-            return comp_ids, obs_group_ids
-
-    @cache_readonly
-    def _overflow_possible(self):
-        return _int64_overflow_possible(self.shape)
+        if len(all_labels) > 1:
+            group_index = get_group_index(all_labels, self.shape,
+                                          sort=True, xnull=True)
+            return _compress_group_index(group_index)
+
+        ping = self.groupings[0]
+        self.compressed = False
+        self._filter_empty_groups = False
+
+        return ping.labels, np.arange(len(ping.group_index))

     @cache_readonly
     def ngroups(self):
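With the overflow-specific branch gone, every multi-key grouping funnels through get_group_index (which now compresses internally as needed) followed by _compress_group_index. The compression step boils down to a factorize over the flat int64 ids; a rough sketch of its effect, using the public pandas.factorize for illustration rather than the internal helper:

    import numpy as np
    import pandas as pd

    group_index = np.array([7, 3, 7, 9, 3])     # flat ids, possibly sparse

    # _compress_group_index is essentially a factorization of the ids:
    comp_ids, obs_group_ids = pd.factorize(group_index)
    # comp_ids      -> [0, 1, 0, 2, 1]  dense per-row group numbers
    # obs_group_ids -> [7, 3, 9]        observed flat ids, by first appearance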
@@ -1402,15 +1388,13 @@ def result_index(self):
         return MultiIndex.from_arrays(recons, names=self.names)

     def get_group_levels(self):
-        obs_ids = self.group_info[1]
+        comp_ids, obs_ids, _ = self.group_info

         if not self.compressed and len(self.groupings) == 1:
             return [self.groupings[0].group_index]

-        if self._overflow_possible:
-            recons_labels = [np.array(x) for x in zip(*obs_ids)]
-        else:
-            recons_labels = decons_group_index(obs_ids, self.shape)
+        recons_labels = decons_obs_group_ids(comp_ids, obs_ids,
+                self.shape, (ping.labels for ping in self.groupings))

         name_list = []
         for ping, labels in zip(self.groupings, recons_labels):
@@ -3490,42 +3474,28 @@ def get_splitter(data, *args, **kwargs):
 # Misc utilities


-def get_group_index(label_list, shape):
+def get_group_index(labels, shape, sort, xnull):
     """
     For the particular label_list, gets the offsets into the hypothetical list
     representing the totally ordered cartesian product of all possible label
-    combinations.
-    """
-    if len(label_list) == 1:
-        return label_list[0]
-
-    n = len(label_list[0])
-    group_index = np.zeros(n, dtype=np.int64)
-    mask = np.zeros(n, dtype=bool)
-    for i in range(len(shape)):
-        stride = np.prod([x for x in shape[i + 1:]], dtype=np.int64)
-        group_index += com._ensure_int64(label_list[i]) * stride
-        mask |= label_list[i] < 0
-
-    np.putmask(group_index, mask, -1)
-    return group_index
-
-
-def get_flat_ids(labels, shape, retain_lex_rank):
-    """
-    Given a list of labels at each level, returns a flat array of int64 ids
-    corresponding to unique tuples across the labels. If `retain_lex_rank`,
-    rank of returned ids preserve lexical ranks of labels.
+    combinations, *as long as* this space fits within int64 bounds;
+    otherwise, though group indices identify unique combinations of
+    labels, they cannot be deconstructed.
+    - If `sort`, rank of returned ids preserve lexical ranks of labels.
+      i.e. returned id's can be used to do lexical sort on labels;
+    - If `xnull` nulls (-1 labels) are passed through.

     Parameters
     ----------
     labels: sequence of arrays
         Integers identifying levels at each location
     shape: sequence of ints same length as labels
         Number of unique levels at each location
-    retain_lex_rank: boolean
+    sort: boolean
         If the ranks of returned ids should match lexical ranks of labels
-
+    xnull: boolean
+        If true nulls are eXcluded. i.e. -1 values in the labels are
+        passed through
     Returns
     -------
     An array of type int64 where two elements are equal if their corresponding

@@ -3544,12 +3514,18 @@ def loop(labels, shape):
             stride //= shape[i]
             out += labels[i] * stride

+        if xnull:  # exclude nulls
+            mask = labels[0] == -1
+            for lab in labels[1:nlev]:
+                mask |= lab == -1
+            out[mask] = -1
+
         if nlev == len(shape):  # all levels done!
             return out

         # compress what has been done so far in order to avoid overflow
         # to retain lexical ranks, obs_ids should be sorted
-        comp_ids, obs_ids = _compress_group_index(out, sort=retain_lex_rank)
+        comp_ids, obs_ids = _compress_group_index(out, sort=sort)

         labels = [comp_ids] + labels[nlev:]
         shape = [len(obs_ids)] + shape[nlev:]

@@ -3560,9 +3536,10 @@ def maybe_lift(lab, size):  # promote nan values
         return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)

     labels = map(com._ensure_int64, labels)
-    labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
+    if not xnull:
+        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))

-    return loop(labels, shape)
+    return loop(list(labels), list(shape))


 _INT64_MAX = np.iinfo(np.int64).max
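The offsets themselves are plain mixed-radix arithmetic over the level sizes: each level's labels are multiplied by the product of the sizes of all later levels, then summed. A worked example with made-up labels:

    import numpy as np

    labels = [np.array([0, 1, 2]),      # first level,  3 possible values
              np.array([3, 0, 1])]      # second level, 4 possible values
    shape = [3, 4]

    # the stride of level 0 is shape[1] == 4; the last level has stride 1
    out = labels[0] * np.int64(4) + labels[1]
    # out -> [3, 4, 9], i.e. offsets into the 12-slot (3, 4) key space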
@@ -3578,6 +3555,11 @@ def _int64_overflow_possible(shape):

 def decons_group_index(comp_labels, shape):
     # reconstruct labels
+    if _int64_overflow_possible(shape):
+        # at some point group indices are factorized,
+        # and may not be deconstructed here! wrong path!
+        raise ValueError('cannot deconstruct factorized group indices!')
+
     label_list = []
     factor = 1
     y = 0
@@ -3591,12 +3573,25 @@ def decons_group_index(comp_labels, shape):
     return label_list[::-1]


+def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
+    """reconstruct labels from observed ids"""
+    from pandas.hashtable import unique_label_indices
+
+    if not _int64_overflow_possible(shape):
+        # obs ids are deconstructable! take the fast route!
+        return decons_group_index(obs_ids, shape)
+
+    i = unique_label_indices(comp_ids)
+    i8copy = lambda a: a.astype('i8', subok=False, copy=True)
+    return [i8copy(lab[i]) for lab in labels]
+
+
 def _indexer_from_factorized(labels, shape, compress=True):
     if _int64_overflow_possible(shape):
         indexer = np.lexsort(np.array(labels[::-1]))
         return indexer

-    group_index = get_group_index(labels, shape)
+    group_index = get_group_index(labels, shape, sort=True, xnull=True)

     if compress:
         comp_ids, obs_ids = _compress_group_index(group_index)
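Why decons_obs_group_ids works on the overflow path: once the ids have been factorized they no longer encode the labels arithmetically, but each compressed id still corresponds to a concrete row, so the original labels can be read back off the first row at which each id occurs. A numpy-only sketch of that idea (unique_label_indices is the Cython version added below in pandas/hashtable.pyx):

    import numpy as np

    comp_ids = np.array([0, 1, 0, 2, 1], dtype='i8')
    labels = [np.array([5, 2, 5, 7, 2]), np.array([1, 0, 1, 3, 0])]

    # first occurrence of each unique compressed id
    first_rows = np.unique(comp_ids, return_index=True)[1]
    recons = [lab[first_rows] for lab in labels]
    # recons -> [array([5, 2, 7]), array([1, 0, 3])]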
@@ -3712,9 +3707,12 @@ def get_key(self, comp_id):

 def _get_indices_dict(label_list, keys):
     shape = list(map(len, keys))
-    ngroups = np.prod(shape)

-    group_index = get_group_index(label_list, shape)
+    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
+    ngroups = ((group_index.size and group_index.max()) + 1) \
+        if _int64_overflow_possible(shape) \
+        else np.prod(shape, dtype='i8')

     sorter = _get_group_index_sorter(group_index, ngroups)

     sorted_labels = [lab.take(sorter) for lab in label_list]
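A note on the ngroups expression above: when the key space can overflow, get_group_index has already factorized the ids into a dense range, so the group count is simply the largest id plus one, and `group_index.size and group_index.max()` short-circuits to 0 for empty input rather than raising on max(). Unrolled for readability (a sketch; the overflow test mirrors _int64_overflow_possible):

    import numpy as np

    _INT64_MAX = np.iinfo(np.int64).max

    def ngroups_sketch(group_index, shape):
        prod = 1
        for s in shape:
            prod *= int(s)          # exact, arbitrary-precision product
        if prod >= _INT64_MAX:
            # ids are already dense, so the count is max id + 1;
            # guard the empty case, which would otherwise raise on max()
            return (group_index.max() if group_index.size else 0) + 1
        return np.prod(shape, dtype='i8')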
4 changes: 2 additions & 2 deletions pandas/core/index.py
@@ -3229,11 +3229,11 @@ def is_unique(self):

     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
     def duplicated(self, take_last=False):
-        from pandas.core.groupby import get_flat_ids
+        from pandas.core.groupby import get_group_index
         from pandas.hashtable import duplicated_int64

         shape = map(len, self.levels)
-        ids = get_flat_ids(self.labels, shape, False)
+        ids = get_group_index(self.labels, shape, sort=False, xnull=False)

         return duplicated_int64(ids, take_last)
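Here `xnull=False` matters: -1 labels (NaN positions) are lifted to a regular level value rather than passed through, so rows whose keys contain NaN can still be flagged as duplicates of each other. A sketch of the intended behavior:

    import numpy as np
    import pandas as pd

    mi = pd.MultiIndex.from_arrays([[1, 1, np.nan, np.nan],
                                    ['a', 'a', 'b', 'b']])
    mi.duplicated()
    # -> array([False,  True, False,  True]); the NaN rows compare equal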
18 changes: 8 additions & 10 deletions pandas/core/reshape.py
@@ -12,8 +12,8 @@
 from pandas.core.categorical import Categorical
 from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote,
                                 isnull)
-from pandas.core.groupby import (get_group_index, _compress_group_index,
-                                 decons_group_index)
+from pandas.core.groupby import get_group_index, _compress_group_index

 import pandas.core.common as com
 import pandas.algos as algos

@@ -103,10 +103,6 @@ def _make_sorted_values_labels(self):
         sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

         comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
-
-        # group_index = get_group_index(to_sort, sizes)
-        # comp_index, obs_ids = _compress_group_index(group_index)
-
         ngroups = len(obs_ids)

         indexer = algos.groupsort_indexer(comp_index, ngroups)[0]

@@ -252,6 +248,8 @@ def _make_new_index(lev, lab):


 def _unstack_multiple(data, clocs):
+    from pandas.core.groupby import decons_obs_group_ids
+
     if len(clocs) == 0:
         return data

@@ -271,10 +269,10 @@
     rnames = [index.names[i] for i in rlocs]

     shape = [len(x) for x in clevels]
-    group_index = get_group_index(clabels, shape)
+    group_index = get_group_index(clabels, shape, sort=False, xnull=False)

     comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
-    recons_labels = decons_group_index(obs_ids, shape)
+    recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels)

     dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                              labels=rlabels + [comp_ids],

@@ -449,9 +447,9 @@ def _unstack_frame(obj, level):


 def get_compressed_ids(labels, sizes):
-    from pandas.core.groupby import get_flat_ids
+    from pandas.core.groupby import get_group_index

-    ids = get_flat_ids(labels, sizes, True)
+    ids = get_group_index(labels, sizes, sort=True, xnull=False)
     return _compress_group_index(ids, sort=True)
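Both reshape call sites use `xnull=False`, since unstack must keep NaN labels addressable as real positions, and _unstack_multiple now reconstructs labels through decons_obs_group_ids, which works whether or not the ids were factorized. A quick sketch of get_compressed_ids against the internal API as of this PR:

    import numpy as np
    from pandas.core.reshape import get_compressed_ids

    labels = [np.array([0, 0, 1, 1]), np.array([0, 1, 0, 1])]
    comp_ids, obs_ids = get_compressed_ids(labels, [2, 2])
    # comp_ids -> [0, 1, 2, 3]  one dense id per row
    # obs_ids  -> [0, 1, 2, 3]  observed offsets into the (2, 2) key space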
33 changes: 32 additions & 1 deletion pandas/hashtable.pyx
@@ -17,6 +17,7 @@ cnp.import_array()
 cnp.import_ufunc()

 cdef int64_t iNaT = util.get_nat()
+_SIZE_HINT_LIMIT = (1 << 20) + 7

 cdef extern from "datetime.h":
     bint PyDateTime_Check(object o)

@@ -1073,7 +1074,7 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
         kh_int64_t * table = kh_init_int64()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')

-    kh_resize_int64(table, min(1 << 20, n))
+    kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))

     if take_last:
         for i from n > i >= 0:
@@ -1086,3 +1087,33 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):

     kh_destroy_int64(table)
     return out
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def unique_label_indices(ndarray[int64_t, ndim=1] labels):
+    """
+    indices of the first occurrences of the unique labels
+    *excluding* -1. equivalent to:
+        np.unique(labels, return_index=True)[1]
+    """
+    cdef:
+        int ret = 0
+        Py_ssize_t i, n = len(labels)
+        kh_int64_t * table = kh_init_int64()
+        Int64Vector idx = Int64Vector()
+        ndarray[int64_t, ndim=1] arr
+
+    kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))
+
+    for i in range(n):
+        kh_put_int64(table, labels[i], &ret)
+        if ret != 0:
+            idx.append(i)
+
+    kh_destroy_int64(table)
+
+    arr = idx.to_array()
+    arr = arr[labels[arr].argsort()]
+
+    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
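As the docstring says, the helper matches np.unique(..., return_index=True)[1] with any slot for -1 dropped; a pure-numpy sketch of the same computation:

    import numpy as np

    labels = np.array([3, -1, 3, 0, -1, 7], dtype='i8')

    idx = np.unique(labels, return_index=True)[1]   # first occurrences
    idx = idx[labels[idx] != -1]                    # drop the -1 slot
    # idx -> array([3, 0, 5]): first rows of 0, 3 and 7, in label order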
15 changes: 15 additions & 0 deletions pandas/tests/test_algos.py
@@ -261,6 +261,21 @@ def test_quantile():
     expected = algos.quantile(s.values, [0, .25, .5, .75, 1.])
     tm.assert_almost_equal(result, expected)

+def test_unique_label_indices():
+    from pandas.hashtable import unique_label_indices
+
+    a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')
+
+    left = unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1]
+
+    tm.assert_array_equal(left, right)
+
+    a[np.random.choice(len(a), 10)] = -1
+    left = unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1][1:]
+    tm.assert_array_equal(left, right)
+

 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],