Skip to content

PERF: improved performance of small multiindexes #16324

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 11, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,15 @@ def setup(self):
np.arange(1000)], names=['one', 'two'])

import string
self.mistring = MultiIndex.from_product(
[np.arange(1000),
np.arange(20), list(string.ascii_letters)],

self.mi_large = MultiIndex.from_product(
[np.arange(1000), np.arange(20), list(string.ascii_letters)],
names=['one', 'two', 'three'])
self.mi_med = MultiIndex.from_product(
[np.arange(1000), np.arange(10), list('A')],
names=['one', 'two', 'three'])
self.mi_small = MultiIndex.from_product(
[np.arange(100), list('A'), list('A')],
names=['one', 'two', 'three'])

def time_series_xs_mi_ix(self):
Expand All @@ -218,8 +224,14 @@ def time_multiindex_get_indexer(self):
(0, 16), (0, 17), (0, 18),
(0, 19)], dtype=object))

def time_multiindex_large_get_loc(self):
    # Benchmark a single-tuple lookup at the far end of the large
    # MultiIndex (mi_large is built in setup — TODO confirm shape).
    self.mi_large.get_loc((999, 19, 'Z'))

def time_multiindex_med_get_loc(self):
    # Benchmark a single-tuple lookup at the far end of the medium
    # MultiIndex (mi_med is built in setup — TODO confirm shape).
    self.mi_med.get_loc((999, 9, 'A'))

def time_multiindex_string_get_loc(self):
    # Benchmark a single-tuple lookup on the small MultiIndex.
    # NOTE: the stale `self.mistring.get_loc(...)` line was removed —
    # it is the pre-rename diff leftover; `mistring` no longer exists
    # after setup() renamed it to `mi_large`, so calling it would
    # raise AttributeError and break the benchmark.
    self.mi_small.get_loc((99, 'A', 'A'))

def time_is_monotonic(self):
self.miint.is_monotonic
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.20.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Performance regression fix when indexing with a list-like (:issue:`16285`)

- Performance regression fix for small MultiIndexes (:issue:`16319`)

.. _whatsnew_0202.bug_fixes:

Expand Down
33 changes: 32 additions & 1 deletion pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,34 @@ cdef inline bint _is_utc(object tz):
return tz is UTC or isinstance(tz, _du_utc)


cdef class MultiIndexEngine(IndexEngine):
cdef class MultiIndexObjectEngine(ObjectEngine):
    """
    Provide the same interface as the MultiIndexEngine,
    but delegate computation to the object-dtype IndexEngine.
    This provides good performance for smaller MultiIndexes.
    """
    def get_indexer(self, values):
        # convert a MultiIndex to an ndarray of tuples so the plain
        # object engine can index it
        if hasattr(values, 'values'):
            values = values.values
        return super(MultiIndexObjectEngine, self).get_indexer(values)

    cpdef get_loc(self, object val):

        # convert a MultiIndex to an ndarray of tuples so the plain
        # object engine can look it up
        if hasattr(val, 'values'):
            val = val.values
        return super(MultiIndexObjectEngine, self).get_loc(val)


cdef class MultiIndexHashEngine(ObjectEngine):
"""
Use a hashing based MultiIndex impl
but use the IndexEngine for computation
This provides good performance with larger MI's
"""

def _call_monotonic(self, object mi):
# defer these back to the mi iteself
Expand Down Expand Up @@ -584,6 +611,10 @@ cdef class MultiIndexEngine(IndexEngine):
except TypeError:
raise KeyError(val)

def get_indexer(self, values):
    # build the hash table on first use, then do the bulk lookup
    # directly against it
    self._ensure_mapping_populated()
    return self.mapping.lookup(values)

cdef _make_hash_table(self, n):
    # use the MultiIndex-specific hash table implementation
    return _hash.MultiIndexHashTable(n)

Expand Down
6 changes: 3 additions & 3 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,12 @@ def is_dtype(cls, dtype):
"""
if hasattr(dtype, 'dtype'):
dtype = dtype.dtype
if isinstance(dtype, cls):
return True
elif isinstance(dtype, np.dtype):
if isinstance(dtype, np.dtype):
return False
elif dtype is None:
return False
elif isinstance(dtype, cls):
return True
try:
return cls.construct_from_string(dtype) is not None
except:
Expand Down
12 changes: 10 additions & 2 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ class MultiIndex(Index):
_levels = FrozenList()
_labels = FrozenList()
_comparables = ['names']
_engine_type = libindex.MultiIndexEngine
rename = Index.set_names

def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
Expand Down Expand Up @@ -629,7 +628,16 @@ def _get_level_number(self, level):

@cache_readonly
def _engine(self):
    """
    Return the indexing engine for this MultiIndex, chosen by size:
    the hashing-based MultiIndexHashEngine for larger indexes, and
    the MultiIndexObjectEngine for smaller ones.
    """
    # xref: https://github.com/pandas-dev/pandas/pull/16324
    # (fixed comment typo "MultiIndexOjbect" and removed review-UI
    # text that had been pasted into the body by the diff scrape)
    length = len(self)
    if length > 10000:
        return libindex.MultiIndexHashEngine(lambda: self, length)

    # the object engine is fed the materialized .values (an ndarray
    # of tuples), not the MultiIndex itself
    return libindex.MultiIndexObjectEngine(lambda: self.values, length)

@property
def values(self):
Expand Down
18 changes: 7 additions & 11 deletions pandas/core/util/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,13 @@

import numpy as np
from pandas._libs import hashing
from pandas._libs.lib import is_bool_array
from pandas.core.dtypes.generic import (
ABCMultiIndex,
ABCIndexClass,
ABCSeries,
ABCDataFrame)
from pandas.core.dtypes.common import (
is_categorical_dtype, is_numeric_dtype,
is_datetime64_dtype, is_timedelta64_dtype,
is_list_like)
is_categorical_dtype, is_list_like)

# 16 byte long hashing key
_default_hash_key = '0123456789123456'
Expand Down Expand Up @@ -136,7 +133,6 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
-------
ndarray of hashed values array
"""

is_tuple = False
if isinstance(vals, tuple):
vals = [vals]
Expand Down Expand Up @@ -231,29 +227,29 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):

if not hasattr(vals, 'dtype'):
raise TypeError("must pass a ndarray-like")
dtype = vals.dtype

if hash_key is None:
hash_key = _default_hash_key

# For categoricals, we hash the categories, then remap the codes to the
# hash values. (This check is above the complex check so that we don't ask
# numpy if categorical is a subdtype of complex, as it will choke.
if is_categorical_dtype(vals.dtype):
if is_categorical_dtype(dtype):
return _hash_categorical(vals, encoding, hash_key)

# we'll be working with everything as 64-bit values, so handle this
# 128-bit value early
if np.issubdtype(vals.dtype, np.complex128):
elif np.issubdtype(dtype, np.complex128):
return hash_array(vals.real) + 23 * hash_array(vals.imag)

# First, turn whatever array this is into unsigned 64-bit ints, if we can
# manage it.
if is_bool_array(vals):
elif isinstance(dtype, np.bool):
vals = vals.astype('u8')
elif (is_datetime64_dtype(vals) or
is_timedelta64_dtype(vals)):
elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
vals = vals.view('i8').astype('u8', copy=False)
elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
else:
# With repeated values, its MUCH faster to categorize object dtypes,
Expand Down