Skip to content

De-duplicate code for indexing with list-likes of keys #21503

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 19, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2723,7 +2723,8 @@ def _getitem_array(self, key):
indexer = key.nonzero()[0]
return self._take(indexer, axis=0)
else:
indexer = self.loc._convert_to_indexer(key, axis=1)
indexer = self.loc._convert_to_indexer(key, axis=1,
raise_missing=True)
return self._take(indexer, axis=1)

def _getitem_multilevel(self, key):
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3627,7 +3627,7 @@ def _reindex_non_unique(self, target):
else:

# need to retake to have the same size as the indexer
indexer[~check] = 0
indexer[~check] = -1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what effect did this have? e.g. was this a bug before or just not used?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To the best of my understanding, it was not used: not in the sense that this line was not hit, but because the locations for missing keys were then taken from new_indexer (see three lines below), where they were correctly marked with -1.


# reset the new indexer to account for the new size
new_indexer = np.arange(len(self.take(indexer)))
Expand Down
214 changes: 113 additions & 101 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,8 @@ def _align_series(self, indexer, ser, multiindex_indexer=False):
if isinstance(indexer, tuple):

# flatten np.ndarray indexers
ravel = lambda i: i.ravel() if isinstance(i, np.ndarray) else i
def ravel(i):
return i.ravel() if isinstance(i, np.ndarray) else i
indexer = tuple(map(ravel, indexer))

aligners = [not com.is_null_slice(idx) for idx in indexer]
Expand Down Expand Up @@ -925,33 +926,10 @@ def _multi_take(self, tup):
""" create the reindex map for our objects, raise the _exception if we
can't create the indexer
"""
try:
o = self.obj
d = {}
for key, axis in zip(tup, o._AXIS_ORDERS):
ax = o._get_axis(axis)
# Have the index compute an indexer or return None
# if it cannot handle:
indexer, keyarr = ax._convert_listlike_indexer(key,
kind=self.name)
# We only act on all found values:
if indexer is not None and (indexer != -1).all():
self._validate_read_indexer(key, indexer, axis)
d[axis] = (ax[indexer], indexer)
continue

# If we are trying to get actual keys from empty Series, we
# patiently wait for a KeyError later on - otherwise, convert
if len(ax) or not len(key):
key = self._convert_for_reindex(key, axis)
indexer = ax.get_indexer_for(key)
keyarr = ax.reindex(keyarr)[0]
self._validate_read_indexer(keyarr, indexer,
o._get_axis_number(axis))
d[axis] = (keyarr, indexer)
return o._reindex_with_indexers(d, copy=True, allow_dups=True)
except (KeyError, IndexingError) as detail:
raise self._exception(detail)
o = self.obj
d = {axis: self._get_listlike_indexer(key, axis)
for (key, axis) in zip(tup, o._AXIS_ORDERS)}
return o._reindex_with_indexers(d, copy=True, allow_dups=True)

def _convert_for_reindex(self, key, axis=None):
return key
Expand Down Expand Up @@ -1124,7 +1102,88 @@ def _getitem_axis(self, key, axis=None):

return self._get_label(key, axis=axis)

def _get_listlike_indexer(self, key, axis, raise_missing=False):
    """
    Transform a list-like of keys into a new index and an indexer.

    Parameters
    ----------
    key : list-like
        Target labels
    axis : int
        Dimension on which the indexing is being made
    raise_missing : bool, default False
        Whether to raise a KeyError if some labels are not found. Will be
        removed in the future, and then this method will always behave as
        if raise_missing=True.

    Raises
    ------
    KeyError
        Propagated from ``_validate_read_indexer``: if at least one key
        was requested but none was found, or if only some keys were found
        and raise_missing=True.

    Returns
    -------
    keyarr : Index
        New index (coinciding with 'key' if the axis is unique)
    indexer : array-like
        An indexer for the return object; -1 denotes keys not found
    """
    o = self.obj
    ax = o._get_axis(axis)

    # Have the index compute an indexer or return None
    # if it cannot handle:
    indexer, keyarr = ax._convert_listlike_indexer(key, kind=self.name)
    # We only act on all found values:
    if indexer is not None and (indexer != -1).all():
        self._validate_read_indexer(key, indexer, axis,
                                    raise_missing=raise_missing)
        return ax[indexer], indexer

    if ax.is_unique:
        # If we are trying to get actual keys from empty Series, we
        # patiently wait for a KeyError later on - otherwise, convert
        if len(ax) or not len(key):
            key = self._convert_for_reindex(key, axis)
        indexer = ax.get_indexer_for(key)
        keyarr = ax.reindex(keyarr)[0]
    else:
        # The third element returned by _reindex_non_unique is not needed
        # here: validation and the return value only use the first two.
        keyarr, indexer, _ = ax._reindex_non_unique(keyarr)

    self._validate_read_indexer(keyarr, indexer,
                                o._get_axis_number(axis),
                                raise_missing=raise_missing)
    return keyarr, indexer

def _getitem_iterable(self, key, axis=None):
"""
Index current object with an iterable key (which can be a boolean
indexer, or a collection of keys).

Parameters
----------
key : iterable
Target labels, or boolean indexer
axis: int, default None
Dimension on which the indexing is being made

Raises
------
KeyError
If no key was found. Will change in the future to raise if not all
keys were found.
IndexingError
If the boolean indexer is unalignable with the object being
indexed.

Returns
-------
scalar, DataFrame, or Series: indexed value(s).
"""

if axis is None:
axis = self.axis or 0

Expand All @@ -1133,54 +1192,18 @@ def _getitem_iterable(self, key, axis=None):
labels = self.obj._get_axis(axis)

if com.is_bool_indexer(key):
# A boolean indexer
key = check_bool_indexer(labels, key)
inds, = key.nonzero()
return self.obj._take(inds, axis=axis)
else:
# Have the index compute an indexer or return None
# if it cannot handle; we only act on all found values
indexer, keyarr = labels._convert_listlike_indexer(
key, kind=self.name)
if indexer is not None and (indexer != -1).all():
self._validate_read_indexer(key, indexer, axis)
return self.obj.take(indexer, axis=axis)

ax = self.obj._get_axis(axis)
# existing labels are unique and indexer are unique
if labels.is_unique and Index(keyarr).is_unique:
indexer = ax.get_indexer_for(key)
self._validate_read_indexer(key, indexer, axis)

d = {axis: [ax.reindex(keyarr)[0], indexer]}
return self.obj._reindex_with_indexers(d, copy=True,
allow_dups=True)

# existing labels are non-unique
else:

# reindex with the specified axis
if axis + 1 > self.obj.ndim:
raise AssertionError("invalid indexing error with "
"non-unique index")

new_target, indexer, new_indexer = labels._reindex_non_unique(
keyarr)

if new_indexer is not None:
result = self.obj._take(indexer[indexer != -1], axis=axis)

self._validate_read_indexer(key, new_indexer, axis)
result = result._reindex_with_indexers(
{axis: [new_target, new_indexer]},
copy=True, allow_dups=True)
# A collection of keys
keyarr, indexer = self._get_listlike_indexer(key, axis,
raise_missing=False)
return self.obj._reindex_with_indexers({axis: [keyarr, indexer]},
copy=True, allow_dups=True)

else:
self._validate_read_indexer(key, indexer, axis)
result = self.obj._take(indexer, axis=axis)

return result

def _validate_read_indexer(self, key, indexer, axis):
def _validate_read_indexer(self, key, indexer, axis, raise_missing=False):
"""
Check that indexer can be used to return a result (e.g. at least one
element was found, unless the list of keys was actually empty).
Expand All @@ -1193,11 +1216,16 @@ def _validate_read_indexer(self, key, indexer, axis):
Indices corresponding to the key (with -1 indicating not found)
axis: int
Dimension on which the indexing is being made
raise_missing: bool
Whether to raise a KeyError if some labels are not found. Will be
removed in the future, and then this method will always behave as
if raise_missing=True.

Raises
------
KeyError
If at least one key was requested none was found.
If at least one key was requested but none was found, and
raise_missing=True.
"""

ax = self.obj._get_axis(axis)
Expand All @@ -1214,6 +1242,12 @@ def _validate_read_indexer(self, key, indexer, axis):
u"None of [{key}] are in the [{axis}]".format(
key=key, axis=self.obj._get_axis_name(axis)))

# We (temporarily) allow for some missing keys with .loc, except in
# some cases (e.g. setting) in which "raise_missing" will be True
if not(self.name == 'loc' and not raise_missing):
not_found = list(set(key) - set(ax))
raise KeyError("{} not in index".format(not_found))

# we skip the warning on Categorical/Interval
# as this check is actually done (check for
# non-missing values), but a bit later in the
Expand All @@ -1229,9 +1263,10 @@ def _validate_read_indexer(self, key, indexer, axis):

if not (ax.is_categorical() or ax.is_interval()):
warnings.warn(_missing_key_warning,
FutureWarning, stacklevel=5)
FutureWarning, stacklevel=6)

def _convert_to_indexer(self, obj, axis=None, is_setter=False):
def _convert_to_indexer(self, obj, axis=None, is_setter=False,
raise_missing=False):
"""
Convert indexing key into something we can use to do actual fancy
indexing on an ndarray
Expand Down Expand Up @@ -1310,33 +1345,10 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False):
inds, = obj.nonzero()
return inds
else:

# Have the index compute an indexer or return None
# if it cannot handle
indexer, objarr = labels._convert_listlike_indexer(
obj, kind=self.name)
if indexer is not None:
return indexer

# unique index
if labels.is_unique:
indexer = check = labels.get_indexer(objarr)

# non-unique (dups)
else:
(indexer,
missing) = labels.get_indexer_non_unique(objarr)
# 'indexer' has dupes, create 'check' using 'missing'
check = np.zeros(len(objarr), dtype=np.intp)
check[missing] = -1

mask = check == -1
if mask.any():
raise KeyError('{mask} not in index'
.format(mask=objarr[mask]))

return com._values_from_object(indexer)

# When setting, missing keys are not allowed, even with .loc:
kwargs = {'raise_missing': True if is_setter else
raise_missing}
return self._get_listlike_indexer(obj, axis, **kwargs)[1]
else:
try:
return labels.get_loc(obj)
Expand Down