Skip to content

Commit 5323cf8

Browse files
committed
BUG: bug in selecting from a Categorical with iloc (GH8623)
BUG: bug in groupby-transform with a Categorical (GH8623) BUG: bug in duplicated/drop_duplicates with a Categorical (GH8623)
1 parent 3bf4bfd commit 5323cf8

File tree

5 files changed

+79
-10
lines changed

5 files changed

+79
-10
lines changed

doc/source/whatsnew/v0.15.1.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,9 @@ Bug Fixes
183183
- Bug in ``cut``/``qcut`` when using ``Series`` and ``retbins=True`` (:issue:`8589`)
184184
- Bug in writing Categorical columns to an SQL database with ``to_sql`` (:issue:`8624`).
185185
- Bug in comparing ``Categorical`` of datetime raising when being compared to a scalar datetime (:issue:`8687`)
186-
186+
- Bug in selecting from a ``Categorical`` with ``.iloc`` (:issue:`8623`)
187+
- Bug in groupby-transform with a ``Categorical`` (:issue:`8623`)
188+
- Bug in ``duplicated``/``drop_duplicates`` with a ``Categorical`` (:issue:`8623`)
187189

188190

189191

pandas/core/frame.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2732,19 +2732,20 @@ def _m8_to_i8(x):
27322732
return x.view(np.int64)
27332733
return x
27342734

2735+
# if we are only duplicating on Categoricals this can be much faster
27352736
if subset is None:
2736-
values = list(_m8_to_i8(self.values.T))
2737+
values = list(_m8_to_i8(self.get_values().T))
27372738
else:
27382739
if np.iterable(subset) and not isinstance(subset, compat.string_types):
27392740
if isinstance(subset, tuple):
27402741
if subset in self.columns:
2741-
values = [self[subset].values]
2742+
values = [self[subset].get_values()]
27422743
else:
2743-
values = [_m8_to_i8(self[x].values) for x in subset]
2744+
values = [_m8_to_i8(self[x].get_values()) for x in subset]
27442745
else:
2745-
values = [_m8_to_i8(self[x].values) for x in subset]
2746+
values = [_m8_to_i8(self[x].get_values()) for x in subset]
27462747
else:
2747-
values = [self[subset].values]
2748+
values = [self[subset].get_values()]
27482749

27492750
keys = lib.fast_zip_fillna(values)
27502751
duplicated = lib.duplicated(keys, take_last=take_last)

pandas/core/internals.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ class Block(PandasObject):
5858
_verify_integrity = True
5959
_validate_ndim = True
6060
_ftype = 'dense'
61+
_holder = None
6162

6263
def __init__(self, values, placement, ndim=None, fastpath=False):
6364
if ndim is None:
@@ -476,6 +477,14 @@ def to_native_types(self, slicer=None, na_rep='', **kwargs):
476477

477478
def _concat_blocks(self, blocks, values):
478479
""" return the block concatenation """
480+
481+
# dispatch to a categorical to handle the concat
482+
if self._holder is None:
483+
484+
for b in blocks:
485+
if b.is_categorical:
486+
return b._concat_blocks(blocks,values)
487+
479488
return self._holder(values[0])
480489

481490
# block actions ####
@@ -1739,10 +1748,24 @@ def _concat_blocks(self, blocks, values):
17391748
return the block concatenation
17401749
"""
17411750

1742-
categories = self.values.categories
1743-
for b in blocks:
1751+
# we could have object blocks and categoricals here
1752+
# if all the categoricals share the same categories then combine everything
1753+
# else it's a non-compat categorical
1754+
1755+
categoricals = [ b for b in blocks if b.is_categorical ]
1756+
objects = [ b for b in blocks if not b.is_categorical and b.is_object ]
1757+
1758+
# only object and categorical blocks can be combined; anything else is an error
1759+
if len(objects) + len(categoricals) != len(blocks):
1760+
raise ValueError("try to combine non-object blocks and categoricals")
1761+
1762+
# validate the categories
1763+
categories = None
1764+
for b in categoricals:
1765+
if categories is None:
1766+
categories = b.values.categories
17441767
if not categories.equals(b.values.categories):
1745-
raise ValueError("incompatible levels in categorical block merge")
1768+
raise ValueError("incompatible categories in categorical block merge")
17461769

17471770
return self._holder(values[0], categories=categories)
17481771

pandas/core/series.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -475,7 +475,13 @@ def _ixs(self, i, axis=0):
475475
value : scalar (int) or Series (slice, sequence)
476476
"""
477477
try:
478-
return _index.get_value_at(self.values, i)
478+
479+
# dispatch to the values if we need
480+
values = self.values
481+
if isinstance(values, np.ndarray):
482+
return _index.get_value_at(values, i)
483+
else:
484+
return values[i]
479485
except IndexError:
480486
raise
481487
except:

pandas/tests/test_categorical.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,6 +1030,21 @@ def test_basic(self):
10301030
str(df.values)
10311031
str(df)
10321032

1033+
# GH8623
1034+
x = pd.DataFrame([[1,'John P. Doe'],[2,'Jane Dove'],[1,'John P. Doe']],
1035+
columns=['person_id','person_name'])
1036+
x['person_name'] = pd.Categorical(x.person_name) # doing this breaks transform
1037+
1038+
expected = x.iloc[0].person_name
1039+
result = x.person_name.iloc[0]
1040+
self.assertEqual(result,expected)
1041+
1042+
result = x.person_name[0]
1043+
self.assertEqual(result,expected)
1044+
1045+
result = x.person_name.loc[0]
1046+
self.assertEqual(result,expected)
1047+
10331048
def test_creation_astype(self):
10341049
l = ["a","b","c","a"]
10351050
s = pd.Series(l)
@@ -1477,6 +1492,28 @@ def test_groupby(self):
14771492
result = gb.sum()
14781493
tm.assert_frame_equal(result, expected)
14791494

1495+
# GH 8623
1496+
x=pd.DataFrame([[1,'John P. Doe'],[2,'Jane Dove'],[1,'John P. Doe']],
1497+
columns=['person_id','person_name'])
1498+
x['person_name'] = pd.Categorical(x.person_name)
1499+
1500+
g = x.groupby(['person_id'])
1501+
result = g.transform(lambda x:x)
1502+
tm.assert_frame_equal(result, x[['person_name']])
1503+
1504+
result = x.drop_duplicates('person_name')
1505+
expected = x.iloc[[0,1]]
1506+
tm.assert_frame_equal(result, expected)
1507+
1508+
def f(x):
1509+
return x.drop_duplicates('person_name').iloc[0]
1510+
1511+
result = g.apply(f)
1512+
expected = x.iloc[[0,1]].copy()
1513+
expected.index = Index([1,2],name='person_id')
1514+
expected['person_name'] = expected['person_name'].astype('object')
1515+
tm.assert_frame_equal(result, expected)
1516+
14801517
def test_pivot_table(self):
14811518

14821519
raw_cat1 = Categorical(["a","a","b","b"], categories=["a","b","z"])

0 commit comments

Comments
 (0)