-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
API/ENH: union Categorical #13361
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
API/ENH: union Categorical #13361
Changes from 4 commits
ccaeb76
7b37c34
77e7963
4499cda
17209f9
568784f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,8 @@ | |
from pandas import (DataFrame, concat, | ||
read_csv, isnull, Series, date_range, | ||
Index, Panel, MultiIndex, Timestamp, | ||
DatetimeIndex) | ||
DatetimeIndex, Categorical) | ||
from pandas.types.concat import union_categoricals | ||
from pandas.util import testing as tm | ||
from pandas.util.testing import (assert_frame_equal, | ||
makeCustomDataframe as mkdf, | ||
|
@@ -919,6 +920,37 @@ def test_concat_keys_with_none(self): | |
keys=['b', 'c', 'd', 'e']) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_union_categorical(self): | ||
# GH 13361 | ||
s = Categorical(list('abc')) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a test that makes sure that the returned categories are in order of appearance in the union (you might have an existing one which checks this accidentally, but make an explicit one, and mark it). |
||
s2 = Categorical(list('abd')) | ||
result = union_categoricals([s, s2]) | ||
expected = Categorical(list('abcabd')) | ||
tm.assert_categorical_equal(result, expected, ignore_order=True) | ||
|
||
s = Categorical([0, 1, 2]) | ||
s2 = Categorical([2, 3, 4]) | ||
result = union_categoricals([s, s2]) | ||
expected = Categorical([0, 1, 2, 2, 3, 4]) | ||
tm.assert_categorical_equal(result, expected, ignore_order=True) | ||
|
||
s = Categorical([0, 1.2, 2]) | ||
s2 = Categorical([2, 3.4, 4]) | ||
result = union_categoricals([s, s2]) | ||
expected = Categorical([0, 1.2, 2, 2, 3.4, 4]) | ||
tm.assert_categorical_equal(result, expected, ignore_order=True) | ||
|
||
# can't be ordered | ||
s = Categorical([0, 1.2, 2], ordered=True) | ||
with tm.assertRaises(TypeError): | ||
union_categoricals([s, s2]) | ||
|
||
# must exactly match types | ||
s = Categorical([0, 1.2, 2]) | ||
s2 = Categorical([2, 3, 4]) | ||
with tm.assertRaises(TypeError): | ||
union_categoricals([s, s2]) | ||
|
||
def test_concat_bug_1719(self): | ||
ts1 = tm.makeTimeSeries() | ||
ts2 = tm.makeTimeSeries()[::2] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -201,6 +201,43 @@ def convert_categorical(x): | |
return Categorical(concatted, rawcats) | ||
|
||
|
||
def union_categoricals(to_union): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add an `ignore_order` keyword argument, and only reject ordered input when it is not set:
    if not ignore_order and any(c.ordered for c in to_union):
        raise TypeError("Can only combine unordered Categoricals")
It would still return an unordered cat, of course. |
||
""" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a versionadded tag. |
||
Combine list-like of Categoricals, unioning categories. All | ||
must have the same dtype, and none can be ordered. | ||
|
||
Parameters | ||
---------- | ||
to_union : list like of Categorical | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a Raises section (and list when each exception is raised). |
||
Returns | ||
------- | ||
Categorical | ||
A single array, categories will be ordered as they | ||
appear in the list | ||
""" | ||
from pandas import Index, Categorical, unique | ||
|
||
if any(c.ordered for c in to_union): | ||
raise TypeError("Can only combine unordered Categoricals") | ||
|
||
first = to_union[0] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You should gracefully catch the condition that the list of categoricals is empty. |
||
if not all(com.is_dtype_equal(c.categories, first.categories) | ||
for c in to_union): | ||
raise TypeError("dtype of categories must be the same") | ||
|
||
unique_cats = unique(np.concatenate([c.categories for c in to_union])) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this really safe if [inline code lost in extraction]? That's why I thought we needed to use the Index.append() method.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, good point - I'll change that and add some tests. |
||
categories = Index(unique_cats) | ||
|
||
new_codes = [] | ||
for c in to_union: | ||
indexer = categories.get_indexer(c.categories) | ||
new_codes.append(indexer.take(c.codes)) | ||
codes = np.concatenate(new_codes) | ||
return Categorical(codes, categories=categories, ordered=False, | ||
fastpath=True) | ||
|
||
|
||
def _concat_datetime(to_concat, axis=0, typs=None): | ||
""" | ||
provide concatenation of an datetimelike array of arrays each of which is a | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -963,12 +963,17 @@ def assertNotIsInstance(obj, cls, msg=''): | |
|
||
|
||
def assert_categorical_equal(left, right, check_dtype=True, | ||
obj='Categorical'): | ||
obj='Categorical', ignore_order=False): | ||
assertIsInstance(left, pd.Categorical, '[Categorical] ') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a docstring? |
||
assertIsInstance(right, pd.Categorical, '[Categorical] ') | ||
|
||
assert_index_equal(left.categories, right.categories, | ||
obj='{0}.categories'.format(obj)) | ||
if ignore_order: | ||
assert_index_equal(left.categories.sort_values(), | ||
right.categories.sort_values(), | ||
obj='{0}.categories'.format(obj)) | ||
else: | ||
assert_index_equal(left.categories, right.categories, | ||
obj='{0}.categories'.format(obj)) | ||
assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype, | ||
obj='{0}.codes'.format(obj)) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
versionadded tag here