Skip to content

Commit 19e3d29

Browse files
committed
ENH: Categorical.unique can keep same dtype
1 parent 2863428 commit 19e3d29

File tree

4 files changed

+27
-64
lines changed

4 files changed

+27
-64
lines changed

doc/source/whatsnew/v1.2.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,7 @@ Categorical
523523
- :meth:`Categorical.fillna` will always return a copy, validate a passed fill value regardless of whether there are any NAs to fill, and disallow an ``NaT`` as a fill value for numeric categories (:issue:`36530`)
524524
- Bug in :meth:`Categorical.__setitem__` that incorrectly raised when trying to set a tuple value (:issue:`20439`)
525525
- Bug in :meth:`CategoricalIndex.equals` incorrectly casting non-category entries to ``np.nan`` (:issue:`37667`)
526+
- Bug in :meth:`Categorical.unique` where the dtype was changed if there were unused categories (:issue:`xxxxx`)
526527
- Bug in :meth:`CategoricalIndex.where` incorrectly setting non-category entries to ``np.nan`` instead of raising ``TypeError`` (:issue:`37977`)
527528
-
528529

pandas/core/arrays/categorical.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2078,16 +2078,8 @@ def unique(self):
20782078
"""
20792079
# unlike np.unique, unique1d does not sort
20802080
unique_codes = unique1d(self.codes)
2081-
cat = self.copy()
2082-
2083-
# keep nan in codes
2084-
cat._codes = unique_codes
2085-
2086-
# exclude nan from indexer for categories
2087-
take_codes = unique_codes[unique_codes != -1]
2088-
if self.ordered:
2089-
take_codes = np.sort(take_codes)
2090-
return cat.set_categories(cat.categories.take(take_codes))
2081+
cat = self._constructor(unique_codes, dtype=self.dtype, fastpath=True)
2082+
return cat
20912083

20922084
def _values_for_factorize(self):
20932085
return self._ndarray, -1

pandas/core/groupby/categorical.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,11 @@ def recode_for_groupby(
7272

7373
# sort=False should order groups in as-encountered order (GH-8868)
7474
cat = c.unique()
75+
# exclude nan from indexer for categories
76+
take_codes = cat.codes[cat.codes != -1]
77+
if cat.ordered:
78+
take_codes = np.sort(take_codes)
79+
cat = cat.set_categories(cat.categories.take(take_codes))
7580

7681
# But for groupby to work, all categories should be present,
7782
# including those missing from the data (GH-13179), which .unique()

pandas/tests/arrays/categorical/test_analytics.py

Lines changed: 19 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from pandas.compat import PYPY
88

9-
from pandas import Categorical, Index, NaT, Series, date_range
9+
from pandas import Categorical, CategoricalDtype, Index, NaT, Series, date_range
1010
import pandas._testing as tm
1111
from pandas.api.types import is_scalar
1212

@@ -188,84 +188,49 @@ def test_searchsorted(self, ordered):
188188
with pytest.raises(KeyError, match="cucumber"):
189189
ser.searchsorted(["bread", "cucumber"])
190190

191-
def test_unique(self):
191+
def test_unique(self, ordered):
192+
# GHXXXXX
193+
dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered)
194+
192195
# categories are reordered based on value when ordered=False
193-
cat = Categorical(["a", "b"])
194-
exp = Index(["a", "b"])
196+
cat = Categorical(["a", "b", "c"], dtype=dtype)
195197
res = cat.unique()
196-
tm.assert_index_equal(res.categories, exp)
197198
tm.assert_categorical_equal(res, cat)
198199

199-
cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"])
200+
cat = Categorical(["a", "b", "a", "a"], dtype=dtype)
200201
res = cat.unique()
201-
tm.assert_index_equal(res.categories, exp)
202-
tm.assert_categorical_equal(res, Categorical(exp))
202+
tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype))
203203

204-
cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"])
205-
exp = Index(["c", "a", "b"])
204+
cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype)
206205
res = cat.unique()
207-
tm.assert_index_equal(res.categories, exp)
208-
exp_cat = Categorical(exp, categories=["c", "a", "b"])
206+
exp_cat = Categorical(["c", "a", "b"], dtype=dtype)
209207
tm.assert_categorical_equal(res, exp_cat)
210208

211209
# nan must be removed
212-
cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"])
213-
res = cat.unique()
214-
exp = Index(["b", "a"])
215-
tm.assert_index_equal(res.categories, exp)
216-
exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"])
217-
tm.assert_categorical_equal(res, exp_cat)
218-
219-
def test_unique_ordered(self):
220-
# keep categories order when ordered=True
221-
cat = Categorical(["b", "a", "b"], categories=["a", "b"], ordered=True)
210+
cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype)
222211
res = cat.unique()
223-
exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True)
212+
exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype)
224213
tm.assert_categorical_equal(res, exp_cat)
225214

226-
cat = Categorical(
227-
["c", "b", "a", "a"], categories=["a", "b", "c"], ordered=True
228-
)
229-
res = cat.unique()
230-
exp_cat = Categorical(["c", "b", "a"], categories=["a", "b", "c"], ordered=True)
231-
tm.assert_categorical_equal(res, exp_cat)
232-
233-
cat = Categorical(["b", "a", "a"], categories=["a", "b", "c"], ordered=True)
234-
res = cat.unique()
235-
exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True)
236-
tm.assert_categorical_equal(res, exp_cat)
215+
def test_unique_index_series(self, ordered):
216+
# GHXXXXX
217+
dtype = CategoricalDtype([3, 2, 1], ordered=ordered)
237218

238-
cat = Categorical(
239-
["b", "b", np.nan, "a"], categories=["a", "b", "c"], ordered=True
240-
)
241-
res = cat.unique()
242-
exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True)
243-
tm.assert_categorical_equal(res, exp_cat)
244-
245-
def test_unique_index_series(self):
246-
c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1])
219+
c = Categorical([3, 1, 2, 2, 1], dtype=dtype)
247220
# Categorical.unique sorts categories by appearance order
248221
# if ordered=False
249-
exp = Categorical([3, 1, 2], categories=[3, 1, 2])
222+
exp = Categorical([3, 1, 2], dtype=dtype)
250223
tm.assert_categorical_equal(c.unique(), exp)
251224

252225
tm.assert_index_equal(Index(c).unique(), Index(exp))
253226
tm.assert_categorical_equal(Series(c).unique(), exp)
254227

255-
c = Categorical([1, 1, 2, 2], categories=[3, 2, 1])
256-
exp = Categorical([1, 2], categories=[1, 2])
228+
c = Categorical([1, 1, 2, 2], dtype=dtype)
229+
exp = Categorical([1, 2], dtype=dtype)
257230
tm.assert_categorical_equal(c.unique(), exp)
258231
tm.assert_index_equal(Index(c).unique(), Index(exp))
259232
tm.assert_categorical_equal(Series(c).unique(), exp)
260233

261-
c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True)
262-
# Categorical.unique keeps categories order if ordered=True
263-
exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True)
264-
tm.assert_categorical_equal(c.unique(), exp)
265-
266-
tm.assert_index_equal(Index(c).unique(), Index(exp))
267-
tm.assert_categorical_equal(Series(c).unique(), exp)
268-
269234
def test_shift(self):
270235
# GH 9416
271236
cat = Categorical(["a", "b", "c", "d", "a"])

0 commit comments

Comments
 (0)