Skip to content

Commit c5166b6

Browse files
topper-123 authored and jreback committed
REF: clearer Categorical/CategoricalIndex construction (#24419)
1 parent a8f97c1 commit c5166b6

File tree

6 files changed

+148
-76
lines changed

6 files changed

+148
-76
lines changed

pandas/core/arrays/categorical.py

Lines changed: 12 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -316,50 +316,19 @@ class Categorical(ExtensionArray, PandasObject):
316316
def __init__(self, values, categories=None, ordered=None, dtype=None,
317317
fastpath=False):
318318

319-
# Ways of specifying the dtype (prioritized ordered)
320-
# 1. dtype is a CategoricalDtype
321-
# a.) with known categories, use dtype.categories
322-
# b.) else with Categorical values, use values.dtype
323-
# c.) else, infer from values
324-
# d.) specifying dtype=CategoricalDtype and categories is an error
325-
# 2. dtype is a string 'category'
326-
# a.) use categories, ordered
327-
# b.) use values.dtype
328-
# c.) infer from values
329-
# 3. dtype is None
330-
# a.) use categories, ordered
331-
# b.) use values.dtype
332-
# c.) infer from values
333-
if dtype is not None:
334-
# The dtype argument takes precedence over values.dtype (if any)
335-
if isinstance(dtype, compat.string_types):
336-
if dtype == 'category':
337-
dtype = CategoricalDtype(categories, ordered)
338-
else:
339-
msg = "Unknown `dtype` {dtype}"
340-
raise ValueError(msg.format(dtype=dtype))
341-
elif categories is not None or ordered is not None:
342-
raise ValueError("Cannot specify both `dtype` and `categories`"
343-
" or `ordered`.")
344-
elif is_categorical(values):
345-
# If no "dtype" was passed, use the one from "values", but honor
346-
# the "ordered" and "categories" arguments
347-
dtype = values.dtype._from_categorical_dtype(values.dtype,
348-
categories, ordered)
319+
dtype = CategoricalDtype._from_values_or_dtype(values, categories,
320+
ordered, dtype)
321+
# At this point, dtype is always a CategoricalDtype, but
322+
# we may have dtype.categories be None, and we need to
323+
# infer categories in a factorization step further below
349324

325+
if is_categorical(values):
350326
# GH23814, for perf, if values._values already an instance of
351327
# Categorical, set values to codes, and run fastpath
352328
if (isinstance(values, (ABCSeries, ABCIndexClass)) and
353329
isinstance(values._values, type(self))):
354330
values = values._values.codes.copy()
355331
fastpath = True
356-
else:
357-
# If dtype=None and values is not categorical, create a new dtype
358-
dtype = CategoricalDtype(categories, ordered)
359-
360-
# At this point, dtype is always a CategoricalDtype and you should not
361-
# use categories and ordered separately.
362-
# if dtype.categories is None, we are inferring
363332

364333
if fastpath:
365334
self._codes = coerce_indexer_dtype(values, dtype.categories)
@@ -656,6 +625,9 @@ def from_codes(cls, codes, categories, ordered=False):
656625
categorical. If not given, the resulting categorical will be
657626
unordered.
658627
"""
628+
dtype = CategoricalDtype._from_values_or_dtype(codes, categories,
629+
ordered)
630+
659631
codes = np.asarray(codes) # #21767
660632
if not is_integer_dtype(codes):
661633
msg = "codes need to be array-like integers"
@@ -675,14 +647,12 @@ def from_codes(cls, codes, categories, ordered=False):
675647
raise ValueError(
676648
"codes need to be convertible to an arrays of integers")
677649

678-
categories = CategoricalDtype.validate_categories(categories)
679-
680-
if len(codes) and (codes.max() >= len(categories) or codes.min() < -1):
650+
if len(codes) and (
651+
codes.max() >= len(dtype.categories) or codes.min() < -1):
681652
raise ValueError("codes need to be between -1 and "
682653
"len(categories)-1")
683654

684-
return cls(codes, categories=categories, ordered=ordered,
685-
fastpath=True)
655+
return cls(codes, dtype=dtype, fastpath=True)
686656

687657
_codes = None
688658

pandas/core/dtypes/dtypes.py

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from pandas import compat
1414

1515
from .base import ExtensionDtype, _DtypeOpsMixin
16+
from .inference import is_list_like
1617

1718

1819
def register_extension_dtype(cls):
@@ -240,6 +241,90 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None):
240241
ordered = dtype.ordered
241242
return cls(categories, ordered)
242243

244+
@classmethod
245+
def _from_values_or_dtype(cls, values=None, categories=None, ordered=None,
246+
dtype=None):
247+
"""
248+
Construct dtype from the input parameters used in :class:`Categorical`.
249+
250+
This constructor method specifically does not do the factorization
251+
step, if that is needed to find the categories. This constructor may
252+
therefore return ``CategoricalDtype(categories=None, ordered=None)``,
253+
which may not be useful. Additional steps may therefore have to be
254+
taken to create the final dtype.
255+
256+
The return dtype is specified from the inputs in this prioritized
257+
order:
258+
1. if dtype is a CategoricalDtype, return dtype
259+
2. if dtype is the string 'category', create a CategoricalDtype from
260+
the supplied categories and ordered parameters, and return that.
261+
3. if values is a categorical, use values.dtype, but override it with
262+
categories and ordered if either/both of those are not None.
263+
4. if dtype is None and values is not a categorical, construct the
264+
dtype from categories and ordered, even if either of those is None.
265+
266+
Parameters
267+
----------
268+
values : list-like, optional
269+
The list-like must be 1-dimensional.
270+
categories : list-like, optional
271+
Categories for the CategoricalDtype.
272+
ordered : bool, optional
273+
Designating if the categories are ordered.
274+
dtype : CategoricalDtype or the string "category", optional
275+
If ``CategoricalDtype``, cannot be used together with
276+
`categories` or `ordered`.
277+
278+
Returns
279+
-------
280+
CategoricalDtype
281+
282+
Examples
283+
--------
284+
>>> CategoricalDtype._from_values_or_dtype()
285+
CategoricalDtype(categories=None, ordered=None)
286+
>>> CategoricalDtype._from_values_or_dtype(categories=['a', 'b'],
287+
... ordered=True)
288+
CategoricalDtype(categories=['a', 'b'], ordered=True)
289+
>>> dtype1 = CategoricalDtype(['a', 'b'], ordered=True)
290+
>>> dtype2 = CategoricalDtype(['x', 'y'], ordered=False)
291+
>>> c = Categorical([0, 1], dtype=dtype1, fastpath=True)
292+
>>> CategoricalDtype._from_values_or_dtype(c, ['x', 'y'], ordered=True,
293+
... dtype=dtype2)
294+
ValueError: Cannot specify `categories` or `ordered` together with
295+
`dtype`.
296+
297+
The supplied dtype takes precedence over values' dtype:
298+
299+
>>> CategoricalDtype._from_values_or_dtype(c, dtype=dtype2)
300+
CategoricalDtype(categories=['x', 'y'], ordered=False)
301+
"""
302+
from pandas.core.dtypes.common import is_categorical
303+
304+
if dtype is not None:
305+
# The dtype argument takes precedence over values.dtype (if any)
306+
if isinstance(dtype, compat.string_types):
307+
if dtype == 'category':
308+
dtype = CategoricalDtype(categories, ordered)
309+
else:
310+
msg = "Unknown dtype {dtype!r}"
311+
raise ValueError(msg.format(dtype=dtype))
312+
elif categories is not None or ordered is not None:
313+
raise ValueError("Cannot specify `categories` or `ordered` "
314+
"together with `dtype`.")
315+
elif is_categorical(values):
316+
# If no "dtype" was passed, use the one from "values", but honor
317+
# the "ordered" and "categories" arguments
318+
dtype = values.dtype._from_categorical_dtype(values.dtype,
319+
categories, ordered)
320+
else:
321+
# If dtype=None and values is not categorical, create a new dtype.
322+
# Note: This could potentially have categories=None and
323+
# ordered=None.
324+
dtype = CategoricalDtype(categories, ordered)
325+
326+
return dtype
327+
243328
def _finalize(self, categories, ordered, fastpath=False):
244329

245330
if ordered is not None:
@@ -408,7 +493,10 @@ def validate_categories(categories, fastpath=False):
408493
"""
409494
from pandas import Index
410495

411-
if not isinstance(categories, ABCIndexClass):
496+
if not fastpath and not is_list_like(categories):
497+
msg = "Parameter 'categories' must be list-like, was {!r}"
498+
raise TypeError(msg.format(categories))
499+
elif not isinstance(categories, ABCIndexClass):
412500
categories = Index(categories, tupleize_cols=False)
413501

414502
if not fastpath:

pandas/core/indexes/category.py

Lines changed: 13 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -107,29 +107,23 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
107107
if fastpath:
108108
return cls._simple_new(data, name=name, dtype=dtype)
109109

110+
dtype = CategoricalDtype._from_values_or_dtype(data, categories,
111+
ordered, dtype)
112+
110113
if name is None and hasattr(data, 'name'):
111114
name = data.name
112115

113-
if isinstance(data, ABCCategorical):
114-
data = cls._create_categorical(data, categories, ordered,
115-
dtype)
116-
elif isinstance(data, CategoricalIndex):
117-
data = data._data
118-
data = cls._create_categorical(data, categories, ordered,
119-
dtype)
120-
else:
121-
116+
if not is_categorical_dtype(data):
122117
# don't allow scalars
123118
# if data is None, then categories must be provided
124119
if is_scalar(data):
125120
if data is not None or categories is None:
126121
cls._scalar_data_error(data)
127122
data = []
128-
data = cls._create_categorical(data, categories, ordered,
129-
dtype)
130123

131-
if copy:
132-
data = data.copy()
124+
data = cls._create_categorical(data, dtype=dtype)
125+
126+
data = data.copy() if copy else data
133127

134128
return cls._simple_new(data, name=name)
135129

@@ -159,8 +153,7 @@ def _create_from_codes(self, codes, dtype=None, name=None):
159153
return CategoricalIndex(cat, name=name)
160154

161155
@classmethod
162-
def _create_categorical(cls, data, categories=None, ordered=None,
163-
dtype=None):
156+
def _create_categorical(cls, data, dtype=None):
164157
"""
165158
*this is an internal non-public method*
166159
@@ -169,8 +162,6 @@ def _create_categorical(cls, data, categories=None, ordered=None,
169162
Parameters
170163
----------
171164
data : data for new Categorical
172-
categories : optional categories, defaults to existing
173-
ordered : optional ordered attribute, defaults to existing
174165
dtype : CategoricalDtype, defaults to existing
175166
176167
Returns
@@ -182,18 +173,11 @@ def _create_categorical(cls, data, categories=None, ordered=None,
182173
data = data.values
183174

184175
if not isinstance(data, ABCCategorical):
185-
if ordered is None and dtype is None:
186-
ordered = False
187-
data = Categorical(data, categories=categories, ordered=ordered,
188-
dtype=dtype)
189-
else:
190-
if categories is not None:
191-
data = data.set_categories(categories, ordered=ordered)
192-
elif ordered is not None and ordered != data.ordered:
193-
data = data.set_ordered(ordered)
194-
if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
195-
# we want to silently ignore dtype='category'
196-
data = data._set_dtype(dtype)
176+
return Categorical(data, dtype=dtype)
177+
178+
if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
179+
# we want to silently ignore dtype='category'
180+
data = data._set_dtype(dtype)
197181
return data
198182

199183
@classmethod

pandas/tests/arrays/categorical/test_constructors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,7 @@ def test_constructor_str_category(self, categories, ordered):
368368
tm.assert_categorical_equal(result, expected)
369369

370370
def test_constructor_str_unknown(self):
371-
with pytest.raises(ValueError, match="Unknown `dtype`"):
371+
with pytest.raises(ValueError, match="Unknown dtype"):
372372
Categorical([1, 2], dtype="foo")
373373

374374
def test_constructor_from_categorical_with_dtype(self):

pandas/tests/dtypes/test_dtypes.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,10 +90,40 @@ def test_construction_from_string(self):
9090
TypeError, lambda: CategoricalDtype.construct_from_string('foo'))
9191

9292
def test_constructor_invalid(self):
93-
msg = "CategoricalIndex.* must be called"
93+
msg = "Parameter 'categories' must be list-like"
9494
with pytest.raises(TypeError, match=msg):
9595
CategoricalDtype("category")
9696

97+
dtype1 = CategoricalDtype(['a', 'b'], ordered=True)
98+
dtype2 = CategoricalDtype(['x', 'y'], ordered=False)
99+
c = Categorical([0, 1], dtype=dtype1, fastpath=True)
100+
101+
@pytest.mark.parametrize('values, categories, ordered, dtype, expected',
102+
[
103+
[None, None, None, None,
104+
CategoricalDtype()],
105+
[None, ['a', 'b'], True, None, dtype1],
106+
[c, None, None, dtype2, dtype2],
107+
[c, ['x', 'y'], False, None, dtype2],
108+
])
109+
def test_from_values_or_dtype(
110+
self, values, categories, ordered, dtype, expected):
111+
result = CategoricalDtype._from_values_or_dtype(values, categories,
112+
ordered, dtype)
113+
assert result == expected
114+
115+
@pytest.mark.parametrize('values, categories, ordered, dtype', [
116+
[None, ['a', 'b'], True, dtype2],
117+
[None, ['a', 'b'], None, dtype2],
118+
[None, None, True, dtype2],
119+
])
120+
def test_from_values_or_dtype_raises(self, values, categories,
121+
ordered, dtype):
122+
msg = "Cannot specify `categories` or `ordered` together with `dtype`."
123+
with pytest.raises(ValueError, match=msg):
124+
CategoricalDtype._from_values_or_dtype(values, categories,
125+
ordered, dtype)
126+
97127
def test_is_dtype(self):
98128
assert CategoricalDtype.is_dtype(self.dtype)
99129
assert CategoricalDtype.is_dtype('category')
@@ -706,7 +736,7 @@ def test_invalid_raises(self):
706736
with pytest.raises(TypeError, match='ordered'):
707737
CategoricalDtype(['a', 'b'], ordered='foo')
708738

709-
with pytest.raises(TypeError, match='collection'):
739+
with pytest.raises(TypeError, match="'categories' must be list-like"):
710740
CategoricalDtype('category')
711741

712742
def test_mixed(self):

pandas/tests/indexes/test_category.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ def test_construction_with_categorical_dtype(self):
158158
tm.assert_index_equal(result, expected, exact=True)
159159

160160
# error when combining categories/ordered and dtype kwargs
161-
msg = 'Cannot specify both `dtype` and `categories` or `ordered`.'
161+
msg = 'Cannot specify `categories` or `ordered` together with `dtype`.'
162162
with pytest.raises(ValueError, match=msg):
163163
CategoricalIndex(data, categories=cats, dtype=dtype)
164164

0 commit comments

Comments
 (0)