Skip to content

Commit 8f65ded

Browse files
committed
REF: clearer construction of Categorical/CategoricalIndex
1 parent a7b4c65 commit 8f65ded

File tree

5 files changed

+84
-76
lines changed

5 files changed

+84
-76
lines changed

pandas/core/arrays/categorical.py

Lines changed: 66 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,62 @@ def contains(cat, key, container):
200200
return any(loc_ in container for loc_ in loc)
201201

202202

203+
def create_categorical_dtype(values=None, categories=None, ordered=None,
                             dtype=None):
    """
    Construct a :class:`CategoricalDtype` from :class:`Categorical` inputs.

    The resulting dtype is resolved in this order:

    1. If `dtype` is the string ``'category'``, a new dtype is built from
       `categories` and `ordered`.
    2. If `dtype` is any other non-string object, it is returned unchanged;
       combining it with `categories` or `ordered` is an error.
    3. If `dtype` is None and `values` is categorical, the dtype of
       `values` is used, updated with `categories` and `ordered`.
    4. Otherwise a new dtype is built from `categories` and `ordered`
       (both of which may still be None).

    Parameters
    ----------
    values : array-like or Categorical, (1-dimensional), optional
        Only consulted when `dtype` is None.
    categories : list-like, optional
        Categories for the CategoricalDtype.
    ordered : bool, optional
        Designates whether the categories are ordered.
    dtype : CategoricalDtype or the string 'category', optional
        A CategoricalDtype cannot be used in combination with `categories`
        or `ordered`.

    Returns
    -------
    CategoricalDtype

    Raises
    ------
    ValueError
        If `dtype` is a string other than ``'category'``, or if `dtype` is
        not a string and `categories` or `ordered` is also specified.

    Examples
    --------
    >>> create_categorical_dtype()
    CategoricalDtype(categories=None, ordered=None)
    >>> create_categorical_dtype(categories=['a', 'b'], ordered=True)
    CategoricalDtype(categories=['a', 'b'], ordered=True)

    An explicit dtype cannot be mixed with `categories` or `ordered`:

    >>> dtype1 = CategoricalDtype(['a', 'b'], ordered=True)
    >>> dtype2 = CategoricalDtype(['x', 'y'], ordered=False)
    >>> c = Categorical([0, 1], dtype=dtype1, fastpath=True)
    >>> create_categorical_dtype(c, ['x', 'y'], True, dtype=dtype2)
    Traceback (most recent call last):
        ...
    ValueError: Cannot specify `categories` or `ordered` together with `dtype`.

    The supplied dtype takes precedence over the dtype of `values`:

    >>> create_categorical_dtype(c, dtype=dtype2)
    CategoricalDtype(categories=['x', 'y'], ordered=False)
    """
    if dtype is None:
        if is_categorical(values):
            # No "dtype" was passed: start from the one on "values", but
            # honor the "ordered" and "categories" arguments.
            return values.dtype._from_categorical_dtype(values.dtype,
                                                        categories, ordered)
        # Not categorical input: create a new dtype.
        # Note: This could potentially have categories=None and ordered=None.
        return CategoricalDtype(categories, ordered)

    # From here on the dtype argument takes precedence over values.dtype.
    if isinstance(dtype, compat.string_types):
        if dtype != 'category':
            msg = "Unknown dtype {dtype!r}"
            raise ValueError(msg.format(dtype=dtype))
        return CategoricalDtype(categories, ordered)

    if categories is not None or ordered is not None:
        raise ValueError("Cannot specify `categories` or `ordered` "
                         "together with `dtype`.")

    # A non-string dtype is passed through as given.
    return dtype
257+
258+
203259
_codes_doc = """\
204260
The category codes of this categorical.
205261
@@ -316,50 +372,18 @@ class Categorical(ExtensionArray, PandasObject):
316372
def __init__(self, values, categories=None, ordered=None, dtype=None,
317373
fastpath=False):
318374

319-
# Ways of specifying the dtype (prioritized ordered)
320-
# 1. dtype is a CategoricalDtype
321-
# a.) with known categories, use dtype.categories
322-
# b.) else with Categorical values, use values.dtype
323-
# c.) else, infer from values
324-
# d.) specifying dtype=CategoricalDtype and categories is an error
325-
# 2. dtype is a string 'category'
326-
# a.) use categories, ordered
327-
# b.) use values.dtype
328-
# c.) infer from values
329-
# 3. dtype is None
330-
# a.) use categories, ordered
331-
# b.) use values.dtype
332-
# c.) infer from values
333-
if dtype is not None:
334-
# The dtype argument takes precedence over values.dtype (if any)
335-
if isinstance(dtype, compat.string_types):
336-
if dtype == 'category':
337-
dtype = CategoricalDtype(categories, ordered)
338-
else:
339-
msg = "Unknown `dtype` {dtype}"
340-
raise ValueError(msg.format(dtype=dtype))
341-
elif categories is not None or ordered is not None:
342-
raise ValueError("Cannot specify both `dtype` and `categories`"
343-
" or `ordered`.")
344-
elif is_categorical(values):
345-
# If no "dtype" was passed, use the one from "values", but honor
346-
# the "ordered" and "categories" arguments
347-
dtype = values.dtype._from_categorical_dtype(values.dtype,
348-
categories, ordered)
375+
dtype = create_categorical_dtype(values, categories, ordered, dtype)
376+
# At this point, dtype is always a CategoricalDtype, but
377+
# we may have dtype.categories be None, and we need to
378+
# infer categories in a factorization step further below
349379

380+
if is_categorical(values):
350381
# GH23814, for perf, if values._values already an instance of
351382
# Categorical, set values to codes, and run fastpath
352383
if (isinstance(values, (ABCSeries, ABCIndexClass)) and
353384
isinstance(values._values, type(self))):
354385
values = values._values.codes.copy()
355386
fastpath = True
356-
else:
357-
# If dtype=None and values is not categorical, create a new dtype
358-
dtype = CategoricalDtype(categories, ordered)
359-
360-
# At this point, dtype is always a CategoricalDtype and you should not
361-
# use categories and ordered seperately.
362-
# if dtype.categories is None, we are inferring
363387

364388
if fastpath:
365389
self._codes = coerce_indexer_dtype(values, dtype.categories)
@@ -656,6 +680,8 @@ def from_codes(cls, codes, categories, ordered=False):
656680
categorical. If not given, the resulting categorical will be
657681
unordered.
658682
"""
683+
dtype = create_categorical_dtype(codes, categories, ordered)
684+
659685
codes = np.asarray(codes) # #21767
660686
if not is_integer_dtype(codes):
661687
msg = "codes need to be array-like integers"
@@ -675,14 +701,12 @@ def from_codes(cls, codes, categories, ordered=False):
675701
raise ValueError(
676702
"codes need to be convertible to an arrays of integers")
677703

678-
categories = CategoricalDtype.validate_categories(categories)
679-
680-
if len(codes) and (codes.max() >= len(categories) or codes.min() < -1):
704+
if len(codes) and (
705+
codes.max() >= len(dtype.categories) or codes.min() < -1):
681706
raise ValueError("codes need to be between -1 and "
682707
"len(categories)-1")
683708

684-
return cls(codes, categories=categories, ordered=ordered,
685-
fastpath=True)
709+
return cls(codes, dtype=dtype, fastpath=True)
686710

687711
_codes = None
688712

pandas/core/indexes/category.py

Lines changed: 14 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717

1818
from pandas.core import accessor
1919
from pandas.core.algorithms import take_1d
20-
from pandas.core.arrays.categorical import Categorical, contains
20+
from pandas.core.arrays.categorical import (
21+
Categorical, contains, create_categorical_dtype)
2122
import pandas.core.common as com
2223
from pandas.core.config import get_option
2324
import pandas.core.indexes.base as ibase
@@ -107,29 +108,22 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
107108
if fastpath:
108109
return cls._simple_new(data, name=name, dtype=dtype)
109110

111+
dtype = create_categorical_dtype(data, categories, ordered, dtype)
112+
110113
if name is None and hasattr(data, 'name'):
111114
name = data.name
112115

113-
if isinstance(data, ABCCategorical):
114-
data = cls._create_categorical(data, categories, ordered,
115-
dtype)
116-
elif isinstance(data, CategoricalIndex):
117-
data = data._data
118-
data = cls._create_categorical(data, categories, ordered,
119-
dtype)
120-
else:
121-
116+
if not is_categorical_dtype(data):
122117
# don't allow scalars
123118
# if data is None, then categories must be provided
124119
if is_scalar(data):
125120
if data is not None or categories is None:
126121
cls._scalar_data_error(data)
127122
data = []
128-
data = cls._create_categorical(data, categories, ordered,
129-
dtype)
130123

131-
if copy:
132-
data = data.copy()
124+
data = cls._create_categorical(data, dtype=dtype)
125+
126+
data = data.copy() if copy else data
133127

134128
return cls._simple_new(data, name=name)
135129

@@ -159,8 +153,7 @@ def _create_from_codes(self, codes, dtype=None, name=None):
159153
return CategoricalIndex(cat, name=name)
160154

161155
@classmethod
162-
def _create_categorical(cls, data, categories=None, ordered=None,
163-
dtype=None):
156+
def _create_categorical(cls, data, dtype=None):
164157
"""
165158
*this is an internal non-public method*
166159
@@ -169,8 +162,6 @@ def _create_categorical(cls, data, categories=None, ordered=None,
169162
Parameters
170163
----------
171164
data : data for new Categorical
172-
categories : optional categories, defaults to existing
173-
ordered : optional ordered attribute, defaults to existing
174165
dtype : CategoricalDtype, defaults to existing
175166
176167
Returns
@@ -182,18 +173,11 @@ def _create_categorical(cls, data, categories=None, ordered=None,
182173
data = data.values
183174

184175
if not isinstance(data, ABCCategorical):
185-
if ordered is None and dtype is None:
186-
ordered = False
187-
data = Categorical(data, categories=categories, ordered=ordered,
188-
dtype=dtype)
189-
else:
190-
if categories is not None:
191-
data = data.set_categories(categories, ordered=ordered)
192-
elif ordered is not None and ordered != data.ordered:
193-
data = data.set_ordered(ordered)
194-
if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
195-
# we want to silently ignore dtype='category'
196-
data = data._set_dtype(dtype)
176+
return Categorical(data, dtype=dtype)
177+
178+
if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
179+
# we want to silently ignore dtype='category'
180+
data = data._set_dtype(dtype)
197181
return data
198182

199183
@classmethod

pandas/tests/arrays/categorical/test_constructors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,7 @@ def test_constructor_str_category(self, categories, ordered):
368368
tm.assert_categorical_equal(result, expected)
369369

370370
def test_constructor_str_unknown(self):
371-
with pytest.raises(ValueError, match="Unknown `dtype`"):
371+
with pytest.raises(ValueError, match="Unknown dtype"):
372372
Categorical([1, 2], dtype="foo")
373373

374374
def test_constructor_from_categorical_with_dtype(self):

pandas/tests/dtypes/test_dtypes.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def test_construction_from_string(self):
9494
TypeError, lambda: CategoricalDtype.construct_from_string('foo'))
9595

9696
def test_constructor_invalid(self):
97-
msg = "categories must be list-like"
97+
msg = "Parameter 'categories' must be list-like"
9898
with pytest.raises(TypeError, match=msg):
9999
CategoricalDtype("category")
100100

@@ -710,7 +710,7 @@ def test_invalid_raises(self):
710710
with pytest.raises(TypeError, match='ordered'):
711711
CategoricalDtype(['a', 'b'], ordered='foo')
712712

713-
with pytest.raises(TypeError, match='categories must be list-like'):
713+
with pytest.raises(TypeError, match="'categories' must be list-like"):
714714
CategoricalDtype('category')
715715

716716
def test_mixed(self):

pandas/tests/indexes/test_category.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ def test_construction_with_categorical_dtype(self):
156156
tm.assert_index_equal(result, expected, exact=True)
157157

158158
# error when combining categories/ordered and dtype kwargs
159-
msg = 'Cannot specify both `dtype` and `categories` or `ordered`.'
159+
msg = 'Cannot specify `categories` or `ordered` together with `dtype`.'
160160
with pytest.raises(ValueError, match=msg):
161161
CategoricalIndex(data, categories=cats, dtype=dtype)
162162

0 commit comments

Comments
 (0)