Skip to content

Commit 222b106

Browse files
committed
ENH: Implement DataFrame.astype('category')
1 parent ca737ac commit 222b106

File tree

4 files changed

+161
-36
lines changed

4 files changed

+161
-36
lines changed

doc/source/categorical.rst

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,16 @@ The categorical data type is useful in the following cases:
4545

4646
See also the :ref:`API docs on categoricals<api.categorical>`.
4747

48+
.. _categorical.objectcreation:
49+
4850
Object Creation
4951
---------------
5052

53+
.. _categorical.objectcreation.series:
54+
55+
Creating categories from a ``Series``
56+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
57+
5158
Categorical `Series` or columns in a `DataFrame` can be created in several ways:
5259

5360
By specifying ``dtype="category"`` when constructing a `Series`:
@@ -143,6 +150,55 @@ constructor to save the factorize step during normal constructor mode:
143150
splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
144151
s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))
145152
153+
.. _categorical.objectcreation.frame:
154+
155+
Creating categories from a ``DataFrame``
156+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
157+
158+
.. versionadded:: 0.22.0
159+
160+
:meth:`DataFrame.astype` supports simultaneously setting multiple columns as categorical. When setting multiple
161+
columns as categorical, by default each column's dtype will contain categories for all labels present in all columns, even
162+
if a column does not contain all labels:
163+
164+
.. ipython:: python
165+
166+
df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']})
167+
df = df.astype('category')
168+
df
169+
df['A'].dtype
170+
df['B'].dtype
171+
172+
Note that this behavior is different from instantiating a ``DataFrame`` with categorical dtype, which will only assign
173+
categories to each column based on the labels present in each column:
174+
175+
.. ipython:: python
176+
177+
df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']}, dtype='category')
178+
df['A'].dtype
179+
df['B'].dtype
180+
181+
When using ``astype``, you can control the categories that will be present in each column by passing
182+
a ``CategoricalDtype``:
183+
184+
.. ipython:: python
185+
186+
df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']})
187+
dtype = CategoricalDtype(categories=list('abdef'), ordered=True)
188+
df = df.astype(dtype)
189+
df
190+
df['A'].dtype
191+
df['B'].dtype
192+
193+
Use subselection if you only want to convert certain columns to categorical. The same behaviors previously
194+
discussed hold with subselection.
195+
196+
.. ipython:: python
197+
198+
df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e'], 'C': ['x', 'y', 'z']})
199+
df[['A', 'B']] = df[['A', 'B']].astype('category')
200+
df.dtypes
201+
146202
.. _categorical.categoricaldtype:
147203

148204
CategoricalDtype

doc/source/whatsnew/v0.22.0.txt

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,25 @@ version.
1313
New features
1414
~~~~~~~~~~~~
1515

16-
-
17-
-
18-
-
16+
.. _whatsnew_0220.enhancements.astype_category:
17+
18+
``DataFrame.astype`` now supports categoricals
19+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20+
21+
:meth:`DataFrame.astype` now supports simultaneously setting multiple columns as categorical (:issue:`12860`).
22+
23+
When setting multiple columns as categorical, by default each column's dtype will contain categories for all
24+
labels present in all columns, even if a column does not contain all labels:
25+
26+
.. ipython:: python
27+
28+
df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']})
29+
df = df.astype('category')
30+
df
31+
df['A'].dtype
32+
df['B'].dtype
33+
34+
See the :ref:`categorical.objectcreation.frame` section of the documentation for more details and examples.
1935

2036
.. _whatsnew_0220.enhancements.other:
2137

pandas/core/generic.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,16 @@
1818
is_number,
1919
is_integer, is_bool,
2020
is_bool_dtype,
21+
is_categorical_dtype,
2122
is_numeric_dtype,
2223
is_datetime64_dtype,
2324
is_timedelta64_dtype,
2425
is_datetime64tz_dtype,
2526
is_list_like,
2627
is_dict_like,
2728
is_re_compilable,
28-
pandas_dtype)
29+
pandas_dtype,
30+
CategoricalDtype)
2931
from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask
3032
from pandas.core.dtypes.missing import isna, notna
3133
from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame
@@ -3973,14 +3975,30 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
39733975
if col_name not in self:
39743976
raise KeyError('Only a column name can be used for the '
39753977
'key in a dtype mappings argument.')
3976-
from pandas import concat
39773978
results = []
39783979
for col_name, col in self.iteritems():
39793980
if col_name in dtype:
39803981
results.append(col.astype(dtype[col_name], copy=copy))
39813982
else:
39823983
results.append(results.append(col.copy() if copy else col))
3983-
return concat(results, axis=1, copy=False)
3984+
return pd.concat(results, axis=1, copy=False)
3985+
3986+
elif is_categorical_dtype(dtype) and self.ndim > 1:
3987+
# GH 12860
3988+
dtype_with_cat = (isinstance(dtype, CategoricalDtype) and
3989+
dtype.categories is not None)
3990+
if not dtype_with_cat:
3991+
categories = kwargs.get('categories', None)
3992+
ordered = (kwargs.get('ordered', None) or
3993+
getattr(dtype, 'ordered', None))
3994+
3995+
if categories is None:
3996+
categories = algos.unique(self.values.ravel(order='F'))
3997+
3998+
dtype = CategoricalDtype(categories, ordered)
3999+
4000+
results = (self[col].astype(dtype, copy=copy) for col in self)
4001+
return pd.concat(results, axis=1, copy=False)
39844002

39854003
# else, only a single dtype is given
39864004
new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,

pandas/tests/test_categorical.py

Lines changed: 65 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2176,51 +2176,86 @@ def test_basic(self):
21762176
result = x.person_name.loc[0]
21772177
assert result == expected
21782178

2179-
def test_creation_astype(self):
2180-
l = ["a", "b", "c", "a"]
2181-
s = pd.Series(l)
2182-
exp = pd.Series(Categorical(l))
2183-
res = s.astype('category')
2179+
def test_series_creation_astype(self):
2180+
labels = list('abca')
2181+
exp = Series(Categorical(labels))
2182+
res = Series(labels).astype('category')
21842183
tm.assert_series_equal(res, exp)
21852184

2186-
l = [1, 2, 3, 1]
2187-
s = pd.Series(l)
2188-
exp = pd.Series(Categorical(l))
2189-
res = s.astype('category')
2185+
labels = [1, 2, 3, 1]
2186+
exp = Series(Categorical(labels))
2187+
res = Series(labels).astype('category')
21902188
tm.assert_series_equal(res, exp)
21912189

2192-
df = pd.DataFrame({"cats": [1, 2, 3, 4, 5, 6],
2193-
"vals": [1, 2, 3, 4, 5, 6]})
2194-
cats = Categorical([1, 2, 3, 4, 5, 6])
2195-
exp_df = pd.DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
2196-
df["cats"] = df["cats"].astype("category")
2197-
tm.assert_frame_equal(exp_df, df)
2190+
labels_int = [1, 2, 3, 4, 5, 6]
2191+
exp = DataFrame({"cats": Categorical(labels_int), "vals": labels_int})
2192+
res = DataFrame({"cats": labels_int, "vals": labels_int})
2193+
res["cats"] = res["cats"].astype("category")
2194+
tm.assert_frame_equal(res, exp)
21982195

2199-
df = pd.DataFrame({"cats": ['a', 'b', 'b', 'a', 'a', 'd'],
2200-
"vals": [1, 2, 3, 4, 5, 6]})
2201-
cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
2202-
exp_df = pd.DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
2203-
df["cats"] = df["cats"].astype("category")
2204-
tm.assert_frame_equal(exp_df, df)
2196+
labels_str = list('abbaad')
2197+
exp = DataFrame({"cats": Categorical(labels_str), "vals": labels_int})
2198+
res = DataFrame({"cats": labels_str, "vals": labels_int})
2199+
res["cats"] = res["cats"].astype("category")
2200+
tm.assert_frame_equal(res, exp)
22052201

22062202
# with keywords
2207-
l = ["a", "b", "c", "a"]
2208-
s = pd.Series(l)
2209-
exp = pd.Series(Categorical(l, ordered=True))
2203+
labels = list('abca')
2204+
s = Series(labels)
2205+
exp = Series(Categorical(labels, ordered=True))
22102206
res = s.astype(CategoricalDtype(None, ordered=True))
22112207
tm.assert_series_equal(res, exp)
22122208

2213-
exp = pd.Series(Categorical(
2214-
l, categories=list('abcdef'), ordered=True))
2215-
res = s.astype(CategoricalDtype(list('abcdef'), ordered=True))
2209+
cats = list('abcdef')
2210+
exp = Series(Categorical(labels, categories=cats, ordered=True))
2211+
res = s.astype(CategoricalDtype(cats, ordered=True))
22162212
tm.assert_series_equal(res, exp)
22172213

2214+
def test_frame_creation_astype(self):
2215+
# GH 12860
2216+
cats = list('abcde')
2217+
x = Categorical(list('abcd'), categories=cats)
2218+
y = Categorical(list('bcde'), categories=cats)
2219+
exp = DataFrame({'x': x, 'y': y})
2220+
2221+
data = {'x': list('abcd'), 'y': list('bcde')}
2222+
res = DataFrame(data).astype('category')
2223+
tm.assert_frame_equal(res, exp)
2224+
2225+
res = DataFrame(data).astype(CategoricalDtype())
2226+
tm.assert_frame_equal(res, exp)
2227+
2228+
# categories keyword
2229+
cats = list('abdef')
2230+
x = Categorical(['a', 'b', np.nan, 'd'], categories=cats)
2231+
y = Categorical(['b', np.nan, 'd', 'e'], categories=cats)
2232+
exp = DataFrame({'x': x, 'y': y})
2233+
2234+
res = DataFrame(data).astype('category', categories=cats)
2235+
tm.assert_frame_equal(res, exp)
2236+
2237+
res = DataFrame(data).astype(CategoricalDtype(categories=cats))
2238+
tm.assert_frame_equal(res, exp)
2239+
2240+
# ordered keyword
2241+
cats = [1, 2, 3, 4, 0]
2242+
x = Categorical(range(1, 5), categories=cats, ordered=True)
2243+
y = Categorical(range(4), categories=cats, ordered=True)
2244+
exp = DataFrame({'x': x, 'y': y})
2245+
2246+
data = {'x': range(1, 5), 'y': range(4)}
2247+
res = DataFrame(data).astype('category', ordered=True)
2248+
tm.assert_frame_equal(res, exp)
2249+
2250+
res = DataFrame(data).astype(CategoricalDtype(ordered=True))
2251+
tm.assert_frame_equal(res, exp)
2252+
22182253
@pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']])
22192254
def test_empty_astype(self, columns):
22202255
# GH 18004
2221-
msg = '> 1 ndim Categorical are not supported at this time'
2222-
with tm.assert_raises_regex(NotImplementedError, msg):
2223-
DataFrame(columns=columns).astype('category')
2256+
exp = DataFrame({c: Categorical([]) for c in columns}, index=[])
2257+
res = DataFrame(columns=columns).astype('category')
2258+
tm.assert_frame_equal(res, exp)
22242259

22252260
def test_construction_series(self):
22262261

0 commit comments

Comments
 (0)