ENH: Implement DataFrame.astype('category')

jschendel · jschendel · commit 222b106bc811 · 2017-11-09T18:35:11.000-07:00
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -45,9 +45,16 @@ The categorical data type is useful in the following cases:
 
 See also the :ref:`API docs on categoricals<api.categorical>`.
 
+.. _categorical.objectcreation:
+
 Object Creation
 ---------------
 
+.. _categorical.objectcreation.series:
+
+Creating categories from a ``Series``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 Categorical `Series` or columns in a `DataFrame` can be created in several ways:
 
 By specifying ``dtype="category"`` when constructing a `Series`:
@@ -143,6 +150,55 @@ constructor to save the factorize step during normal constructor mode:
     splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
     s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))
 
+.. _categorical.objectcreation.frame:
+
+Creating categories from a ``DataFrame``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. versionadded:: 0.22.0
+
+:meth:`DataFrame.astype` supports simultaneously setting multiple columns as categorical. When setting multiple
+columns as categorical, by default each column's dtype will contain categories for all labels present in all columns, even
+if a column does not contain all labels:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']})
+   df = df.astype('category')
+   df
+   df['A'].dtype
+   df['B'].dtype
+
+Note that this behavior differs from instantiating a ``DataFrame`` with categorical dtype, which will only assign
+categories to each column based on the labels present in each column:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']}, dtype='category')
+   df['A'].dtype
+   df['B'].dtype
+
+When using ``astype``, you can control the categories that will be present in each column by passing
+a ``CategoricalDtype``:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']})
+   dtype = CategoricalDtype(categories=list('abdef'), ordered=True)
+   df = df.astype(dtype)
+   df
+   df['A'].dtype
+   df['B'].dtype
+
+Use subselection if you only want to convert certain columns to categorical. The same behaviors previously
+discussed hold with subselection.
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e'], 'C': ['x', 'y', 'z']})
+   df[['A', 'B']] = df[['A', 'B']].astype('category')
+   df.dtypes
+
 .. _categorical.categoricaldtype:
 
 CategoricalDtype
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -13,9 +13,25 @@ version.
 New features
 ~~~~~~~~~~~~
 
--
--
--
+.. _whatsnew_0220.enhancements.astype_category:
+
+``DataFrame.astype`` now supports categoricals
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`DataFrame.astype` now supports simultaneously setting multiple columns as categorical (:issue:`12860`)
+
+When setting multiple columns as categorical, by default each column's dtype will contain categories for all
+labels present in all columns, even if a column does not contain all labels:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'A': ['a', 'b', 'c'], 'B': ['c', 'd', 'e']})
+   df = df.astype('category')
+   df
+   df['A'].dtype
+   df['B'].dtype
+
+See the :ref:`categorical.objectcreation.frame` section of the documentation for more details and examples.
 
 .. _whatsnew_0220.enhancements.other:
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -18,14 +18,16 @@
     is_number,
     is_integer, is_bool,
     is_bool_dtype,
+    is_categorical_dtype,
     is_numeric_dtype,
     is_datetime64_dtype,
     is_timedelta64_dtype,
     is_datetime64tz_dtype,
     is_list_like,
     is_dict_like,
     is_re_compilable,
-    pandas_dtype)
+    pandas_dtype,
+    CategoricalDtype)
 from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask
 from pandas.core.dtypes.missing import isna, notna
 from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame
@@ -3973,14 +3975,30 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
                 if col_name not in self:
                     raise KeyError('Only a column name can be used for the '
                                    'key in a dtype mappings argument.')
-            from pandas import concat
             results = []
             for col_name, col in self.iteritems():
                 if col_name in dtype:
                     results.append(col.astype(dtype[col_name], copy=copy))
                 else:
                     results.append(results.append(col.copy() if copy else col))
-            return concat(results, axis=1, copy=False)
+            return pd.concat(results, axis=1, copy=False)
+
+        elif is_categorical_dtype(dtype) and self.ndim > 1:
+            # GH 12860
+            dtype_with_cat = (isinstance(dtype, CategoricalDtype) and
+                              dtype.categories is not None)
+            if not dtype_with_cat:
+                categories = kwargs.get('categories', None)
+                ordered = (kwargs.get('ordered', None) or
+                           getattr(dtype, 'ordered', None))
+
+                if categories is None:
+                    categories = algos.unique(self.values.ravel(order='F'))
+
+                dtype = CategoricalDtype(categories, ordered)
+
+            results = (self[col].astype(dtype, copy=copy) for col in self)
+            return pd.concat(results, axis=1, copy=False)
 
         # else, only a single dtype is given
         new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -2176,51 +2176,86 @@ def test_basic(self):
         result = x.person_name.loc[0]
         assert result == expected
 
-    def test_creation_astype(self):
-        l = ["a", "b", "c", "a"]
-        s = pd.Series(l)
-        exp = pd.Series(Categorical(l))
-        res = s.astype('category')
+    def test_series_creation_astype(self):
+        labels = list('abca')
+        exp = Series(Categorical(labels))
+        res = Series(labels).astype('category')
         tm.assert_series_equal(res, exp)
 
-        l = [1, 2, 3, 1]
-        s = pd.Series(l)
-        exp = pd.Series(Categorical(l))
-        res = s.astype('category')
+        labels = [1, 2, 3, 1]
+        exp = Series(Categorical(labels))
+        res = Series(labels).astype('category')
         tm.assert_series_equal(res, exp)
 
-        df = pd.DataFrame({"cats": [1, 2, 3, 4, 5, 6],
-                           "vals": [1, 2, 3, 4, 5, 6]})
-        cats = Categorical([1, 2, 3, 4, 5, 6])
-        exp_df = pd.DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
-        df["cats"] = df["cats"].astype("category")
-        tm.assert_frame_equal(exp_df, df)
+        labels_int = [1, 2, 3, 4, 5, 6]
+        exp = DataFrame({"cats": Categorical(labels_int), "vals": labels_int})
+        res = DataFrame({"cats": labels_int, "vals": labels_int})
+        res["cats"] = res["cats"].astype("category")
+        tm.assert_frame_equal(res, exp)
 
-        df = pd.DataFrame({"cats": ['a', 'b', 'b', 'a', 'a', 'd'],
-                           "vals": [1, 2, 3, 4, 5, 6]})
-        cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd'])
-        exp_df = pd.DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
-        df["cats"] = df["cats"].astype("category")
-        tm.assert_frame_equal(exp_df, df)
+        labels_str = list('abbaad')
+        exp = DataFrame({"cats": Categorical(labels_str), "vals": labels_int})
+        res = DataFrame({"cats": labels_str, "vals": labels_int})
+        res["cats"] = res["cats"].astype("category")
+        tm.assert_frame_equal(res, exp)
 
         # with keywords
-        l = ["a", "b", "c", "a"]
-        s = pd.Series(l)
-        exp = pd.Series(Categorical(l, ordered=True))
+        labels = list('abca')
+        s = Series(labels)
+        exp = Series(Categorical(labels, ordered=True))
         res = s.astype(CategoricalDtype(None, ordered=True))
         tm.assert_series_equal(res, exp)
 
-        exp = pd.Series(Categorical(
-            l, categories=list('abcdef'), ordered=True))
-        res = s.astype(CategoricalDtype(list('abcdef'), ordered=True))
+        cats = list('abcdef')
+        exp = Series(Categorical(labels, categories=cats, ordered=True))
+        res = s.astype(CategoricalDtype(cats, ordered=True))
         tm.assert_series_equal(res, exp)
 
+    def test_frame_creation_astype(self):
+        # GH 12860
+        cats = list('abcde')
+        x = Categorical(list('abcd'), categories=cats)
+        y = Categorical(list('bcde'), categories=cats)
+        exp = DataFrame({'x': x, 'y': y})
+
+        data = {'x': list('abcd'), 'y': list('bcde')}
+        res = DataFrame(data).astype('category')
+        tm.assert_frame_equal(res, exp)
+
+        res = DataFrame(data).astype(CategoricalDtype())
+        tm.assert_frame_equal(res, exp)
+
+        # categories keyword
+        cats = list('abdef')
+        x = Categorical(['a', 'b', np.nan, 'd'], categories=cats)
+        y = Categorical(['b', np.nan, 'd', 'e'], categories=cats)
+        exp = DataFrame({'x': x, 'y': y})
+
+        res = DataFrame(data).astype('category', categories=cats)
+        tm.assert_frame_equal(res, exp)
+
+        res = DataFrame(data).astype(CategoricalDtype(categories=cats))
+        tm.assert_frame_equal(res, exp)
+
+        # ordered keyword
+        cats = [1, 2, 3, 4, 0]
+        x = Categorical(range(1, 5), categories=cats, ordered=True)
+        y = Categorical(range(4), categories=cats, ordered=True)
+        exp = DataFrame({'x': x, 'y': y})
+
+        data = {'x': range(1, 5), 'y': range(4)}
+        res = DataFrame(data).astype('category', ordered=True)
+        tm.assert_frame_equal(res, exp)
+
+        res = DataFrame(data).astype(CategoricalDtype(ordered=True))
+        tm.assert_frame_equal(res, exp)
+
     @pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']])
     def test_empty_astype(self, columns):
         # GH 18004
-        msg = '> 1 ndim Categorical are not supported at this time'
-        with tm.assert_raises_regex(NotImplementedError, msg):
-            DataFrame(columns=columns).astype('category')
+        exp = DataFrame({c: Categorical([]) for c in columns}, index=[])
+        res = DataFrame(columns=columns).astype('category')
+        tm.assert_frame_equal(res, exp)
 
     def test_construction_series(self):