Commit 5c0b88d

gsmafra authored and dukebody committed
Add CategoricalImputer working with string columns
It replaces the null-like values with the mode of the column and works with string-like columns (object dtype in pandas).
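As a quick illustration of the rule this commit implements (replace null-like values with the column mode), here is a minimal sketch in plain pandas; `impute_mode` is a hypothetical helper for illustration, not part of the commit:

```python
import numpy as np
import pandas as pd

# Hypothetical helper sketching the imputation rule described above:
# replace null-like values with the most frequent value (the mode).
def impute_mode(values):
    s = pd.Series(values)
    fill = s.mode().values[0]   # most frequent value in the column
    s[s.isnull()] = fill
    return s.values

print(impute_mode(np.array(['a', 'b', 'b', np.nan], dtype=object)))
# ['a' 'b' 'b' 'b']
```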
1 parent 2fc6286 commit 5c0b88d

File tree

5 files changed: +149 −2 lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -2,4 +2,5 @@
 *.pyc
 .tox/
 build/
-dist/
+dist/
+.cache/

README.rst

Lines changed: 21 additions & 1 deletion
@@ -8,6 +8,7 @@ In particular, it provides:
 
 1. A way to map ``DataFrame`` columns to transformations, which are later recombined into features.
 2. A compatibility shim for old ``scikit-learn`` versions to cross-validate a pipeline that takes a pandas ``DataFrame`` as input. This is only needed for ``scikit-learn<0.16.0`` (see `#11 <https://github.com/paulgb/sklearn-pandas/issues/11>`__ for details). It is deprecated and will likely be dropped in ``sklearn-pandas==2.0``.
+3. A ``CategoricalImputer`` that replaces null-like values with the mode and works with string columns.
 
 Installation
 ------------
@@ -249,7 +250,7 @@ Working with sparse features
 The stacking of the sparse features is done without ever densifying them.
 
 Cross-Validation
-----------------
+****************
 
 Now that we can combine features from pandas DataFrames, we may want to use cross-validation to see whether our model works. ``scikit-learn<0.16.0`` provided features for cross-validation, but they expect numpy data structures and won't work with ``DataFrameMapper``.
 
@@ -263,13 +264,31 @@ To get around this, sklearn-pandas provides a wrapper on sklearn's ``cross_val_s
 
 Sklearn-pandas' ``cross_val_score`` function provides exactly the same interface as sklearn's function of the same name.
 
+``CategoricalImputer``
+**********************
+
+Since the ``scikit-learn`` ``Imputer`` transformer currently only works with
+numbers, ``sklearn-pandas`` provides an equivalent helper transformer that does
+work with strings, substituting null values with the most frequent value in
+that column.
+
+Example:
+
+>>> from sklearn_pandas import CategoricalImputer
+>>> data = np.array(['a', 'b', 'b', np.nan], dtype=object)
+>>> imputer = CategoricalImputer()
+>>> imputer.fit_transform(data)
+array(['a', 'b', 'b', 'b'], dtype=object)
+
 
 Changelog
 ---------
 
 Development
 ***********
 * Capture output columns generated names in ``transformed_names_`` attribute (#78).
+* Add ``CategoricalImputer`` that replaces null-like values with the mode
+  for string-like columns.
 
 
 1.3.0 (2017-01-21)
@@ -324,6 +343,7 @@ Other contributors:
 
 * Arnau Gil Amat
 * Cal Paterson
+* Gustavo Sena Mafra
 * Israel Saeta Pérez
 * Jeremy Howard
 * Olivier Grisel

sklearn_pandas/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@
 
 from .dataframe_mapper import DataFrameMapper # NOQA
 from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA
+from .categorical_imputer import CategoricalImputer

sklearn_pandas/categorical_imputer.py

Lines changed: 73 additions & 0 deletions
(new file)

"""
Impute missing values from a categorical/string np.ndarray or pd.Series
with the most frequent value on the training data.
"""

import pandas as pd
import numpy as np

from sklearn.base import TransformerMixin


class CategoricalImputer(TransformerMixin):
    """
    Attributes
    ----------
    fill : str
        Most frequent value of the training data.
    """

    def __init__(self):
        self.fill = None

    def fit(self, X):
        """
        Get the most frequent value.

        Parameters
        ----------
        X : np.ndarray or pd.Series
            Training data.

        Returns
        -------
        CategoricalImputer
            Itself.
        """
        self.fill = pd.Series(X).mode().values[0]
        return self

    def transform(self, X):
        """
        Replaces null values in the input data with the most frequent value
        of the training data.

        Parameters
        ----------
        X : np.ndarray or pd.Series
            Data with values to be imputed.

        Returns
        -------
        np.ndarray
            Data with imputed values.
        """
        X = X.copy()
        X[pd.isnull(X)] = self.fill
        return np.asarray(X)
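The fit/transform split above means the mode is learned once on the training column and then reused to impute unseen data. A self-contained sketch of that behaviour (``ModeImputer`` is a hypothetical stand-in for the class above, without the ``TransformerMixin`` base):

```python
import numpy as np
import pandas as pd

# Hypothetical minimal stand-in for CategoricalImputer:
# fit() learns the training mode, transform() reuses it on new data.
class ModeImputer(object):
    def fit(self, X):
        self.fill = pd.Series(X).mode().values[0]
        return self

    def transform(self, X):
        X = X.copy()
        X[pd.isnull(X)] = self.fill
        return np.asarray(X)

imp = ModeImputer().fit(np.array(['a', 'b', 'b', np.nan], dtype=object))
print(imp.fill)  # b
print(imp.transform(np.array([np.nan, 'a'], dtype=object)))
# ['b' 'a']
```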

tests/test_categorical_imputer.py

Lines changed: 52 additions & 0 deletions
(new file)

import pytest

import numpy as np
import pandas as pd

from sklearn_pandas import CategoricalImputer
from sklearn_pandas import DataFrameMapper


@pytest.mark.parametrize('none_value', [None, np.nan])
@pytest.mark.parametrize('input_type', ['np', 'pd'])
def test_unit(input_type, none_value):

    data = ['a', 'b', 'b', none_value]

    if input_type == 'pd':
        X = pd.Series(data)
    else:
        X = np.asarray(data)

    Xc = X.copy()

    Xt = CategoricalImputer().fit_transform(X)

    assert (np.asarray(X) == np.asarray(Xc)).all()
    assert type(Xt) == np.ndarray
    assert len(X) == len(Xt)
    assert len(Xt[pd.isnull(Xt)]) == 0


@pytest.mark.parametrize('none_value', [None, np.nan])
def test_integration(none_value):

    df = pd.DataFrame({'cat': ['a', 'a', 'a', none_value, 'b'],
                       'num': [1, 2, 3, 4, 5]})

    mapper = DataFrameMapper([
        ('cat', CategoricalImputer()),
        ('num', None)
    ], df_out=True).fit(df)

    df_t = mapper.transform(df)

    assert pd.notnull(df_t).all().all()

    val_idx = pd.notnull(df['cat'])
    nan_idx = ~val_idx

    assert (df['num'] == df_t['num']).all()
    assert (df['cat'][val_idx] == df_t['cat'][val_idx]).all()
    assert (df_t['cat'][nan_idx] == df['cat'].mode().values[0]).all()
