scikit-learn-contrib · gsmafra · Feb 19, 2017 · Feb 19, 2017 · Mar 4, 2017 · Mar 4, 2017
diff --git a/.cache/v/cache/lastfailed b/.cache/v/cache/lastfailed
@@ -0,0 +1 @@
+{}
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
@@ -1,5 +1,5 @@
 __version__ = '1.3.0'
 
 from .dataframe_mapper import DataFrameMapper  # NOQA
-from .dataframe_imputer import DataFrameImputer  # NOQA
 from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV  # NOQA
+from .categorical_imputer import CategoricalImputer  # NOQA
diff --git a/sklearn_pandas/categorical_imputer.py b/sklearn_pandas/categorical_imputer.py
@@ -0,0 +1,84 @@
+"""
+Impute missing values in categorical/string data.
+
+Null entries of a np.ndarray or pd.Series are replaced with the most
+frequent value observed in the training data.
+"""
+
+import pandas as pd
+import numpy as np
+
+from sklearn.base import TransformerMixin
+
+
+class CategoricalImputer(TransformerMixin):
+    """
+    Fill null values with the most frequent value of the training data.
+
+    Attributes
+    ----------
+    fill : str
+        Most frequent value of the training data; None until `fit` has
+        been called.
+    """
+
+    def __init__(self):
+        self.fill = None
+
+    def fit(self, X, y=None):
+        """
+        Get the most frequent value.
+
+        Parameters
+        ----------
+        X : np.ndarray or pd.Series
+            Training data.
+        y : object, optional
+            Ignored. Accepted so that the imputer can be fitted through
+            APIs that pass a target, such as `fit_transform(X, y)`.
+
+        Returns
+        -------
+        CategoricalImputer
+            Itself.
+
+        Raises
+        ------
+        ValueError
+            If no most frequent value can be determined from X.
+        """
+        modes = pd.Series(X).mode()
+
+        # mode() comes back empty when there is nothing to count (for
+        # example empty or all-null input); fail with a clear message
+        # instead of an opaque IndexError from `.values[0]`.
+        if modes.empty:
+            raise ValueError(
+                'No most frequent value could be determined from X; '
+                'it may be empty or contain only null values.')
+
+        self.fill = modes.values[0]
+
+        return self
+
+    def transform(self, X):
+        """
+        Replace null values in the input data with the most frequent
+        value of the training data.
+
+        Parameters
+        ----------
+        X : np.ndarray or pd.Series
+            Data with values to be imputed.
+
+        Returns
+        -------
+        np.ndarray
+            Data with imputed values.
+        """
+        # Work on a copy so the caller's data is left untouched.
+        X = X.copy()
+
+        X[pd.isnull(X)] = self.fill
+
+        return np.asarray(X)
diff --git a/sklearn_pandas/dataframe_imputer.py b/sklearn_pandas/dataframe_imputer.py
diff --git a/tests/test_dataframe_imputer.py b/tests/test_dataframe_imputer.py
diff --git a/tests/test_string_imputer.py b/tests/test_string_imputer.py
@@ -0,0 +1,59 @@
+import pytest
+
+import numpy as np
+import pandas as pd
+
+from sklearn_pandas import CategoricalImputer
+from sklearn_pandas import DataFrameMapper
+
+
+@pytest.mark.parametrize('none_value', [None, np.nan])
+@pytest.mark.parametrize('input_type', ['np', 'pd'])
+def test_unit(input_type, none_value):
+    """Check that nulls are imputed and the input is left unchanged."""
+
+    data = ['a', 'b', 'b', none_value]
+
+    if input_type == 'pd':
+        X = pd.Series(data)
+    else:
+        # dtype=object keeps np.nan as a real null; without it numpy
+        # builds a string array and stores the text 'nan' instead.
+        X = np.asarray(data, dtype=object)
+
+    Xc = X.copy()
+
+    Xt = CategoricalImputer().fit_transform(X)
+
+    # Series.equals treats nulls in the same position as equal, which an
+    # elementwise == does not guarantee for NaN.
+    assert pd.Series(X).equals(pd.Series(Xc))
+    assert isinstance(Xt, np.ndarray)
+    assert len(X) == len(Xt)
+    assert not pd.isnull(Xt).any()
+    assert Xt[-1] == 'b'
+
+
+@pytest.mark.parametrize('none_value', [None, np.nan])
+def test_integration(none_value):
+    """Check the imputer when used as a DataFrameMapper transformer."""
+
+    df = pd.DataFrame({'cat': ['a', 'a', 'a', none_value, 'b'],
+                       'num': [1, 2, 3, 4, 5]})
+
+    mapper = DataFrameMapper([
+        ('cat', CategoricalImputer()),
+        ('num', None)
+    ], df_out=True).fit(df)
+
+    df_t = mapper.transform(df)
+
+    assert pd.notnull(df_t).all().all()
+
+    val_idx = pd.notnull(df['cat'])
+    nan_idx = ~val_idx
+
+    assert (df['num'] == df_t['num']).all()
+
+    assert (df['cat'][val_idx] == df_t['cat'][val_idx]).all()
+    assert (df_t['cat'][nan_idx] == df['cat'].mode().values[0]).all()