Merge pull request #168 from bjherger/fixed_strategy

devforfu · web-flow · commit 0024cf70d35b · 2018-08-23T19:40:57.000+05:00
diff --git a/README.rst b/README.rst
@@ -401,7 +401,7 @@ Example: imputing with a fixed value:
 
     >>> from sklearn_pandas import CategoricalImputer
     >>> data = np.array(['a', 'b', 'b', np.nan], dtype=object)
-    >>> imputer = CategoricalImputer(strategy='fixed_value', replacement='a')
+    >>> imputer = CategoricalImputer(strategy='constant', fill_value='a')
     >>> imputer.fit_transform(data)
     array(['a', 'b', 'b', 'a'], dtype=object)
 
@@ -420,8 +420,8 @@ Unreleased
 ******************
 * Fix issues with unicode names in ``get_names`` (#160).
 * Update to build using ``numpy==1.14`` and ``python==3.6`` (#154).
-* Add ``strategy`` and ``replacement`` parameters to ``CategoricalImputer`` to allow imputing
-  with values other than the mode (#144).
+* Add ``strategy`` and ``fill_value`` parameters to ``CategoricalImputer`` to allow imputing
+  with values other than the mode (#144), (#161).
 * Preserve input data types when no transform is supplied (#138).
 
 1.6.0 (2017-10-28)
@@ -503,6 +503,7 @@ Other contributors:
 * Ariel Rossanigo (@arielrossanigo)
 * Arnau Gil Amat (@arnau126)
 * Assaf Ben-David (@AssafBenDavid)
+* Brendan Herger (@bjherger)
 * Cal Paterson (@calpaterson)
 * @defvorfu
 * Gustavo Sena Mafra (@gsmafra)
diff --git a/sklearn_pandas/categorical_imputer.py b/sklearn_pandas/categorical_imputer.py
@@ -33,49 +33,46 @@ class CategoricalImputer(BaseEstimator, TransformerMixin):
     copy : boolean, optional (default=True)
         If True, a copy of X will be created.
 
-    strategy : string, optional (default = 'mode')
-        If set to 'mode', replace all instances of `missing_values`
-        with the modal value. Otherwise, replace with
-        the value specified via `replacement`.
+    strategy : string, optional (default = 'most_frequent')
+        The imputation strategy.
 
-    replacement : string, optional (default='?')
+        - If "most_frequent", then replace missing using the most frequent
+          value along each column. Can be used with strings or numeric data.
+        - If "constant", then replace missing values with fill_value. Can be
+          used with strings or numeric data.
+
+    fill_value : string, optional (default='?')
         The value that all instances of `missing_values` are replaced
-        with if `strategy` is not set to 'mode'. This is useful if
+        with if `strategy` is set to `constant`. This is useful if
         you don't want to impute with the mode, or if there are multiple
         modes in your data and you want to choose a particular one. If
-        `strategy` is set to `mode`, this parameter is ignored.
+        `strategy` is not set to `constant`, this parameter is ignored.
 
     Attributes
     ----------
     fill_ : str
-        Most frequent value of the training data.
+        The imputation fill value.
 
     """
 
     def __init__(
         self,
         missing_values='NaN',
-        strategy='mode',
-        replacement=None,
+        strategy='most_frequent',
+        fill_value='?',
         copy=True
     ):
         self.missing_values = missing_values
         self.copy = copy
-        self.replacement = replacement
+        self.fill_value = fill_value
         self.strategy = strategy
 
-        strategies = ['fixed_value', 'mode']
+        strategies = ['constant', 'most_frequent']
         if self.strategy not in strategies:
             raise ValueError(
                 'Strategy {0} not in {1}'.format(self.strategy, strategies)
             )
 
-        if self.strategy == 'fixed_value' and self.replacement is None:
-            raise ValueError(
-                'Please specify a value for \'replacement\''
-                'when using the fixed_value strategy.'
-            )
-
     def fit(self, X, y=None):
         """
 
@@ -95,10 +92,10 @@ def fit(self, X, y=None):
 
         mask = _get_mask(X, self.missing_values)
         X = X[~mask]
-        if self.strategy == 'mode':
+        if self.strategy == 'most_frequent':
             modes = pd.Series(X).mode()
-        elif self.strategy == 'fixed_value':
-            modes = np.array([self.replacement])
+        elif self.strategy == 'constant':
+            modes = np.array([self.fill_value])
         if modes.shape[0] == 0:
             raise ValueError('Data is empty or all values are null')
         elif modes.shape[0] > 1:
diff --git a/tests/test_categorical_imputer.py b/tests/test_categorical_imputer.py
@@ -147,26 +147,34 @@ def test_custom_replacement(replacement_value, input_type):
     Xc = X.copy()
 
     Xt = CategoricalImputer(
-        strategy='fixed_value',
-        replacement=replacement_value
+        strategy='constant',
+        fill_value=replacement_value
     ).fit_transform(X)
 
     assert pd.core.common.array_equivalent(np.asarray(X), np.asarray(Xc))
     assert isinstance(Xt, np.ndarray)
     assert (Xt == ['a', replacement_value, 'b', 'b']).all()
 
 
-def test_missing_replacement():
-    """
-    Raise error if no replacement value specified and strategy='fixed_value'
-    """
-    with pytest.raises(ValueError):
-        CategoricalImputer(strategy="fixed_value")
-
-
 def test_invalid_strategy():
     """
     Raise an error if an invalid strategy is entered
     """
     with pytest.raises(ValueError):
         CategoricalImputer(strategy="not_a_supported_strategy")
+
+
+@pytest.mark.parametrize('input_type', ['np', 'pd'])
+def test_default_fill_value_for_constant_strategy(input_type):
+    data = ['a', np.nan, 'b', 'b']
+
+    if input_type == 'pd':
+        X = pd.Series(data)
+    else:
+        X = np.asarray(data, dtype=object)
+
+    imputer = CategoricalImputer(strategy='constant')
+    Xt = imputer.fit_transform(X)
+
+    assert imputer.fill_ == '?'
+    assert (Xt == ['a', imputer.fill_, 'b', 'b']).all()