Skip to content

Fixed strategy #168

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Aug 23, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ Example: imputing with a fixed value:

>>> from sklearn_pandas import CategoricalImputer
>>> data = np.array(['a', 'b', 'b', np.nan], dtype=object)
>>> imputer = CategoricalImputer(strategy='fixed_value', replacement='a')
>>> imputer = CategoricalImputer(strategy='constant', fill_value='a')
>>> imputer.fit_transform(data)
array(['a', 'b', 'b', 'a'], dtype=object)

Expand All @@ -413,8 +413,8 @@ Development
******************
* Fix issues with unicode names in ``get_names`` (#160).
* Update to build using ``numpy==1.14`` and ``python==3.6`` (#154).
* Add ``strategy`` and ``replacement`` parameters to ``CategoricalImputer`` to allow imputing
with values other than the mode (#144).
* Add ``strategy`` and ``fill_value`` parameters to ``CategoricalImputer`` to allow imputing
with values other than the mode (#144), (#161).
* Preserve input data types when no transform is supplied (#138).


Expand Down Expand Up @@ -497,6 +497,7 @@ Other contributors:
* Ariel Rossanigo (@arielrossanigo)
* Arnau Gil Amat (@arnau126)
* Assaf Ben-David (@AssafBenDavid)
* Brendan Herger (@bjherger)
* Cal Paterson (@calpaterson)
* @defvorfu
* Gustavo Sena Mafra (@gsmafra)
Expand Down
39 changes: 18 additions & 21 deletions sklearn_pandas/categorical_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,49 +33,46 @@ class CategoricalImputer(BaseEstimator, TransformerMixin):
copy : boolean, optional (default=True)
If True, a copy of X will be created.

strategy : string, optional (default = 'mode')
If set to 'mode', replace all instances of `missing_values`
with the modal value. Otherwise, replace with
the value specified via `replacement`.
strategy : string, optional (default='most_frequent')
The imputation strategy.

replacement : string, optional (default='?')
- If "most_frequent", then replace missing using the most frequent
value along each column. Can be used with strings or numeric data.
- If "constant", then replace missing values with fill_value. Can be
used with strings or numeric data.

fill_value : string, optional (default='?')
The value that all instances of `missing_values` are replaced
with if `strategy` is not set to 'mode'. This is useful if
with if `strategy` is set to `constant`. This is useful if
you don't want to impute with the mode, or if there are multiple
modes in your data and you want to choose a particular one. If
`strategy` is set to `mode`, this parameter is ignored.
`strategy` is not set to `constant`, this parameter is ignored.

Attributes
----------
fill_ : str
Most frequent value of the training data.
The value used to fill in missing entries.

"""

def __init__(
self,
missing_values='NaN',
strategy='mode',
replacement=None,
strategy='most_frequent',
fill_value='?',
copy=True
):
self.missing_values = missing_values
self.copy = copy
self.replacement = replacement
self.fill_value = fill_value
self.strategy = strategy

strategies = ['fixed_value', 'mode']
strategies = ['constant', 'most_frequent']
if self.strategy not in strategies:
raise ValueError(
'Strategy {0} not in {1}'.format(self.strategy, strategies)
)

if self.strategy == 'fixed_value' and self.replacement is None:
raise ValueError(
'Please specify a value for \'replacement\''
'when using the fixed_value strategy.'
)

def fit(self, X, y=None):
"""

Expand All @@ -95,10 +92,10 @@ def fit(self, X, y=None):

mask = _get_mask(X, self.missing_values)
X = X[~mask]
if self.strategy == 'mode':
if self.strategy == 'most_frequent':
modes = pd.Series(X).mode()
elif self.strategy == 'fixed_value':
modes = np.array([self.replacement])
elif self.strategy == 'constant':
modes = np.array([self.fill_value])
if modes.shape[0] == 0:
raise ValueError('Data is empty or all values are null')
elif modes.shape[0] > 1:
Expand Down
28 changes: 18 additions & 10 deletions tests/test_categorical_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,26 +147,34 @@ def test_custom_replacement(replacement_value, input_type):
Xc = X.copy()

Xt = CategoricalImputer(
strategy='fixed_value',
replacement=replacement_value
strategy='constant',
fill_value=replacement_value
).fit_transform(X)

assert pd.core.common.array_equivalent(np.asarray(X), np.asarray(Xc))
assert isinstance(Xt, np.ndarray)
assert (Xt == ['a', replacement_value, 'b', 'b']).all()


def test_missing_replacement():
"""
Raise error if no replacement value specified and strategy='fixed_value'
"""
with pytest.raises(ValueError):
CategoricalImputer(strategy="fixed_value")


def test_invalid_strategy():
    """
    Constructing the imputer with an unknown strategy name must fail
    with a ValueError.
    """
    bogus_strategy = "not_a_supported_strategy"
    with pytest.raises(ValueError):
        CategoricalImputer(strategy=bogus_strategy)


@pytest.mark.parametrize('input_type', ['np', 'pd'])
def test_default_fill_value_for_constant_strategy(input_type):
    """
    With strategy='constant' and no explicit fill_value, the missing
    entry is imputed with the default '?' for both numpy and pandas input.
    """
    raw = ['a', np.nan, 'b', 'b']

    # Build the input container requested by the parametrization.
    X = (
        pd.Series(raw)
        if input_type == 'pd'
        else np.asarray(raw, dtype=object)
    )

    model = CategoricalImputer(strategy='constant')
    transformed = model.fit_transform(X)

    assert model.fill_ == '?'
    expected = ['a', model.fill_, 'b', 'b']
    assert (transformed == expected).all()