Skip to content

Commit 0024cf7

Browse files
authored
Merge pull request #168 from bjherger/fixed_strategy
2 parents 36156e6 + fc7ef02 commit 0024cf7

File tree

3 files changed

+40
-34
lines changed

3 files changed

+40
-34
lines changed

README.rst

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,7 @@ Example: imputing with a fixed value:
401401

402402
>>> from sklearn_pandas import CategoricalImputer
403403
>>> data = np.array(['a', 'b', 'b', np.nan], dtype=object)
404-
>>> imputer = CategoricalImputer(strategy='fixed_value', replacement='a')
404+
>>> imputer = CategoricalImputer(strategy='constant', fill_value='a')
405405
>>> imputer.fit_transform(data)
406406
array(['a', 'b', 'b', 'a'], dtype=object)
407407

@@ -420,8 +420,8 @@ Unreleased
420420
******************
421421
* Fix issues with unicode names in ``get_names`` (#160).
422422
* Update to build using ``numpy==1.14`` and ``python==3.6`` (#154).
423-
* Add ``strategy`` and ``replacement`` parameters to ``CategoricalImputer`` to allow imputing
424-
with values other than the mode (#144).
423+
* Add ``strategy`` and ``fill_value`` parameters to ``CategoricalImputer`` to allow imputing
424+
with values other than the mode (#144, #161).
425425
* Preserve input data types when no transform is supplied (#138).
426426

427427
1.6.0 (2017-10-28)
@@ -503,6 +503,7 @@ Other contributors:
503503
* Ariel Rossanigo (@arielrossanigo)
504504
* Arnau Gil Amat (@arnau126)
505505
* Assaf Ben-David (@AssafBenDavid)
506+
* Brendan Herger (@bjherger)
506507
* Cal Paterson (@calpaterson)
507508
* @defvorfu
508509
* Gustavo Sena Mafra (@gsmafra)

sklearn_pandas/categorical_imputer.py

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -33,49 +33,46 @@ class CategoricalImputer(BaseEstimator, TransformerMixin):
3333
copy : boolean, optional (default=True)
3434
If True, a copy of X will be created.
3535
36-
strategy : string, optional (default = 'mode')
37-
If set to 'mode', replace all instances of `missing_values`
38-
with the modal value. Otherwise, replace with
39-
the value specified via `replacement`.
36+
strategy : string, optional (default = 'most_frequent')
37+
The imputation strategy.
4038
41-
replacement : string, optional (default='?')
39+
- If "most_frequent", then replace missing values using the most frequent
40+
value along each column. Can be used with strings or numeric data.
41+
- If "constant", then replace missing values with fill_value. Can be
42+
used with strings or numeric data.
43+
44+
fill_value : string, optional (default='?')
4245
The value that all instances of `missing_values` are replaced
43-
with if `strategy` is not set to 'mode'. This is useful if
46+
with if `strategy` is set to `constant`. This is useful if
4447
you don't want to impute with the mode, or if there are multiple
4548
modes in your data and you want to choose a particular one. If
46-
`strategy` is set to `mode`, this parameter is ignored.
49+
`strategy` is not set to `constant`, this parameter is ignored.
4750
4851
Attributes
4952
----------
5053
fill_ : str
51-
Most frequent value of the training data.
54+
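The value used for imputation: the most frequent value of the training data, or `fill_value` when `strategy` is set to `constant`.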
5255
5356
"""
5457

5558
def __init__(
5659
self,
5760
missing_values='NaN',
58-
strategy='mode',
59-
replacement=None,
61+
strategy='most_frequent',
62+
fill_value='?',
6063
copy=True
6164
):
6265
self.missing_values = missing_values
6366
self.copy = copy
64-
self.replacement = replacement
67+
self.fill_value = fill_value
6568
self.strategy = strategy
6669

67-
strategies = ['fixed_value', 'mode']
70+
strategies = ['constant', 'most_frequent']
6871
if self.strategy not in strategies:
6972
raise ValueError(
7073
'Strategy {0} not in {1}'.format(self.strategy, strategies)
7174
)
7275

73-
if self.strategy == 'fixed_value' and self.replacement is None:
74-
raise ValueError(
75-
'Please specify a value for \'replacement\''
76-
'when using the fixed_value strategy.'
77-
)
78-
7976
def fit(self, X, y=None):
8077
"""
8178
@@ -95,10 +92,10 @@ def fit(self, X, y=None):
9592

9693
mask = _get_mask(X, self.missing_values)
9794
X = X[~mask]
98-
if self.strategy == 'mode':
95+
if self.strategy == 'most_frequent':
9996
modes = pd.Series(X).mode()
100-
elif self.strategy == 'fixed_value':
101-
modes = np.array([self.replacement])
97+
elif self.strategy == 'constant':
98+
modes = np.array([self.fill_value])
10299
if modes.shape[0] == 0:
103100
raise ValueError('Data is empty or all values are null')
104101
elif modes.shape[0] > 1:

tests/test_categorical_imputer.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -147,26 +147,34 @@ def test_custom_replacement(replacement_value, input_type):
147147
Xc = X.copy()
148148

149149
Xt = CategoricalImputer(
150-
strategy='fixed_value',
151-
replacement=replacement_value
150+
strategy='constant',
151+
fill_value=replacement_value
152152
).fit_transform(X)
153153

154154
assert pd.core.common.array_equivalent(np.asarray(X), np.asarray(Xc))
155155
assert isinstance(Xt, np.ndarray)
156156
assert (Xt == ['a', replacement_value, 'b', 'b']).all()
157157

158158

159-
def test_missing_replacement():
160-
"""
161-
Raise error if no replacement value specified and strategy='fixed_value'
162-
"""
163-
with pytest.raises(ValueError):
164-
CategoricalImputer(strategy="fixed_value")
165-
166-
167159
def test_invalid_strategy():
168160
"""
169161
Raise an error if an invalid strategy is entered
170162
"""
171163
with pytest.raises(ValueError):
172164
CategoricalImputer(strategy="not_a_supported_strategy")
165+
166+
167+
@pytest.mark.parametrize('input_type', ['np', 'pd'])
168+
def test_default_fill_value_for_constant_strategy(input_type):
169+
data = ['a', np.nan, 'b', 'b']
170+
171+
if input_type == 'pd':
172+
X = pd.Series(data)
173+
else:
174+
X = np.asarray(data, dtype=object)
175+
176+
imputer = CategoricalImputer(strategy='constant')
177+
Xt = imputer.fit_transform(X)
178+
179+
assert imputer.fill_ == '?'
180+
assert (Xt == ['a', imputer.fill_, 'b', 'b']).all()

0 commit comments

Comments
 (0)