Skip to content

Adding DataFrameImputer #82

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .cache/v/cache/lastfailed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
2 changes: 1 addition & 1 deletion sklearn_pandas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
__version__ = '1.3.0'

from .dataframe_mapper import DataFrameMapper # NOQA
from .dataframe_imputer import DataFrameImputer # NOQA
from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA
from .categorical_imputer import CategoricalImputer
73 changes: 73 additions & 0 deletions sklearn_pandas/categorical_imputer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""

Impute missing values from a categorical/string np.ndarray or pd.Series with the most frequent value on the training data.

"""

import pandas as pd
import numpy as np

from sklearn.base import TransformerMixin


class CategoricalImputer(TransformerMixin):

"""

Attributes
----------

fill : str
Most frequent value of the training data.

"""

def __init__(self):

# Start unfitted: `fill` stays None until fit() stores the most frequent
# value of the training data in it.
self.fill = None

def fit(self, X, y=None):
    """
    Learn the most frequent value of the training data.

    Parameters
    ----------
    X : np.ndarray or pd.Series
        Training data.
    y : object, optional
        Ignored. Accepted so that callers which pass a target to ``fit``
        (e.g. scikit-learn's ``TransformerMixin.fit_transform(X, y)``)
        do not fail with a ``TypeError``.

    Returns
    -------
    CategoricalImputer
        Itself.

    Raises
    ------
    ValueError
        If pandas reports no mode for ``X`` (for instance when ``X`` is
        empty), so there is no value to impute with.

    """
    modes = pd.Series(X).mode()

    # Indexing an empty result would raise an opaque IndexError; fail with
    # a message that names the real problem instead.
    if modes.shape[0] == 0:
        raise ValueError(
            'Cannot determine the most frequent value: no mode was found '
            'in the training data (is it empty or entirely null?)'
        )

    # When several values tie, keep the first mode pandas returns.
    self.fill = modes.values[0]

    # The method's closing `return self` follows further down in the file
    # and is intentionally left as is.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is implicitly assuming that there will be only one mode value. Can you raise an explicit exception if this is not true? Something like:

modes = pd.Series(X).mode()
if modes.shape[0] == 0:
    raise ValueError('No value is repeated at least twice in the column')
elif modes.shape[0] > 1:
    raise ValueError("Column has multiple modes {}, can't select one to fill".format(modes.tolist()))
else:
    self.fill = modes[0]


return self

def transform(self, X):
    """
    Fill the null entries of ``X`` with the value stored in ``self.fill``.

    The input is copied first, so the caller's object is left untouched.

    Parameters
    ----------
    X : np.ndarray or pd.Series
        Data with values to be imputed.

    Returns
    -------
    np.ndarray
        Copy of the data with every null entry replaced.

    """
    imputed = X.copy()
    missing = pd.isnull(imputed)
    imputed[missing] = self.fill
    return np.asarray(imputed)
60 changes: 0 additions & 60 deletions sklearn_pandas/dataframe_imputer.py

This file was deleted.

23 changes: 0 additions & 23 deletions tests/test_dataframe_imputer.py

This file was deleted.

51 changes: 51 additions & 0 deletions tests/test_string_imputer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pytest

import numpy as np
import pandas as pd

from sklearn_pandas import CategoricalImputer
from sklearn_pandas import DataFrameMapper


@pytest.mark.parametrize('none_value', [None, np.nan])
@pytest.mark.parametrize('input_type', ['np', 'pd'])
def test_unit(input_type, none_value):
    """
    Check CategoricalImputer on a single column.

    Covers both supported containers (np.ndarray, pd.Series) and both
    missing-value markers (None, np.nan). Verifies that the input is not
    mutated, that the output is an ndarray of the same length without
    nulls, and that the missing entry becomes the most frequent value.
    """
    data = ['a', 'b', 'b', none_value]

    if input_type == 'pd':
        X = pd.Series(data)
    else:
        # Without dtype=object numpy builds a fixed-width string array and
        # turns np.nan into the literal text 'nan', so nothing would be
        # missing and the imputation would never be exercised.
        X = np.asarray(data, dtype=object)

    Xc = X.copy()

    Xt = CategoricalImputer().fit_transform(X)

    # The input must be untouched. NaN never compares equal to itself, so
    # compare the null positions and the non-null values separately.
    null_mask = pd.isnull(np.asarray(X))
    assert (null_mask == pd.isnull(np.asarray(Xc))).all()
    assert (np.asarray(X)[~null_mask] == np.asarray(Xc)[~null_mask]).all()

    assert isinstance(Xt, np.ndarray)
    assert len(X) == len(Xt)
    assert not pd.isnull(Xt).any()

    # 'b' is the most frequent value, so it replaces the missing entry.
    assert Xt.tolist() == ['a', 'b', 'b', 'b']

@pytest.mark.parametrize('none_value', [None, np.nan])
def test_integration(none_value):
    """
    Run CategoricalImputer inside a DataFrameMapper on a two-column frame.

    The 'cat' column is imputed while 'num' is passed through; the output
    must contain no nulls, keep 'num' and the non-null 'cat' entries as
    they were, and fill the null 'cat' entry with the column's mode.
    """
    frame = pd.DataFrame({'cat': ['a', 'a', 'a', none_value, 'b'],
                          'num': [1, 2, 3, 4, 5]})

    steps = [
        ('cat', CategoricalImputer()),
        ('num', None)
    ]
    fitted_mapper = DataFrameMapper(steps, df_out=True).fit(frame)
    transformed = fitted_mapper.transform(frame)

    assert pd.notnull(transformed).all().all()

    present = pd.notnull(frame['cat'])
    missing = ~present
    most_frequent = frame['cat'].mode().values[0]

    assert (frame['num'] == transformed['num']).all()

    assert (frame['cat'][present] == transformed['cat'][present]).all()
    assert (transformed['cat'][missing] == most_frequent).all()