-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: GH12042 Add parameter drop_first
to get_dummies to get n-1 variables out of n levels.
#12092
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -411,6 +411,111 @@ def test_dataframe_dummies_with_categorical(self): | |
]] | ||
assert_frame_equal(result, expected) | ||
|
||
# GH12042 Add a new parameter `drop_first` to avoid collinearity | ||
def test_basic_drop_first(self): | ||
# Basic case | ||
s_list = list('abc') | ||
s_series = Series(s_list) | ||
s_series_index = Series(s_list, list('ABC')) | ||
|
||
expected = DataFrame({'b': {0: 0.0, | ||
1: 1.0, | ||
2: 0.0}, | ||
'c': {0: 0.0, | ||
1: 0.0, | ||
2: 1.0}}) | ||
|
||
result = get_dummies(s_list, sparse=self.sparse, drop_first=True) | ||
assert_frame_equal(result, expected) | ||
|
||
result = get_dummies(s_series, sparse=self.sparse, drop_first=True) | ||
assert_frame_equal(result, expected) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. do we have the empty case tested somewhere? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You mean the case where we only have 1 level in a categorical variable? I will add a case to test this - the result should be empty. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. yep. also test a completely empty frame as well. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @jreback By "empty frame", do you mean this case: In [1]: import pandas as pd
In [2]: pd.get_dummies(pd.Series())
Out[2]:
Empty DataFrame
Columns: []
Index: []
In [3]: pd.get_dummies(pd.Series(), drop_first=True)
Out[3]:
Empty DataFrame
Columns: []
Index: [] Or this case In [4]: pd.get_dummies(pd.DataFrame()) But this will raise an error ---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-6-059be6e25969> in <module>()
----> 1 pd.get_dummies(pd.DataFrame())
C:\D\Projects\Github\pandas\pandas\core\reshape.py in get_dummies(data, prefix,
prefix_sep, dummy_na, columns, sparse, drop_first)
1083 drop_first=drop_first)
1084 with_dummies.append(dummy)
-> 1085 result = concat(with_dummies, axis=1)
1086 else:
1087 result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
C:\D\Projects\Github\pandas\pandas\tools\merge.py in concat(objs, axis, join, jo
in_axes, ignore_index, keys, levels, names, verify_integrity, copy)
833 keys=keys, levels=levels, names=names,
834 verify_integrity=verify_integrity,
--> 835 copy=copy)
836 return op.get_result()
837
C:\D\Projects\Github\pandas\pandas\tools\merge.py in __init__(self, objs, axis,
join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
866
867 if len(objs) == 0:
--> 868 raise ValueError('No objects to concatenate')
869
870 if keys is None:
ValueError: No objects to concatenate |
||
expected.index = list('ABC') | ||
result = get_dummies(s_series_index, sparse=self.sparse, | ||
drop_first=True) | ||
assert_frame_equal(result, expected) | ||
|
||
def test_basic_drop_first_one_level(self): | ||
# Test the case where the categorical variable has only one level. | ||
s_list = list('aaa') | ||
s_series = Series(s_list) | ||
s_series_index = Series(s_list, list('ABC')) | ||
|
||
expected = DataFrame(index=np.arange(3)) | ||
|
||
result = get_dummies(s_list, sparse=self.sparse, drop_first=True) | ||
assert_frame_equal(result, expected) | ||
|
||
result = get_dummies(s_series, sparse=self.sparse, drop_first=True) | ||
assert_frame_equal(result, expected) | ||
|
||
expected = DataFrame(index=list('ABC')) | ||
result = get_dummies(s_series_index, sparse=self.sparse, | ||
drop_first=True) | ||
assert_frame_equal(result, expected) | ||
|
||
def test_basic_drop_first_NA(self): | ||
# Test NA handling together with drop_first | ||
s_NA = ['a', 'b', np.nan] | ||
res = get_dummies(s_NA, sparse=self.sparse, drop_first=True) | ||
exp = DataFrame({'b': {0: 0.0, | ||
1: 1.0, | ||
2: 0.0}}) | ||
assert_frame_equal(res, exp) | ||
|
||
res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse, | ||
drop_first=True) | ||
exp_na = DataFrame({'b': {0: 0.0, | ||
1: 1.0, | ||
2: 0.0}, | ||
nan: {0: 0.0, | ||
1: 0.0, | ||
2: 1.0}}).reindex_axis( | ||
['b', nan], 1) | ||
assert_frame_equal(res_na, exp_na) | ||
|
||
res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse, | ||
drop_first=True) | ||
exp_just_na = DataFrame(index=np.arange(1)) | ||
assert_frame_equal(res_just_na, exp_just_na) | ||
|
||
def test_dataframe_dummies_drop_first(self): | ||
df = self.df[['A', 'B']] | ||
result = get_dummies(df, sparse=self.sparse, drop_first=True) | ||
expected = DataFrame({'A_b': [0., 1, 0], | ||
'B_c': [0., 0, 1]}) | ||
assert_frame_equal(result, expected) | ||
|
||
def test_dataframe_dummies_drop_first_with_categorical(self): | ||
df = self.df | ||
df['cat'] = pd.Categorical(['x', 'y', 'y']) | ||
result = get_dummies(df, sparse=self.sparse, drop_first=True) | ||
expected = DataFrame({'C': [1, 2, 3], | ||
'A_b': [0., 1, 0], | ||
'B_c': [0., 0, 1], | ||
'cat_y': [0., 1, 1]}) | ||
expected = expected[['C', 'A_b', 'B_c', 'cat_y']] | ||
assert_frame_equal(result, expected) | ||
|
||
def test_dataframe_dummies_drop_first_with_na(self): | ||
df = self.df | ||
df.loc[3, :] = [np.nan, np.nan, np.nan] | ||
result = get_dummies(df, dummy_na=True, sparse=self.sparse, | ||
drop_first=True) | ||
expected = DataFrame({'C': [1, 2, 3, np.nan], | ||
'A_b': [0., 1, 0, 0], | ||
'A_nan': [0., 0, 0, 1], | ||
'B_c': [0., 0, 1, 0], | ||
'B_nan': [0., 0, 0, 1]}) | ||
expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] | ||
assert_frame_equal(result, expected) | ||
|
||
result = get_dummies(df, dummy_na=False, sparse=self.sparse, | ||
drop_first=True) | ||
expected = expected[['C', 'A_b', 'B_c']] | ||
assert_frame_equal(result, expected) | ||
|
||
|
||
class TestGetDummiesSparse(TestGetDummies): | ||
sparse = True | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add the issue number as a comment