Skip to content

Commit 45f14e8

Browse files
committed
ENH: GH12042 Add parameter drop_first to get_dummies to get k-1 variables out of n levels.
1 parent 9bc8243 commit 45f14e8

File tree

3 files changed

+143
-3
lines changed

3 files changed

+143
-3
lines changed

doc/source/reshaping.rst

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,32 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways
518518
from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'})
519519
from_dict
520520
521+
.. versionadded:: 0.18.0
522+
523+
Sometimes it will be useful to only keep k-1 levels of a categorical
524+
variable to avoid collinearity when feeding the result to statistical models.
525+
You can switch to this mode by turning on ``drop_first``.
526+
527+
.. ipython:: python
528+
529+
s = pd.Series(list('abcaa'))
530+
531+
pd.get_dummies(s)
532+
533+
pd.get_dummies(s, drop_first=True)
534+
535+
When a column contains only one level, it will be omitted in the result.
536+
537+
.. ipython:: python
538+
539+
df = pd.DataFrame({'A':list('aaaaa'),'B':list('ababc')})
540+
541+
pd.get_dummies(df)
542+
543+
pd.get_dummies(df, drop_first=True)
544+
545+
546+
521547
Factorizing values
522548
------------------
523549

pandas/core/reshape.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -971,7 +971,11 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
971971
Otherwise returns a DataFrame with some SparseBlocks.
972972
973973
.. versionadded:: 0.16.1
974+
drop_first : bool, default False
975+
Whether to get k-1 dummies out of k categorical levels by removing the
976+
first level.
974977
978+
.. versionadded:: 0.18.0
975979
Returns
976980
-------
977981
dummies : DataFrame or SparseDataFrame
@@ -1011,6 +1015,21 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
10111015
1 2 0 1 1 0 0
10121016
2 3 1 0 0 0 1
10131017
1018+
>>> pd.get_dummies(pd.Series(list('abcaa')))
1019+
a b c
1020+
0 1 0 0
1021+
1 0 1 0
1022+
2 0 0 1
1023+
3 1 0 0
1024+
4 1 0 0
1025+
1026+
>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
1027+
b c
1028+
0 0 0
1029+
1 1 0
1030+
2 0 1
1031+
3 0 0
1032+
4 0 0
10141033
See also ``Series.str.get_dummies``.
10151034
10161035
"""
@@ -1060,17 +1079,18 @@ def check_len(item, name):
10601079
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):
10611080

10621081
dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
1063-
dummy_na=dummy_na, sparse=sparse)
1082+
dummy_na=dummy_na, sparse=sparse,
1083+
drop_first=drop_first)
10641084
with_dummies.append(dummy)
10651085
result = concat(with_dummies, axis=1)
10661086
else:
10671087
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
1068-
sparse=sparse)
1088+
sparse=sparse, drop_first=drop_first)
10691089
return result
10701090

10711091

10721092
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
1073-
sparse=False):
1093+
sparse=False, drop_first=False):
10741094
# Series avoids inconsistent NaN handling
10751095
cat = Categorical.from_array(Series(data), ordered=True)
10761096
levels = cat.categories
@@ -1113,6 +1133,11 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
11131133
continue
11141134
sp_indices[code].append(ndx)
11151135

1136+
if drop_first:
1137+
# remove first categorical level to avoid perfect collinearity
1138+
# GH12042
1139+
sp_indices = sp_indices[1:]
1140+
dummy_cols = dummy_cols[1:]
11161141
for col, ixs in zip(dummy_cols, sp_indices):
11171142
sarr = SparseArray(np.ones(len(ixs)),
11181143
sparse_index=IntIndex(N, ixs), fill_value=0)
@@ -1127,6 +1152,10 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
11271152
# reset NaN GH4446
11281153
dummy_mat[codes == -1] = 0
11291154

1155+
if drop_first:
1156+
# remove first GH12042
1157+
dummy_mat = dummy_mat[:, 1:]
1158+
dummy_cols = dummy_cols[1:]
11301159
return DataFrame(dummy_mat, index=index, columns=dummy_cols)
11311160

11321161

pandas/tests/test_reshape.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,91 @@ def test_dataframe_dummies_with_categorical(self):
411411
]]
412412
assert_frame_equal(result, expected)
413413

414+
# GH12042 Add a new parameter `drop_first` to avoid collinearity
415+
def test_basic_drop_first(self):
416+
# Basic case
417+
s_list = list('abc')
418+
s_series = Series(s_list)
419+
s_series_index = Series(s_list, list('ABC'))
420+
421+
expected = DataFrame({'b': {0: 0.0,
422+
1: 1.0,
423+
2: 0.0},
424+
'c': {0: 0.0,
425+
1: 0.0,
426+
2: 1.0}})
427+
428+
result = get_dummies(s_list, sparse=self.sparse, drop_first=True)
429+
assert_frame_equal(result, expected)
430+
431+
result = get_dummies(s_series, sparse=self.sparse, drop_first=True)
432+
assert_frame_equal(result, expected)
433+
434+
expected.index = list('ABC')
435+
result = get_dummies(s_series_index, sparse=self.sparse, drop_first=True)
436+
assert_frame_equal(result, expected)
437+
438+
def test_basic_drop_first_NA(self):
439+
# Test NA handling together with drop_first
440+
s_NA = ['a', 'b', np.nan]
441+
res = get_dummies(s_NA, sparse=self.sparse, drop_first=True)
442+
exp = DataFrame({'b': {0: 0.0,
443+
1: 1.0,
444+
2: 0.0}})
445+
assert_frame_equal(res, exp)
446+
447+
# Sparse dataframes do not allow nan labelled columns, see #GH8822
448+
res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse,
449+
drop_first=True)
450+
exp_na = DataFrame({'b': {0: 0.0,
451+
1: 1.0,
452+
2: 0.0},
453+
nan: {0: 0.0,
454+
1: 0.0,
455+
2: 1.0}}).reindex_axis(
456+
['b', nan], 1)
457+
assert_frame_equal(res_na, exp_na)
458+
459+
res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse,
460+
drop_first=True)
461+
tm.assert_numpy_array_equal(res_just_na.empty, True)
462+
463+
def test_dataframe_dummies_drop_first(self):
464+
df = self.df[['A', 'B']]
465+
result = get_dummies(df, sparse=self.sparse, drop_first=True)
466+
expected = DataFrame({'A_b': [0., 1, 0],
467+
'B_c': [0., 0, 1]})
468+
assert_frame_equal(result, expected)
469+
470+
def test_dataframe_dummies_drop_first_with_categorical(self):
471+
df = self.df
472+
df['cat'] = pd.Categorical(['x', 'y', 'y'])
473+
result = get_dummies(df, sparse=self.sparse, drop_first=True)
474+
expected = DataFrame({'C': [1, 2, 3],
475+
'A_b': [0., 1, 0],
476+
'B_c': [0., 0, 1],
477+
'cat_y': [0., 1, 1]})
478+
expected = expected[['C', 'A_b', 'B_c', 'cat_y']]
479+
assert_frame_equal(result, expected)
480+
481+
def test_dataframe_dummies_drop_first_with_na(self):
482+
df = self.df
483+
df.loc[3, :] = [np.nan, np.nan, np.nan]
484+
result = get_dummies(df, dummy_na=True, sparse=self.sparse,
485+
drop_first=True)
486+
expected = DataFrame({'C': [1, 2, 3, np.nan],
487+
'A_b': [0., 1, 0, 0],
488+
'A_nan': [0., 0, 0, 1],
489+
'B_c': [0., 0, 1, 0],
490+
'B_nan': [0., 0, 0, 1]})
491+
expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']]
492+
assert_frame_equal(result, expected)
493+
494+
result = get_dummies(df, dummy_na=False, sparse=self.sparse,
495+
drop_first=True)
496+
expected = expected[['C', 'A_b', 'B_c']]
497+
assert_frame_equal(result, expected)
498+
414499

415500
class TestGetDummiesSparse(TestGetDummies):
416501
sparse = True

0 commit comments

Comments
 (0)