Skip to content

Commit 77b3a4e

Browse files
BranYangcldy
authored and committed
ENH: GH12042 Add parameter drop_first to get_dummies to get n-1 variables out of n levels.
closes pandas-dev#12042 Sometimes it's useful to only accept n-1 variables out of n categorical levels. Author: Bran Yang <[email protected]> Closes pandas-dev#12092 from BranYang/master and squashes the following commits: 0528c57 [Bran Yang] Compare with empty DataFrame, not just check empty 0d99c2a [Bran Yang] Test the case that `drop_first` is on and categorical variable only has one level. 45f14e8 [Bran Yang] ENH: GH12042 Add parameter `drop_first` to get_dummies to get k-1 variables out of n levels.
1 parent f791b1b commit 77b3a4e

File tree

3 files changed

+173
-6
lines changed

3 files changed

+173
-6
lines changed

doc/source/reshaping.rst

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,32 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways
539539
from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'})
540540
from_dict
541541
542+
.. versionadded:: 0.18.0
543+
544+
Sometimes it will be useful to only keep k-1 levels of a categorical
545+
variable to avoid collinearity when feeding the result to statistical models.
546+
You can switch to this mode by turning on ``drop_first``.
547+
548+
.. ipython:: python
549+
550+
s = pd.Series(list('abcaa'))
551+
552+
pd.get_dummies(s)
553+
554+
pd.get_dummies(s, drop_first=True)
555+
556+
When a column contains only one level, it will be omitted in the result.
557+
558+
.. ipython:: python
559+
560+
df = pd.DataFrame({'A':list('aaaaa'),'B':list('ababc')})
561+
562+
pd.get_dummies(df)
563+
564+
pd.get_dummies(df, drop_first=True)
565+
566+
567+
542568
Factorizing values
543569
------------------
544570

pandas/core/reshape.py

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -950,7 +950,7 @@ def melt_stub(df, stub, i, j):
950950

951951

952952
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
953-
columns=None, sparse=False):
953+
columns=None, sparse=False, drop_first=False):
954954
"""
955955
Convert categorical variable into dummy/indicator variables
956956
@@ -977,7 +977,11 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
977977
Otherwise returns a DataFrame with some SparseBlocks.
978978
979979
.. versionadded:: 0.16.1
980+
drop_first : bool, default False
981+
Whether to get k-1 dummies out of k categorical levels by removing the
982+
first level.
980983
984+
.. versionadded:: 0.18.0
981985
Returns
982986
-------
983987
dummies : DataFrame or SparseDataFrame
@@ -1017,6 +1021,21 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
10171021
1 2 0 1 1 0 0
10181022
2 3 1 0 0 0 1
10191023
1024+
>>> pd.get_dummies(pd.Series(list('abcaa')))
1025+
a b c
1026+
0 1 0 0
1027+
1 0 1 0
1028+
2 0 0 1
1029+
3 1 0 0
1030+
4 1 0 0
1031+
1032+
>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
1033+
b c
1034+
0 0 0
1035+
1 1 0
1036+
2 0 1
1037+
3 0 0
1038+
4 0 0
10201039
See also ``Series.str.get_dummies``.
10211040
10221041
"""
@@ -1066,23 +1085,23 @@ def check_len(item, name):
10661085
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):
10671086

10681087
dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
1069-
dummy_na=dummy_na, sparse=sparse)
1088+
dummy_na=dummy_na, sparse=sparse,
1089+
drop_first=drop_first)
10701090
with_dummies.append(dummy)
10711091
result = concat(with_dummies, axis=1)
10721092
else:
10731093
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
1074-
sparse=sparse)
1094+
sparse=sparse, drop_first=drop_first)
10751095
return result
10761096

10771097

10781098
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
1079-
sparse=False):
1099+
sparse=False, drop_first=False):
10801100
# Series avoids inconsistent NaN handling
10811101
cat = Categorical.from_array(Series(data), ordered=True)
10821102
levels = cat.categories
10831103

1084-
# if all NaN
1085-
if not dummy_na and len(levels) == 0:
1104+
def get_empty_Frame(data, sparse):
10861105
if isinstance(data, Series):
10871106
index = data.index
10881107
else:
@@ -1092,11 +1111,19 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
10921111
else:
10931112
return SparseDataFrame(index=index)
10941113

1114+
# if all NaN
1115+
if not dummy_na and len(levels) == 0:
1116+
return get_empty_Frame(data, sparse)
1117+
10951118
codes = cat.codes.copy()
10961119
if dummy_na:
10971120
codes[codes == -1] = len(cat.categories)
10981121
levels = np.append(cat.categories, np.nan)
10991122

1123+
# if dummy_na, we just fake a nan level. drop_first will drop it again
1124+
if drop_first and len(levels) == 1:
1125+
return get_empty_Frame(data, sparse)
1126+
11001127
number_of_cols = len(levels)
11011128

11021129
if prefix is not None:
@@ -1119,6 +1146,11 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
11191146
continue
11201147
sp_indices[code].append(ndx)
11211148

1149+
if drop_first:
1150+
# remove first categorical level to avoid perfect collinearity
1151+
# GH12042
1152+
sp_indices = sp_indices[1:]
1153+
dummy_cols = dummy_cols[1:]
11221154
for col, ixs in zip(dummy_cols, sp_indices):
11231155
sarr = SparseArray(np.ones(len(ixs)),
11241156
sparse_index=IntIndex(N, ixs), fill_value=0)
@@ -1133,6 +1165,10 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
11331165
# reset NaN GH4446
11341166
dummy_mat[codes == -1] = 0
11351167

1168+
if drop_first:
1169+
# remove first GH12042
1170+
dummy_mat = dummy_mat[:, 1:]
1171+
dummy_cols = dummy_cols[1:]
11361172
return DataFrame(dummy_mat, index=index, columns=dummy_cols)
11371173

11381174

pandas/tests/test_reshape.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,111 @@ def test_dataframe_dummies_with_categorical(self):
411411
]]
412412
assert_frame_equal(result, expected)
413413

414+
# GH12042 Add a new parameter `drop_first` to avoid collinearity
415+
def test_basic_drop_first(self):
416+
# Basic case
417+
s_list = list('abc')
418+
s_series = Series(s_list)
419+
s_series_index = Series(s_list, list('ABC'))
420+
421+
expected = DataFrame({'b': {0: 0.0,
422+
1: 1.0,
423+
2: 0.0},
424+
'c': {0: 0.0,
425+
1: 0.0,
426+
2: 1.0}})
427+
428+
result = get_dummies(s_list, sparse=self.sparse, drop_first=True)
429+
assert_frame_equal(result, expected)
430+
431+
result = get_dummies(s_series, sparse=self.sparse, drop_first=True)
432+
assert_frame_equal(result, expected)
433+
434+
expected.index = list('ABC')
435+
result = get_dummies(s_series_index, sparse=self.sparse,
436+
drop_first=True)
437+
assert_frame_equal(result, expected)
438+
439+
def test_basic_drop_first_one_level(self):
440+
# Test the case that categorical variable only has one level.
441+
s_list = list('aaa')
442+
s_series = Series(s_list)
443+
s_series_index = Series(s_list, list('ABC'))
444+
445+
expected = DataFrame(index=np.arange(3))
446+
447+
result = get_dummies(s_list, sparse=self.sparse, drop_first=True)
448+
assert_frame_equal(result, expected)
449+
450+
result = get_dummies(s_series, sparse=self.sparse, drop_first=True)
451+
assert_frame_equal(result, expected)
452+
453+
expected = DataFrame(index=list('ABC'))
454+
result = get_dummies(s_series_index, sparse=self.sparse,
455+
drop_first=True)
456+
assert_frame_equal(result, expected)
457+
458+
def test_basic_drop_first_NA(self):
459+
# Test NA handling together with drop_first
460+
s_NA = ['a', 'b', np.nan]
461+
res = get_dummies(s_NA, sparse=self.sparse, drop_first=True)
462+
exp = DataFrame({'b': {0: 0.0,
463+
1: 1.0,
464+
2: 0.0}})
465+
assert_frame_equal(res, exp)
466+
467+
res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse,
468+
drop_first=True)
469+
exp_na = DataFrame({'b': {0: 0.0,
470+
1: 1.0,
471+
2: 0.0},
472+
nan: {0: 0.0,
473+
1: 0.0,
474+
2: 1.0}}).reindex_axis(
475+
['b', nan], 1)
476+
assert_frame_equal(res_na, exp_na)
477+
478+
res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse,
479+
drop_first=True)
480+
exp_just_na = DataFrame(index=np.arange(1))
481+
assert_frame_equal(res_just_na, exp_just_na)
482+
483+
def test_dataframe_dummies_drop_first(self):
484+
df = self.df[['A', 'B']]
485+
result = get_dummies(df, sparse=self.sparse, drop_first=True)
486+
expected = DataFrame({'A_b': [0., 1, 0],
487+
'B_c': [0., 0, 1]})
488+
assert_frame_equal(result, expected)
489+
490+
def test_dataframe_dummies_drop_first_with_categorical(self):
491+
df = self.df
492+
df['cat'] = pd.Categorical(['x', 'y', 'y'])
493+
result = get_dummies(df, sparse=self.sparse, drop_first=True)
494+
expected = DataFrame({'C': [1, 2, 3],
495+
'A_b': [0., 1, 0],
496+
'B_c': [0., 0, 1],
497+
'cat_y': [0., 1, 1]})
498+
expected = expected[['C', 'A_b', 'B_c', 'cat_y']]
499+
assert_frame_equal(result, expected)
500+
501+
def test_dataframe_dummies_drop_first_with_na(self):
502+
df = self.df
503+
df.loc[3, :] = [np.nan, np.nan, np.nan]
504+
result = get_dummies(df, dummy_na=True, sparse=self.sparse,
505+
drop_first=True)
506+
expected = DataFrame({'C': [1, 2, 3, np.nan],
507+
'A_b': [0., 1, 0, 0],
508+
'A_nan': [0., 0, 0, 1],
509+
'B_c': [0., 0, 1, 0],
510+
'B_nan': [0., 0, 0, 1]})
511+
expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']]
512+
assert_frame_equal(result, expected)
513+
514+
result = get_dummies(df, dummy_na=False, sparse=self.sparse,
515+
drop_first=True)
516+
expected = expected[['C', 'A_b', 'B_c']]
517+
assert_frame_equal(result, expected)
518+
414519

415520
class TestGetDummiesSparse(TestGetDummies):
416521
sparse = True

0 commit comments

Comments
 (0)