Skip to content

ENH: GH12042 Add parameter drop_first to get_dummies to get n-1 variables out of n levels. #12092

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions doc/source/reshaping.rst
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,32 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways
from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'})
from_dict

.. versionadded:: 0.18.0

Sometimes it will be useful to only keep k-1 levels of a categorical
variable to avoid collinearity when feeding the result to statistical models.
You can switch to this mode by turning on ``drop_first``.

.. ipython:: python

s = pd.Series(list('abcaa'))

pd.get_dummies(s)

pd.get_dummies(s, drop_first=True)

When a column contains only one level, it will be omitted in the result.

.. ipython:: python

df = pd.DataFrame({'A':list('aaaaa'),'B':list('ababc')})

pd.get_dummies(df)

pd.get_dummies(df, drop_first=True)



Factorizing values
------------------

Expand Down
48 changes: 42 additions & 6 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -944,7 +944,7 @@ def melt_stub(df, stub, i, j):


def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
columns=None, sparse=False):
columns=None, sparse=False, drop_first=False):
"""
Convert categorical variable into dummy/indicator variables

Expand All @@ -971,7 +971,11 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
Otherwise returns a DataFrame with some SparseBlocks.

.. versionadded:: 0.16.1
drop_first : bool, default False
Whether to get k-1 dummies out of k categorical levels by removing the
first level.

.. versionadded:: 0.18.0

Returns
-------
dummies : DataFrame or SparseDataFrame
Expand Down Expand Up @@ -1011,6 +1015,21 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
1 2 0 1 1 0 0
2 3 1 0 0 0 1

>>> pd.get_dummies(pd.Series(list('abcaa')))
a b c
0 1 0 0
1 0 1 0
2 0 0 1
3 1 0 0
4 1 0 0

>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
b c
0 0 0
1 1 0
2 0 1
3 0 0
4 0 0

See also ``Series.str.get_dummies``.

"""
Expand Down Expand Up @@ -1060,23 +1079,23 @@ def check_len(item, name):
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):

dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
dummy_na=dummy_na, sparse=sparse)
dummy_na=dummy_na, sparse=sparse,
drop_first=drop_first)
with_dummies.append(dummy)
result = concat(with_dummies, axis=1)
else:
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
sparse=sparse)
sparse=sparse, drop_first=drop_first)
return result


def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
sparse=False):
sparse=False, drop_first=False):
# Series avoids inconsistent NaN handling
cat = Categorical.from_array(Series(data), ordered=True)
levels = cat.categories

# if all NaN
if not dummy_na and len(levels) == 0:
def get_empty_Frame(data, sparse):
if isinstance(data, Series):
index = data.index
else:
Expand All @@ -1086,11 +1105,19 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
else:
return SparseDataFrame(index=index)

# if all NaN
if not dummy_na and len(levels) == 0:
return get_empty_Frame(data, sparse)

codes = cat.codes.copy()
if dummy_na:
codes[codes == -1] = len(cat.categories)
levels = np.append(cat.categories, np.nan)

# if dummy_na, we just fake a nan level. drop_first will drop it again
if drop_first and len(levels) == 1:
return get_empty_Frame(data, sparse)

number_of_cols = len(levels)

if prefix is not None:
Expand All @@ -1113,6 +1140,11 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
continue
sp_indices[code].append(ndx)

if drop_first:
# remove first categorical level to avoid perfect collinearity
# GH12042
sp_indices = sp_indices[1:]
dummy_cols = dummy_cols[1:]
for col, ixs in zip(dummy_cols, sp_indices):
sarr = SparseArray(np.ones(len(ixs)),
sparse_index=IntIndex(N, ixs), fill_value=0)
Expand All @@ -1127,6 +1159,10 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
# reset NaN GH4446
dummy_mat[codes == -1] = 0

if drop_first:
# remove first GH12042
dummy_mat = dummy_mat[:, 1:]
dummy_cols = dummy_cols[1:]
return DataFrame(dummy_mat, index=index, columns=dummy_cols)


Expand Down
105 changes: 105 additions & 0 deletions pandas/tests/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,111 @@ def test_dataframe_dummies_with_categorical(self):
]]
assert_frame_equal(result, expected)

# GH12042 Add a new parameter `drop_first` to avoid collinearity
def test_basic_drop_first(self):
# Basic case
s_list = list('abc')
s_series = Series(s_list)
s_series_index = Series(s_list, list('ABC'))

expected = DataFrame({'b': {0: 0.0,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add the issue number as a comment

1: 1.0,
2: 0.0},
'c': {0: 0.0,
1: 0.0,
2: 1.0}})

result = get_dummies(s_list, sparse=self.sparse, drop_first=True)
assert_frame_equal(result, expected)

result = get_dummies(s_series, sparse=self.sparse, drop_first=True)
assert_frame_equal(result, expected)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we have the empty case tested somewhere?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean the case that we only have 1 level in a categorical variable? I will add a case to test this - the result should be empty.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep. also test a completely empty frame as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback By "empty frame", do you mean this case

In [1]: import pandas as pd

In [2]: pd.get_dummies(pd.Series())
Out[2]:
Empty DataFrame
Columns: []
Index: []

In [3]: pd.get_dummies(pd.Series(), drop_first=True)
Out[3]:
Empty DataFrame
Columns: []
Index: []

Or this case

In [4]: pd.get_dummies(pd.DataFrame())

But this will raise an error

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-6-059be6e25969> in <module>()
----> 1 pd.get_dummies(pd.DataFrame())

C:\D\Projects\Github\pandas\pandas\core\reshape.py in get_dummies(data, prefix,
prefix_sep, dummy_na, columns, sparse, drop_first)
   1083                                     drop_first=drop_first)
   1084             with_dummies.append(dummy)
-> 1085         result = concat(with_dummies, axis=1)
   1086     else:
   1087         result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,

C:\D\Projects\Github\pandas\pandas\tools\merge.py in concat(objs, axis, join, jo
in_axes, ignore_index, keys, levels, names, verify_integrity, copy)
    833                        keys=keys, levels=levels, names=names,
    834                        verify_integrity=verify_integrity,
--> 835                        copy=copy)
    836     return op.get_result()
    837

C:\D\Projects\Github\pandas\pandas\tools\merge.py in __init__(self, objs, axis,
join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
    866
    867         if len(objs) == 0:
--> 868             raise ValueError('No objects to concatenate')
    869
    870         if keys is None:

ValueError: No objects to concatenate

expected.index = list('ABC')
result = get_dummies(s_series_index, sparse=self.sparse,
drop_first=True)
assert_frame_equal(result, expected)

def test_basic_drop_first_one_level(self):
    # A variable with a single level: with drop_first=True the expected
    # frame has no columns and only carries the index of the input.
    values = list('aaa')
    plain_series = Series(values)
    labelled_series = Series(values, list('ABC'))

    # list input and default-indexed Series share the same expectation
    expected_default = DataFrame(index=np.arange(3))
    for source in (values, plain_series):
        result = get_dummies(source, sparse=self.sparse, drop_first=True)
        assert_frame_equal(result, expected_default)

    # a Series with explicit labels keeps those labels as the index
    expected_labelled = DataFrame(index=list('ABC'))
    result = get_dummies(labelled_series, sparse=self.sparse,
                         drop_first=True)
    assert_frame_equal(result, expected_labelled)

def test_basic_drop_first_NA(self):
    # NA handling combined with drop_first
    values = ['a', 'b', np.nan]
    b_indicator = {0: 0.0, 1: 1.0, 2: 0.0}

    # dummy_na left at its default: only the 'b' column is expected
    observed = get_dummies(values, sparse=self.sparse, drop_first=True)
    wanted = DataFrame({'b': b_indicator})
    assert_frame_equal(observed, wanted)

    # dummy_na=True: a NaN indicator column is expected after 'b'
    observed_na = get_dummies(values, dummy_na=True, sparse=self.sparse,
                              drop_first=True)
    wanted_na = DataFrame({'b': b_indicator,
                           nan: {0: 0.0, 1: 0.0, 2: 1.0}})
    wanted_na = wanted_na.reindex_axis(['b', nan], 1)
    assert_frame_equal(observed_na, wanted_na)

    # input holding nothing but NaN: expected frame has no columns
    observed_only_na = get_dummies([nan], dummy_na=True,
                                   sparse=self.sparse, drop_first=True)
    wanted_only_na = DataFrame(index=np.arange(1))
    assert_frame_equal(observed_only_na, wanted_only_na)

def test_dataframe_dummies_drop_first(self):
    # drop_first on the 'A' and 'B' columns of the fixture frame:
    # the expected result holds just the A_b and B_c indicators.
    subset = self.df[['A', 'B']]
    expected = DataFrame({'A_b': [0., 1, 0],
                          'B_c': [0., 0, 1]})
    result = get_dummies(subset, sparse=self.sparse, drop_first=True)
    assert_frame_equal(result, expected)

def test_dataframe_dummies_drop_first_with_categorical(self):
    # Adds a Categorical column to the fixture frame (written in place
    # on self.df) and checks it is encoded alongside 'A' and 'B'.
    frame = self.df
    frame['cat'] = pd.Categorical(['x', 'y', 'y'])

    column_order = ['C', 'A_b', 'B_c', 'cat_y']
    expected = DataFrame({'C': [1, 2, 3],
                          'A_b': [0., 1, 0],
                          'B_c': [0., 0, 1],
                          'cat_y': [0., 1, 1]})[column_order]

    result = get_dummies(frame, sparse=self.sparse, drop_first=True)
    assert_frame_equal(result, expected)

def test_dataframe_dummies_drop_first_with_na(self):
    # Appends an all-NaN row to the fixture frame (written in place on
    # self.df), then checks drop_first with and without dummy_na.
    frame = self.df
    frame.loc[3, :] = [np.nan, np.nan, np.nan]

    # dummy_na=True: each encoded column also gets a *_nan indicator
    columns_with_na = ['C', 'A_b', 'A_nan', 'B_c', 'B_nan']
    expected_with_na = DataFrame({'C': [1, 2, 3, np.nan],
                                  'A_b': [0., 1, 0, 0],
                                  'A_nan': [0., 0, 0, 1],
                                  'B_c': [0., 0, 1, 0],
                                  'B_nan': [0., 0, 0, 1]})[columns_with_na]
    result = get_dummies(frame, dummy_na=True, sparse=self.sparse,
                         drop_first=True)
    assert_frame_equal(result, expected_with_na)

    # dummy_na=False: same expectation minus the *_nan indicators
    expected_without_na = expected_with_na[['C', 'A_b', 'B_c']]
    result = get_dummies(frame, dummy_na=False, sparse=self.sparse,
                         drop_first=True)
    assert_frame_equal(result, expected_without_na)


class TestGetDummiesSparse(TestGetDummies):
sparse = True
Expand Down