Skip to content

ENH: GH12042 Add parameter drop_first to get_dummies to get n-1 variables out of n levels. #12092

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions doc/source/reshaping.rst
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,32 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways
from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'})
from_dict

.. versionadded:: 0.18.0

Sometimes it will be useful to only keep k-1 levels of a categorical
variable to avoid collinearity when feeding the result to statistical models.
You can switch to this mode by turning on ``drop_first``.

.. ipython:: python

s = pd.Series(list('abcaa'))

pd.get_dummies(s)

pd.get_dummies(s, drop_first=True)

When a column contains only one level, it will be omitted in the result.

.. ipython:: python

df = pd.DataFrame({'A':list('aaaaa'),'B':list('ababc')})

pd.get_dummies(df)

pd.get_dummies(df, drop_first=True)



Factorizing values
------------------

Expand Down
37 changes: 33 additions & 4 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -944,7 +944,7 @@ def melt_stub(df, stub, i, j):


def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
columns=None, sparse=False):
columns=None, sparse=False, drop_first=False):
"""
Convert categorical variable into dummy/indicator variables

Expand All @@ -971,7 +971,11 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
Otherwise returns a DataFrame with some SparseBlocks.

.. versionadded:: 0.16.1
drop_first : bool, default False
Whether to get k-1 dummies out of k categorical levels by removing the
first level.

.. versionadded:: 0.18.0

Returns
-------
dummies : DataFrame or SparseDataFrame
Expand Down Expand Up @@ -1011,6 +1015,21 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
1 2 0 1 1 0 0
2 3 1 0 0 0 1

>>> pd.get_dummies(pd.Series(list('abcaa')))
a b c
0 1 0 0
1 0 1 0
2 0 0 1
3 1 0 0
4 1 0 0

>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
b c
0 0 0
1 1 0
2 0 1
3 0 0
4 0 0

See also ``Series.str.get_dummies``.

"""
Expand Down Expand Up @@ -1060,17 +1079,18 @@ def check_len(item, name):
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):

dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
dummy_na=dummy_na, sparse=sparse)
dummy_na=dummy_na, sparse=sparse,
drop_first=drop_first)
with_dummies.append(dummy)
result = concat(with_dummies, axis=1)
else:
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
sparse=sparse)
sparse=sparse, drop_first=drop_first)
return result


def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
sparse=False):
sparse=False, drop_first=False):
# Series avoids inconsistent NaN handling
cat = Categorical.from_array(Series(data), ordered=True)
levels = cat.categories
Expand Down Expand Up @@ -1113,6 +1133,11 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
continue
sp_indices[code].append(ndx)

if drop_first:
# remove first categorical level to avoid perfect collinearity
# GH12042
sp_indices = sp_indices[1:]
dummy_cols = dummy_cols[1:]
for col, ixs in zip(dummy_cols, sp_indices):
sarr = SparseArray(np.ones(len(ixs)),
sparse_index=IntIndex(N, ixs), fill_value=0)
Expand All @@ -1127,6 +1152,10 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
# reset NaN GH4446
dummy_mat[codes == -1] = 0

if drop_first:
# remove first GH12042
dummy_mat = dummy_mat[:, 1:]
dummy_cols = dummy_cols[1:]
return DataFrame(dummy_mat, index=index, columns=dummy_cols)


Expand Down
90 changes: 90 additions & 0 deletions pandas/tests/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,96 @@ def test_dataframe_dummies_with_categorical(self):
]]
assert_frame_equal(result, expected)

# GH12042 Add a new parameter `drop_first` to avoid collinearity
def test_basic_drop_first(self):
# Basic case
s_list = list('abc')
s_series = Series(s_list)
s_series_index = Series(s_list, list('ABC'))

expected = DataFrame({'b': {0: 0.0,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add the issue number as a comment

1: 1.0,
2: 0.0},
'c': {0: 0.0,
1: 0.0,
2: 1.0}})

result = get_dummies(s_list, sparse=self.sparse, drop_first=True)
assert_frame_equal(result, expected)

result = get_dummies(s_series, sparse=self.sparse, drop_first=True)
assert_frame_equal(result, expected)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we have the empty case tested somewhere?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean the case that we only have 1 level in a categorical variable? I will add a case to test this - the result should be empty.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep. also test a completely empty frame as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback By "empty frame", do you mean this case

In [1]: import pandas as pd

In [2]: pd.get_dummies(pd.Series())
Out[2]:
Empty DataFrame
Columns: []
Index: []

In [3]: pd.get_dummies(pd.Series(), drop_first=True)
Out[3]:
Empty DataFrame
Columns: []
Index: []

Or this case

In [4]: pd.get_dummies(pd.DataFrame())

But this will raise an error

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-6-059be6e25969> in <module>()
----> 1 pd.get_dummies(pd.DataFrame())

C:\D\Projects\Github\pandas\pandas\core\reshape.py in get_dummies(data, prefix,
prefix_sep, dummy_na, columns, sparse, drop_first)
   1083                                     drop_first=drop_first)
   1084             with_dummies.append(dummy)
-> 1085         result = concat(with_dummies, axis=1)
   1086     else:
   1087         result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,

C:\D\Projects\Github\pandas\pandas\tools\merge.py in concat(objs, axis, join, jo
in_axes, ignore_index, keys, levels, names, verify_integrity, copy)
    833                        keys=keys, levels=levels, names=names,
    834                        verify_integrity=verify_integrity,
--> 835                        copy=copy)
    836     return op.get_result()
    837

C:\D\Projects\Github\pandas\pandas\tools\merge.py in __init__(self, objs, axis,
join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
    866
    867         if len(objs) == 0:
--> 868             raise ValueError('No objects to concatenate')
    869
    870         if keys is None:

ValueError: No objects to concatenate

expected.index = list('ABC')
result = get_dummies(s_series_index, sparse=self.sparse, drop_first=True)
assert_frame_equal(result, expected)

# Test the case that categorical variable only has one level.
def test_basic_drop_first_one_level(self):
result = get_dummies(list('aaa'), sparse=self.sparse, drop_first=True)
self.assertEqual(result.empty, True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

compare this with an actual constructed empty frame as this will verify that the indexes are correct.


def test_basic_drop_first_NA(self):
# Test NA handling together with drop_first
s_NA = ['a', 'b', np.nan]
res = get_dummies(s_NA, sparse=self.sparse, drop_first=True)
exp = DataFrame({'b': {0: 0.0,
1: 1.0,
2: 0.0}})
assert_frame_equal(res, exp)

# Sparse dataframes do not allow nan labelled columns, see #GH8822
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so what does sparse do in that case? pls tests that as well

res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse,
drop_first=True)
exp_na = DataFrame({'b': {0: 0.0,
1: 1.0,
2: 0.0},
nan: {0: 0.0,
1: 0.0,
2: 1.0}}).reindex_axis(
['b', nan], 1)
assert_frame_equal(res_na, exp_na)

res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse,
drop_first=True)
tm.assert_numpy_array_equal(res_just_na.empty, True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this doesn't make any sense. again compare against an expected frame.


def test_dataframe_dummies_drop_first(self):
df = self.df[['A', 'B']]
result = get_dummies(df, sparse=self.sparse, drop_first=True)
expected = DataFrame({'A_b': [0., 1, 0],
'B_c': [0., 0, 1]})
assert_frame_equal(result, expected)

def test_dataframe_dummies_drop_first_with_categorical(self):
df = self.df
df['cat'] = pd.Categorical(['x', 'y', 'y'])
result = get_dummies(df, sparse=self.sparse, drop_first=True)
expected = DataFrame({'C': [1, 2, 3],
'A_b': [0., 1, 0],
'B_c': [0., 0, 1],
'cat_y': [0., 1, 1]})
expected = expected[['C', 'A_b', 'B_c', 'cat_y']]
assert_frame_equal(result, expected)

def test_dataframe_dummies_drop_first_with_na(self):
df = self.df
df.loc[3, :] = [np.nan, np.nan, np.nan]
result = get_dummies(df, dummy_na=True, sparse=self.sparse,
drop_first=True)
expected = DataFrame({'C': [1, 2, 3, np.nan],
'A_b': [0., 1, 0, 0],
'A_nan': [0., 0, 0, 1],
'B_c': [0., 0, 1, 0],
'B_nan': [0., 0, 0, 1]})
expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']]
assert_frame_equal(result, expected)

result = get_dummies(df, dummy_na=False, sparse=self.sparse,
drop_first=True)
expected = expected[['C', 'A_b', 'B_c']]
assert_frame_equal(result, expected)


class TestGetDummiesSparse(TestGetDummies):
sparse = True
Expand Down