Skip to content

Commit ed5136a

Browse files
committed
ENH: Warn when dtype is not passed to get_dummies
1 parent 6db95e7 commit ed5136a

File tree

2 files changed

+61
-34
lines changed

2 files changed

+61
-34
lines changed

pandas/core/reshape/encoding.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
from collections import defaultdict
44
import itertools
55
from typing import Hashable
6+
import warnings
67

78
import numpy as np
89

@@ -20,6 +21,7 @@
2021
from pandas.core.frame import DataFrame
2122
from pandas.core.indexes.api import Index
2223
from pandas.core.series import Series
24+
from pandas.util._exceptions import find_stack_level
2325

2426

2527
def get_dummies(
@@ -228,6 +230,12 @@ def _get_dummies_1d(
228230
codes, levels = factorize_from_iterable(Series(data))
229231

230232
if dtype is None:
233+
warnings.warn(
234+
"The default dtype will change from 'uint8' to 'bool', "
235+
"please specify a dtype to silence this warning",
236+
FutureWarning,
237+
stacklevel=find_stack_level(),
238+
)
231239
dtype = np.dtype(np.uint8)
232240
# error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
233241
# dtype[Any], Type[object]]"; expected "Type[Any]"

pandas/tests/reshape/test_get_dummies.py

Lines changed: 53 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,11 @@ def test_get_dummies_raises_on_dtype_object(self, df):
4545
with pytest.raises(ValueError, match=msg):
4646
get_dummies(df, dtype="object")
4747

48+
def test_get_dummies_warns_default_dtype(self, df):
49+
msg = "The default dtype will change from 'uint8' to 'bool'"
50+
with pytest.warns(FutureWarning, match=msg):
51+
get_dummies(df)
52+
4853
def test_get_dummies_basic(self, sparse, dtype):
4954
s_list = list("abc")
5055
s_series = Series(s_list)
@@ -121,9 +126,11 @@ def test_get_dummies_just_na(self, sparse):
121126
just_na_series = Series(just_na_list)
122127
just_na_series_index = Series(just_na_list, index=["A"])
123128

124-
res_list = get_dummies(just_na_list, sparse=sparse)
125-
res_series = get_dummies(just_na_series, sparse=sparse)
126-
res_series_index = get_dummies(just_na_series_index, sparse=sparse)
129+
res_list = get_dummies(just_na_list, dtype=np.uint8, sparse=sparse)
130+
res_series = get_dummies(just_na_series, dtype=np.uint8, sparse=sparse)
131+
res_series_index = get_dummies(
132+
just_na_series_index, dtype=np.uint8, sparse=sparse
133+
)
127134

128135
assert res_list.empty
129136
assert res_series.empty
@@ -169,7 +176,7 @@ def test_get_dummies_unicode(self, sparse):
169176
e = "e"
170177
eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE")
171178
s = [e, eacute, eacute]
172-
res = get_dummies(s, prefix="letter", sparse=sparse)
179+
res = get_dummies(s, dtype=np.uint8, prefix="letter", sparse=sparse)
173180
exp = DataFrame(
174181
{"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8
175182
)
@@ -179,7 +186,7 @@ def test_get_dummies_unicode(self, sparse):
179186

180187
def test_dataframe_dummies_all_obj(self, df, sparse):
181188
df = df[["A", "B"]]
182-
result = get_dummies(df, sparse=sparse)
189+
result = get_dummies(df, dtype=np.uint8, sparse=sparse)
183190
expected = DataFrame(
184191
{"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
185192
dtype=np.uint8,
@@ -200,7 +207,7 @@ def test_dataframe_dummies_string_dtype(self, df):
200207
# GH44965
201208
df = df[["A", "B"]]
202209
df = df.astype({"A": "object", "B": "string"})
203-
result = get_dummies(df)
210+
result = get_dummies(df, dtype=np.uint8)
204211
expected = DataFrame(
205212
{
206213
"A_a": [1, 0, 1],
@@ -234,7 +241,7 @@ def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
234241

235242
def test_dataframe_dummies_prefix_list(self, df, sparse):
236243
prefixes = ["from_A", "from_B"]
237-
result = get_dummies(df, prefix=prefixes, sparse=sparse)
244+
result = get_dummies(df, dtype=np.uint8, prefix=prefixes, sparse=sparse)
238245
expected = DataFrame(
239246
{
240247
"C": [1, 2, 3],
@@ -255,7 +262,7 @@ def test_dataframe_dummies_prefix_list(self, df, sparse):
255262

256263
def test_dataframe_dummies_prefix_str(self, df, sparse):
257264
# not that you should do this...
258-
result = get_dummies(df, prefix="bad", sparse=sparse)
265+
result = get_dummies(df, dtype=np.uint8, prefix="bad", sparse=sparse)
259266
bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
260267
expected = DataFrame(
261268
[[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
@@ -280,7 +287,9 @@ def test_dataframe_dummies_prefix_str(self, df, sparse):
280287
tm.assert_frame_equal(result, expected)
281288

282289
def test_dataframe_dummies_subset(self, df, sparse):
283-
result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse)
290+
result = get_dummies(
291+
df, dtype=np.uint8, prefix=["from_A"], columns=["A"], sparse=sparse
292+
)
284293
expected = DataFrame(
285294
{
286295
"B": ["b", "b", "c"],
@@ -298,7 +307,7 @@ def test_dataframe_dummies_subset(self, df, sparse):
298307
tm.assert_frame_equal(result, expected)
299308

300309
def test_dataframe_dummies_prefix_sep(self, df, sparse):
301-
result = get_dummies(df, prefix_sep="..", sparse=sparse)
310+
result = get_dummies(df, dtype=np.uint8, prefix_sep="..", sparse=sparse)
302311
expected = DataFrame(
303312
{
304313
"C": [1, 2, 3],
@@ -317,11 +326,13 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse):
317326

318327
tm.assert_frame_equal(result, expected)
319328

320-
result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse)
329+
result = get_dummies(df, dtype=np.uint8, prefix_sep=["..", "__"], sparse=sparse)
321330
expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"})
322331
tm.assert_frame_equal(result, expected)
323332

324-
result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse)
333+
result = get_dummies(
334+
df, dtype=np.uint8, prefix_sep={"A": "..", "B": "__"}, sparse=sparse
335+
)
325336
tm.assert_frame_equal(result, expected)
326337

327338
def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
@@ -330,20 +341,20 @@ def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
330341
"encoded (2)"
331342
)
332343
with pytest.raises(ValueError, match=msg):
333-
get_dummies(df, prefix=["too few"], sparse=sparse)
344+
get_dummies(df, dtype=np.uint8, prefix=["too few"], sparse=sparse)
334345

335346
def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
336347
msg = re.escape(
337348
"Length of 'prefix_sep' (1) did not match the length of the columns being "
338349
"encoded (2)"
339350
)
340351
with pytest.raises(ValueError, match=msg):
341-
get_dummies(df, prefix_sep=["bad"], sparse=sparse)
352+
get_dummies(df, dtype=np.uint8, prefix_sep=["bad"], sparse=sparse)
342353

343354
def test_dataframe_dummies_prefix_dict(self, sparse):
344355
prefixes = {"A": "from_A", "B": "from_B"}
345356
df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]})
346-
result = get_dummies(df, prefix=prefixes, sparse=sparse)
357+
result = get_dummies(df, dtype=np.uint8, prefix=prefixes, sparse=sparse)
347358

348359
expected = DataFrame(
349360
{
@@ -453,16 +464,18 @@ def test_get_dummies_basic_drop_first(self, sparse):
453464

454465
expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8)
455466

456-
result = get_dummies(s_list, drop_first=True, sparse=sparse)
467+
result = get_dummies(s_list, dtype=np.uint8, drop_first=True, sparse=sparse)
457468
if sparse:
458469
expected = expected.apply(SparseArray, fill_value=0)
459470
tm.assert_frame_equal(result, expected)
460471

461-
result = get_dummies(s_series, drop_first=True, sparse=sparse)
472+
result = get_dummies(s_series, dtype=np.uint8, drop_first=True, sparse=sparse)
462473
tm.assert_frame_equal(result, expected)
463474

464475
expected.index = list("ABC")
465-
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
476+
result = get_dummies(
477+
s_series_index, dtype=np.uint8, drop_first=True, sparse=sparse
478+
)
466479
tm.assert_frame_equal(result, expected)
467480

468481
def test_get_dummies_basic_drop_first_one_level(self, sparse):
@@ -473,27 +486,31 @@ def test_get_dummies_basic_drop_first_one_level(self, sparse):
473486

474487
expected = DataFrame(index=np.arange(3))
475488

476-
result = get_dummies(s_list, drop_first=True, sparse=sparse)
489+
result = get_dummies(s_list, dtype=np.uint8, drop_first=True, sparse=sparse)
477490
tm.assert_frame_equal(result, expected)
478491

479-
result = get_dummies(s_series, drop_first=True, sparse=sparse)
492+
result = get_dummies(s_series, dtype=np.uint8, drop_first=True, sparse=sparse)
480493
tm.assert_frame_equal(result, expected)
481494

482495
expected = DataFrame(index=list("ABC"))
483-
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
496+
result = get_dummies(
497+
s_series_index, dtype=np.uint8, drop_first=True, sparse=sparse
498+
)
484499
tm.assert_frame_equal(result, expected)
485500

486501
def test_get_dummies_basic_drop_first_NA(self, sparse):
487502
# Test NA handling together with drop_first
488503
s_NA = ["a", "b", np.nan]
489-
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
504+
res = get_dummies(s_NA, dtype=np.uint8, drop_first=True, sparse=sparse)
490505
exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8)
491506
if sparse:
492507
exp = exp.apply(SparseArray, fill_value=0)
493508

494509
tm.assert_frame_equal(res, exp)
495510

496-
res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
511+
res_na = get_dummies(
512+
s_NA, dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse
513+
)
497514
exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=np.uint8).reindex(
498515
["b", np.nan], axis=1
499516
)
@@ -502,22 +519,22 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
502519
tm.assert_frame_equal(res_na, exp_na)
503520

504521
res_just_na = get_dummies(
505-
[np.nan], dummy_na=True, drop_first=True, sparse=sparse
522+
[np.nan], dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse
506523
)
507524
exp_just_na = DataFrame(index=np.arange(1))
508525
tm.assert_frame_equal(res_just_na, exp_just_na)
509526

510527
def test_dataframe_dummies_drop_first(self, df, sparse):
511528
df = df[["A", "B"]]
512-
result = get_dummies(df, drop_first=True, sparse=sparse)
529+
result = get_dummies(df, dtype=np.uint8, drop_first=True, sparse=sparse)
513530
expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8)
514531
if sparse:
515532
expected = expected.apply(SparseArray, fill_value=0)
516533
tm.assert_frame_equal(result, expected)
517534

518535
def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
519536
df["cat"] = Categorical(["x", "y", "y"])
520-
result = get_dummies(df, drop_first=True, sparse=sparse)
537+
result = get_dummies(df, dtype=np.uint8, drop_first=True, sparse=sparse)
521538
expected = DataFrame(
522539
{"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
523540
)
@@ -532,7 +549,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
532549
def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
533550
df.loc[3, :] = [np.nan, np.nan, np.nan]
534551
result = get_dummies(
535-
df, dummy_na=True, drop_first=True, sparse=sparse
552+
df, dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse
536553
).sort_index(axis=1)
537554
expected = DataFrame(
538555
{
@@ -552,18 +569,20 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
552569

553570
tm.assert_frame_equal(result, expected)
554571

555-
result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse)
572+
result = get_dummies(
573+
df, dtype=np.uint8, dummy_na=False, drop_first=True, sparse=sparse
574+
)
556575
expected = expected[["C", "A_b", "B_c"]]
557576
tm.assert_frame_equal(result, expected)
558577

559578
def test_get_dummies_int_int(self):
560579
data = Series([1, 2, 1])
561-
result = get_dummies(data)
580+
result = get_dummies(data, dtype=np.uint8)
562581
expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8)
563582
tm.assert_frame_equal(result, expected)
564583

565584
data = Series(Categorical(["a", "b", "a"]))
566-
result = get_dummies(data)
585+
result = get_dummies(data, dtype=np.uint8)
567586
expected = DataFrame(
568587
[[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=np.uint8
569588
)
@@ -605,15 +624,15 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered):
605624
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
606625
# GH18914
607626
df = DataFrame.from_dict({"GDP": [1, 2], "Nation": ["AB", "CD"]})
608-
df = get_dummies(df, columns=["Nation"], sparse=sparse)
627+
df = get_dummies(df, dtype=np.uint8, columns=["Nation"], sparse=sparse)
609628
df2 = df.reindex(columns=["GDP"])
610629

611630
tm.assert_frame_equal(df[["GDP"]], df2)
612631

613632
def test_get_dummies_duplicate_columns(self, df):
614633
# GH20839
615634
df.columns = ["A", "A", "A"]
616-
result = get_dummies(df).sort_index(axis=1)
635+
result = get_dummies(df, dtype=np.uint8).sort_index(axis=1)
617636

618637
expected = DataFrame(
619638
[[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
@@ -627,7 +646,7 @@ def test_get_dummies_duplicate_columns(self, df):
627646

628647
def test_get_dummies_all_sparse(self):
629648
df = DataFrame({"A": [1, 2]})
630-
result = get_dummies(df, columns=["A"], sparse=True)
649+
result = get_dummies(df, dtype=np.uint8, columns=["A"], sparse=True)
631650
dtype = SparseDtype("uint8", 0)
632651
expected = DataFrame(
633652
{
@@ -652,4 +671,4 @@ def test_get_dummies_with_string_values(self, values):
652671
msg = "Input must be a list-like for parameter `columns`"
653672

654673
with pytest.raises(TypeError, match=msg):
655-
get_dummies(df, columns=values)
674+
get_dummies(df, dtype=np.uint8, columns=values)

0 commit comments

Comments
 (0)