6
6
from pandas .core .dtypes .common import is_integer_dtype
7
7
8
8
import pandas as pd
9
- from pandas import Categorical , DataFrame , Index , Series , get_dummies
9
+ from pandas import Categorical , CategoricalIndex , DataFrame , Series , get_dummies
10
10
import pandas ._testing as tm
11
11
from pandas .core .arrays .sparse import SparseArray , SparseDtype
12
12
@@ -31,11 +31,11 @@ def effective_dtype(self, dtype):
31
31
return np .uint8
32
32
return dtype
33
33
34
- def test_raises_on_dtype_object (self , df ):
34
+ def test_get_dummies_raises_on_dtype_object (self , df ):
35
35
with pytest .raises (ValueError ):
36
36
get_dummies (df , dtype = "object" )
37
37
38
- def test_basic (self , sparse , dtype ):
38
+ def test_get_dummies_basic (self , sparse , dtype ):
39
39
s_list = list ("abc" )
40
40
s_series = Series (s_list )
41
41
s_series_index = Series (s_list , list ("ABC" ))
@@ -56,7 +56,7 @@ def test_basic(self, sparse, dtype):
56
56
result = get_dummies (s_series_index , sparse = sparse , dtype = dtype )
57
57
tm .assert_frame_equal (result , expected )
58
58
59
- def test_basic_types (self , sparse , dtype ):
59
+ def test_get_dummies_basic_types (self , sparse , dtype ):
60
60
# GH 10531
61
61
s_list = list ("abc" )
62
62
s_series = Series (s_list )
@@ -106,7 +106,7 @@ def test_basic_types(self, sparse, dtype):
106
106
result = result .sort_index ()
107
107
tm .assert_series_equal (result , expected )
108
108
109
- def test_just_na (self , sparse ):
109
+ def test_get_dummies_just_na (self , sparse ):
110
110
just_na_list = [np .nan ]
111
111
just_na_series = Series (just_na_list )
112
112
just_na_series_index = Series (just_na_list , index = ["A" ])
@@ -123,7 +123,7 @@ def test_just_na(self, sparse):
123
123
assert res_series .index .tolist () == [0 ]
124
124
assert res_series_index .index .tolist () == ["A" ]
125
125
126
- def test_include_na (self , sparse , dtype ):
126
+ def test_get_dummies_include_na (self , sparse , dtype ):
127
127
s = ["a" , "b" , np .nan ]
128
128
res = get_dummies (s , sparse = sparse , dtype = dtype )
129
129
exp = DataFrame (
@@ -152,7 +152,7 @@ def test_include_na(self, sparse, dtype):
152
152
)
153
153
tm .assert_numpy_array_equal (res_just_na .values , exp_just_na .values )
154
154
155
- def test_unicode (self , sparse ):
155
+ def test_get_dummies_unicode (self , sparse ):
156
156
# See GH 6885 - get_dummies chokes on unicode values
157
157
import unicodedata
158
158
@@ -175,7 +175,7 @@ def test_dataframe_dummies_all_obj(self, df, sparse):
175
175
dtype = np .uint8 ,
176
176
)
177
177
if sparse :
178
- expected = pd . DataFrame (
178
+ expected = DataFrame (
179
179
{
180
180
"A_a" : SparseArray ([1 , 0 , 1 ], dtype = "uint8" ),
181
181
"A_b" : SparseArray ([0 , 1 , 0 ], dtype = "uint8" ),
@@ -223,7 +223,7 @@ def test_dataframe_dummies_prefix_list(self, df, sparse):
223
223
cols = ["from_A_a" , "from_A_b" , "from_B_b" , "from_B_c" ]
224
224
expected = expected [["C" ] + cols ]
225
225
226
- typ = SparseArray if sparse else pd . Series
226
+ typ = SparseArray if sparse else Series
227
227
expected [cols ] = expected [cols ].apply (lambda x : typ (x ))
228
228
tm .assert_frame_equal (result , expected )
229
229
@@ -242,11 +242,11 @@ def test_dataframe_dummies_prefix_str(self, df, sparse):
242
242
# https://github.com/pandas-dev/pandas/issues/14427
243
243
expected = pd .concat (
244
244
[
245
- pd . Series ([1 , 2 , 3 ], name = "C" ),
246
- pd . Series ([1 , 0 , 1 ], name = "bad_a" , dtype = "Sparse[uint8]" ),
247
- pd . Series ([0 , 1 , 0 ], name = "bad_b" , dtype = "Sparse[uint8]" ),
248
- pd . Series ([1 , 1 , 0 ], name = "bad_b" , dtype = "Sparse[uint8]" ),
249
- pd . Series ([0 , 0 , 1 ], name = "bad_c" , dtype = "Sparse[uint8]" ),
245
+ Series ([1 , 2 , 3 ], name = "C" ),
246
+ Series ([1 , 0 , 1 ], name = "bad_a" , dtype = "Sparse[uint8]" ),
247
+ Series ([0 , 1 , 0 ], name = "bad_b" , dtype = "Sparse[uint8]" ),
248
+ Series ([1 , 1 , 0 ], name = "bad_b" , dtype = "Sparse[uint8]" ),
249
+ Series ([0 , 0 , 1 ], name = "bad_c" , dtype = "Sparse[uint8]" ),
250
250
],
251
251
axis = 1 ,
252
252
)
@@ -267,7 +267,7 @@ def test_dataframe_dummies_subset(self, df, sparse):
267
267
expected [["C" ]] = df [["C" ]]
268
268
if sparse :
269
269
cols = ["from_A_a" , "from_A_b" ]
270
- expected [cols ] = expected [cols ].astype (pd . SparseDtype ("uint8" , 0 ))
270
+ expected [cols ] = expected [cols ].astype (SparseDtype ("uint8" , 0 ))
271
271
tm .assert_frame_equal (result , expected )
272
272
273
273
def test_dataframe_dummies_prefix_sep (self , df , sparse ):
@@ -286,7 +286,7 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse):
286
286
expected = expected [["C" , "A..a" , "A..b" , "B..b" , "B..c" ]]
287
287
if sparse :
288
288
cols = ["A..a" , "A..b" , "B..b" , "B..c" ]
289
- expected [cols ] = expected [cols ].astype (pd . SparseDtype ("uint8" , 0 ))
289
+ expected [cols ] = expected [cols ].astype (SparseDtype ("uint8" , 0 ))
290
290
291
291
tm .assert_frame_equal (result , expected )
292
292
@@ -323,7 +323,7 @@ def test_dataframe_dummies_prefix_dict(self, sparse):
323
323
columns = ["from_A_a" , "from_A_b" , "from_B_b" , "from_B_c" ]
324
324
expected [columns ] = expected [columns ].astype (np .uint8 )
325
325
if sparse :
326
- expected [columns ] = expected [columns ].astype (pd . SparseDtype ("uint8" , 0 ))
326
+ expected [columns ] = expected [columns ].astype (SparseDtype ("uint8" , 0 ))
327
327
328
328
tm .assert_frame_equal (result , expected )
329
329
@@ -359,7 +359,7 @@ def test_dataframe_dummies_with_na(self, df, sparse, dtype):
359
359
tm .assert_frame_equal (result , expected )
360
360
361
361
def test_dataframe_dummies_with_categorical (self , df , sparse , dtype ):
362
- df ["cat" ] = pd . Categorical (["x" , "y" , "y" ])
362
+ df ["cat" ] = Categorical (["x" , "y" , "y" ])
363
363
result = get_dummies (df , sparse = sparse , dtype = dtype ).sort_index (axis = 1 )
364
364
if sparse :
365
365
arr = SparseArray
@@ -386,30 +386,30 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
386
386
"get_dummies_kwargs,expected" ,
387
387
[
388
388
(
389
- {"data" : pd . DataFrame (({"ä" : ["a" ]}))},
390
- pd . DataFrame ({"ä_a" : [1 ]}, dtype = np .uint8 ),
389
+ {"data" : DataFrame (({"ä" : ["a" ]}))},
390
+ DataFrame ({"ä_a" : [1 ]}, dtype = np .uint8 ),
391
391
),
392
392
(
393
- {"data" : pd . DataFrame ({"x" : ["ä" ]})},
394
- pd . DataFrame ({"x_ä" : [1 ]}, dtype = np .uint8 ),
393
+ {"data" : DataFrame ({"x" : ["ä" ]})},
394
+ DataFrame ({"x_ä" : [1 ]}, dtype = np .uint8 ),
395
395
),
396
396
(
397
- {"data" : pd . DataFrame ({"x" : ["a" ]}), "prefix" : "ä" },
398
- pd . DataFrame ({"ä_a" : [1 ]}, dtype = np .uint8 ),
397
+ {"data" : DataFrame ({"x" : ["a" ]}), "prefix" : "ä" },
398
+ DataFrame ({"ä_a" : [1 ]}, dtype = np .uint8 ),
399
399
),
400
400
(
401
- {"data" : pd . DataFrame ({"x" : ["a" ]}), "prefix_sep" : "ä" },
402
- pd . DataFrame ({"xäa" : [1 ]}, dtype = np .uint8 ),
401
+ {"data" : DataFrame ({"x" : ["a" ]}), "prefix_sep" : "ä" },
402
+ DataFrame ({"xäa" : [1 ]}, dtype = np .uint8 ),
403
403
),
404
404
],
405
405
)
406
406
def test_dataframe_dummies_unicode (self , get_dummies_kwargs , expected ):
407
- # GH22084 pd. get_dummies incorrectly encodes unicode characters
407
+ # GH22084 get_dummies incorrectly encodes unicode characters
408
408
# in dataframe column names
409
409
result = get_dummies (** get_dummies_kwargs )
410
410
tm .assert_frame_equal (result , expected )
411
411
412
- def test_basic_drop_first (self , sparse ):
412
+ def test_get_dummies_basic_drop_first (self , sparse ):
413
413
# GH12402 Add a new parameter `drop_first` to avoid collinearity
414
414
# Basic case
415
415
s_list = list ("abc" )
@@ -430,7 +430,7 @@ def test_basic_drop_first(self, sparse):
430
430
result = get_dummies (s_series_index , drop_first = True , sparse = sparse )
431
431
tm .assert_frame_equal (result , expected )
432
432
433
- def test_basic_drop_first_one_level (self , sparse ):
433
+ def test_get_dummies_basic_drop_first_one_level (self , sparse ):
434
434
# Test the case that categorical variable only has one level.
435
435
s_list = list ("aaa" )
436
436
s_series = Series (s_list )
@@ -448,7 +448,7 @@ def test_basic_drop_first_one_level(self, sparse):
448
448
result = get_dummies (s_series_index , drop_first = True , sparse = sparse )
449
449
tm .assert_frame_equal (result , expected )
450
450
451
- def test_basic_drop_first_NA (self , sparse ):
451
+ def test_get_dummies_basic_drop_first_NA (self , sparse ):
452
452
# Test NA handling together with drop_first
453
453
s_NA = ["a" , "b" , np .nan ]
454
454
res = get_dummies (s_NA , drop_first = True , sparse = sparse )
@@ -481,7 +481,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse):
481
481
tm .assert_frame_equal (result , expected )
482
482
483
483
def test_dataframe_dummies_drop_first_with_categorical (self , df , sparse , dtype ):
484
- df ["cat" ] = pd . Categorical (["x" , "y" , "y" ])
484
+ df ["cat" ] = Categorical (["x" , "y" , "y" ])
485
485
result = get_dummies (df , drop_first = True , sparse = sparse )
486
486
expected = DataFrame (
487
487
{"C" : [1 , 2 , 3 ], "A_b" : [0 , 1 , 0 ], "B_c" : [0 , 0 , 1 ], "cat_y" : [0 , 1 , 1 ]}
@@ -521,24 +521,24 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
521
521
expected = expected [["C" , "A_b" , "B_c" ]]
522
522
tm .assert_frame_equal (result , expected )
523
523
524
- def test_int_int (self ):
524
+ def test_get_dummies_int_int (self ):
525
525
data = Series ([1 , 2 , 1 ])
526
- result = pd . get_dummies (data )
526
+ result = get_dummies (data )
527
527
expected = DataFrame ([[1 , 0 ], [0 , 1 ], [1 , 0 ]], columns = [1 , 2 ], dtype = np .uint8 )
528
528
tm .assert_frame_equal (result , expected )
529
529
530
- data = Series (pd . Categorical (["a" , "b" , "a" ]))
531
- result = pd . get_dummies (data )
530
+ data = Series (Categorical (["a" , "b" , "a" ]))
531
+ result = get_dummies (data )
532
532
expected = DataFrame (
533
- [[1 , 0 ], [0 , 1 ], [1 , 0 ]], columns = pd . Categorical (["a" , "b" ]), dtype = np .uint8
533
+ [[1 , 0 ], [0 , 1 ], [1 , 0 ]], columns = Categorical (["a" , "b" ]), dtype = np .uint8
534
534
)
535
535
tm .assert_frame_equal (result , expected )
536
536
537
- def test_int_df (self , dtype ):
537
+ def test_get_dummies_int_df (self , dtype ):
538
538
data = DataFrame (
539
539
{
540
540
"A" : [1 , 2 , 1 ],
541
- "B" : pd . Categorical (["a" , "b" , "a" ]),
541
+ "B" : Categorical (["a" , "b" , "a" ]),
542
542
"C" : [1 , 2 , 1 ],
543
543
"D" : [1.0 , 2.0 , 1.0 ],
544
544
}
@@ -549,22 +549,22 @@ def test_int_df(self, dtype):
549
549
columns = columns ,
550
550
)
551
551
expected [columns [2 :]] = expected [columns [2 :]].astype (dtype )
552
- result = pd . get_dummies (data , columns = ["A" , "B" ], dtype = dtype )
552
+ result = get_dummies (data , columns = ["A" , "B" ], dtype = dtype )
553
553
tm .assert_frame_equal (result , expected )
554
554
555
- def test_dataframe_dummies_preserve_categorical_dtype (self , dtype ):
555
+ @pytest .mark .parametrize ("ordered" , [True , False ])
556
+ def test_dataframe_dummies_preserve_categorical_dtype (self , dtype , ordered ):
556
557
# GH13854
557
- for ordered in [False , True ]:
558
- cat = pd .Categorical (list ("xy" ), categories = list ("xyz" ), ordered = ordered )
559
- result = get_dummies (cat , dtype = dtype )
558
+ cat = Categorical (list ("xy" ), categories = list ("xyz" ), ordered = ordered )
559
+ result = get_dummies (cat , dtype = dtype )
560
560
561
- data = np .array ([[1 , 0 , 0 ], [0 , 1 , 0 ]], dtype = self .effective_dtype (dtype ))
562
- cols = pd . CategoricalIndex (
563
- cat .categories , categories = cat .categories , ordered = ordered
564
- )
565
- expected = DataFrame (data , columns = cols , dtype = self .effective_dtype (dtype ))
561
+ data = np .array ([[1 , 0 , 0 ], [0 , 1 , 0 ]], dtype = self .effective_dtype (dtype ))
562
+ cols = CategoricalIndex (
563
+ cat .categories , categories = cat .categories , ordered = ordered
564
+ )
565
+ expected = DataFrame (data , columns = cols , dtype = self .effective_dtype (dtype ))
566
566
567
- tm .assert_frame_equal (result , expected )
567
+ tm .assert_frame_equal (result , expected )
568
568
569
569
@pytest .mark .parametrize ("sparse" , [True , False ])
570
570
def test_get_dummies_dont_sparsify_all_columns (self , sparse ):
@@ -593,10 +593,10 @@ def test_get_dummies_duplicate_columns(self, df):
593
593
tm .assert_frame_equal (result , expected )
594
594
595
595
def test_get_dummies_all_sparse (self ):
596
- df = pd . DataFrame ({"A" : [1 , 2 ]})
597
- result = pd . get_dummies (df , columns = ["A" ], sparse = True )
596
+ df = DataFrame ({"A" : [1 , 2 ]})
597
+ result = get_dummies (df , columns = ["A" ], sparse = True )
598
598
dtype = SparseDtype ("uint8" , 0 )
599
- expected = pd . DataFrame (
599
+ expected = DataFrame (
600
600
{
601
601
"A_1" : SparseArray ([1 , 0 ], dtype = dtype ),
602
602
"A_2" : SparseArray ([0 , 1 ], dtype = dtype ),
@@ -607,7 +607,7 @@ def test_get_dummies_all_sparse(self):
607
607
@pytest .mark .parametrize ("values" , ["baz" ])
608
608
def test_get_dummies_with_string_values (self , values ):
609
609
# issue #28383
610
- df = pd . DataFrame (
610
+ df = DataFrame (
611
611
{
612
612
"bar" : [1 , 2 , 3 , 4 , 5 , 6 ],
613
613
"foo" : ["one" , "one" , "one" , "two" , "two" , "two" ],
@@ -619,26 +619,4 @@ def test_get_dummies_with_string_values(self, values):
619
619
msg = "Input must be a list-like for parameter `columns`"
620
620
621
621
with pytest .raises (TypeError , match = msg ):
622
- pd .get_dummies (df , columns = values )
623
-
624
-
625
- class TestCategoricalReshape :
626
- def test_reshaping_multi_index_categorical (self ):
627
-
628
- cols = ["ItemA" , "ItemB" , "ItemC" ]
629
- data = {c : tm .makeTimeDataFrame () for c in cols }
630
- df = pd .concat ({c : data [c ].stack () for c in data }, axis = "columns" )
631
- df .index .names = ["major" , "minor" ]
632
- df ["str" ] = "foo"
633
-
634
- df ["category" ] = df ["str" ].astype ("category" )
635
- result = df ["category" ].unstack ()
636
-
637
- dti = df .index .levels [0 ]
638
- c = Categorical (["foo" ] * len (dti ))
639
- expected = DataFrame (
640
- {"A" : c .copy (), "B" : c .copy (), "C" : c .copy (), "D" : c .copy ()},
641
- columns = Index (list ("ABCD" ), name = "minor" ),
642
- index = dti .rename ("major" ),
643
- )
644
- tm .assert_frame_equal (result , expected )
622
+ get_dummies (df , columns = values )
0 commit comments