@@ -358,8 +358,8 @@ def test_extract_single_group_returns_frame():
358
358
tm .assert_frame_equal (r , e )
359
359
360
360
361
- def test_extractall ():
362
- subject_list = [
361
+ def test_extractall (any_string_dtype ):
362
+ data = [
363
363
364
364
365
365
@@ -378,28 +378,30 @@ def test_extractall():
378
378
("c" , "d" , "com" ),
379
379
("e" , "f" , "com" ),
380
380
]
381
- named_pattern = r"""
381
+ pat = r"""
382
382
(?P<user>[a-z0-9]+)
383
383
@
384
384
(?P<domain>[a-z]+)
385
385
\.
386
386
(?P<tld>[a-z]{2,4})
387
387
"""
388
388
expected_columns = ["user" , "domain" , "tld" ]
389
- S = Series (subject_list )
390
- # extractall should return a DataFrame with one row for each
391
- # match, indexed by the subject from which the match came.
389
+ s = Series (data , dtype = any_string_dtype )
390
+ # extractall should return a DataFrame with one row for each match, indexed by the
391
+ # subject from which the match came.
392
392
expected_index = MultiIndex .from_tuples (
393
393
[(0 , 0 ), (1 , 0 ), (2 , 0 ), (3 , 0 ), (3 , 1 ), (4 , 0 ), (4 , 1 ), (4 , 2 )],
394
394
names = (None , "match" ),
395
395
)
396
- expected_df = DataFrame (expected_tuples , expected_index , expected_columns )
397
- computed_df = S .str .extractall (named_pattern , re .VERBOSE )
398
- tm .assert_frame_equal (computed_df , expected_df )
396
+ expected = DataFrame (
397
+ expected_tuples , expected_index , expected_columns , dtype = any_string_dtype
398
+ )
399
+ result = s .str .extractall (pat , flags = re .VERBOSE )
400
+ tm .assert_frame_equal (result , expected )
399
401
400
- # The index of the input Series should be used to construct
401
- # the index of the output DataFrame:
402
- series_index = MultiIndex .from_tuples (
402
+ # The index of the input Series should be used to construct the index of the output
403
+ # DataFrame:
404
+ mi = MultiIndex .from_tuples (
403
405
[
404
406
("single" , "Dave" ),
405
407
("single" , "Toby" ),
@@ -410,7 +412,7 @@ def test_extractall():
410
412
("none" , "empty" ),
411
413
]
412
414
)
413
- Si = Series (subject_list , series_index )
415
+ s = Series (data , index = mi , dtype = any_string_dtype )
414
416
expected_index = MultiIndex .from_tuples (
415
417
[
416
418
("single" , "Dave" , 0 ),
@@ -424,67 +426,80 @@ def test_extractall():
424
426
],
425
427
names = (None , None , "match" ),
426
428
)
427
- expected_df = DataFrame (expected_tuples , expected_index , expected_columns )
428
- computed_df = Si .str .extractall (named_pattern , re .VERBOSE )
429
- tm .assert_frame_equal (computed_df , expected_df )
429
+ expected = DataFrame (
430
+ expected_tuples , expected_index , expected_columns , dtype = any_string_dtype
431
+ )
432
+ result = s .str .extractall (pat , flags = re .VERBOSE )
433
+ tm .assert_frame_equal (result , expected )
430
434
431
435
# MultiIndexed subject with names.
432
- Sn = Series (subject_list , series_index )
433
- Sn .index .names = ("matches" , "description" )
436
+ s = Series (data , index = mi , dtype = any_string_dtype )
437
+ s .index .names = ("matches" , "description" )
434
438
expected_index .names = ("matches" , "description" , "match" )
435
- expected_df = DataFrame (expected_tuples , expected_index , expected_columns )
436
- computed_df = Sn .str .extractall (named_pattern , re .VERBOSE )
437
- tm .assert_frame_equal (computed_df , expected_df )
438
-
439
- # optional groups.
440
- subject_list = ["" , "A1" , "32" ]
441
- named_pattern = "(?P<letter>[AB])?(?P<number>[123])"
442
- computed_df = Series (subject_list ).str .extractall (named_pattern )
443
- expected_index = MultiIndex .from_tuples (
444
- [(1 , 0 ), (2 , 0 ), (2 , 1 )], names = (None , "match" )
445
- )
446
- expected_df = DataFrame (
447
- [("A" , "1" ), (np .nan , "3" ), (np .nan , "2" )],
448
- expected_index ,
449
- columns = ["letter" , "number" ],
439
+ expected = DataFrame (
440
+ expected_tuples , expected_index , expected_columns , dtype = any_string_dtype
450
441
)
451
- tm .assert_frame_equal (computed_df , expected_df )
442
+ result = s .str .extractall (pat , flags = re .VERBOSE )
443
+ tm .assert_frame_equal (result , expected )
444
+
445
+
446
+ @pytest .mark .parametrize (
447
+ "pat,expected_names" ,
448
+ [
449
+ # optional groups.
450
+ ("(?P<letter>[AB])?(?P<number>[123])" , ["letter" , "number" ]),
451
+ # only one of two groups has a name.
452
+ ("([AB])?(?P<number>[123])" , [0 , "number" ]),
453
+ ],
454
+ )
455
+ def test_extractall_column_names (pat , expected_names , any_string_dtype ):
456
+ s = Series (["" , "A1" , "32" ], dtype = any_string_dtype )
452
457
453
- # only one of two groups has a name.
454
- pattern = "([AB])?(?P<number>[123])"
455
- computed_df = Series (subject_list ).str .extractall (pattern )
456
- expected_df = DataFrame (
458
+ result = s .str .extractall (pat )
459
+ expected = DataFrame (
457
460
[("A" , "1" ), (np .nan , "3" ), (np .nan , "2" )],
458
- expected_index ,
459
- columns = [0 , "number" ],
461
+ index = MultiIndex .from_tuples ([(1 , 0 ), (2 , 0 ), (2 , 1 )], names = (None , "match" )),
462
+ columns = expected_names ,
463
+ dtype = any_string_dtype ,
460
464
)
461
- tm .assert_frame_equal (computed_df , expected_df )
465
+ tm .assert_frame_equal (result , expected )
462
466
463
467
464
- def test_extractall_single_group ():
465
- # extractall(one named group) returns DataFrame with one named
466
- # column.
467
- s = Series (["a3" , "b3" , "d4c2" ], name = "series_name" )
468
- r = s .str .extractall (r"(?P<letter>[a-z])" )
469
- i = MultiIndex .from_tuples ([(0 , 0 ), (1 , 0 ), (2 , 0 ), (2 , 1 )], names = (None , "match" ))
470
- e = DataFrame ({"letter" : ["a" , "b" , "d" , "c" ]}, i )
471
- tm .assert_frame_equal (r , e )
468
+ def test_extractall_single_group (any_string_dtype ):
469
+ s = Series (["a3" , "b3" , "d4c2" ], name = "series_name" , dtype = any_string_dtype )
470
+ expected_index = MultiIndex .from_tuples (
471
+ [(0 , 0 ), (1 , 0 ), (2 , 0 ), (2 , 1 )], names = (None , "match" )
472
+ )
472
473
473
- # extractall(one un-named group) returns DataFrame with one
474
- # un-named column.
475
- r = s .str .extractall (r"([a-z])" )
476
- e = DataFrame (["a" , "b" , "d" , "c" ], i )
477
- tm .assert_frame_equal (r , e )
474
+ # extractall(one named group) returns DataFrame with one named column.
475
+ result = s .str .extractall (r"(?P<letter>[a-z])" )
476
+ expected = DataFrame (
477
+ {"letter" : ["a" , "b" , "d" , "c" ]}, index = expected_index , dtype = any_string_dtype
478
+ )
479
+ tm .assert_frame_equal (result , expected )
480
+
481
+ # extractall(one un-named group) returns DataFrame with one un-named column.
482
+ result = s .str .extractall (r"([a-z])" )
483
+ expected = DataFrame (
484
+ ["a" , "b" , "d" , "c" ], index = expected_index , dtype = any_string_dtype
485
+ )
486
+ tm .assert_frame_equal (result , expected )
478
487
479
488
480
- def test_extractall_single_group_with_quantifier ():
481
- # extractall(one un-named group with quantifier) returns
482
- # DataFrame with one un-named column (GH13382).
483
- s = Series (["ab3" , "abc3" , "d4cd2" ], name = "series_name" )
484
- r = s .str .extractall (r"([a-z]+)" )
485
- i = MultiIndex .from_tuples ([(0 , 0 ), (1 , 0 ), (2 , 0 ), (2 , 1 )], names = (None , "match" ))
486
- e = DataFrame (["ab" , "abc" , "d" , "cd" ], i )
487
- tm .assert_frame_equal (r , e )
489
+ def test_extractall_single_group_with_quantifier (any_string_dtype ):
490
+ # GH#13382
491
+ # extractall(one un-named group with quantifier) returns DataFrame with one un-named
492
+ # column.
493
+ s = Series (["ab3" , "abc3" , "d4cd2" ], name = "series_name" , dtype = any_string_dtype )
494
+ result = s .str .extractall (r"([a-z]+)" )
495
+ expected = DataFrame (
496
+ ["ab" , "abc" , "d" , "cd" ],
497
+ index = MultiIndex .from_tuples (
498
+ [(0 , 0 ), (1 , 0 ), (2 , 0 ), (2 , 1 )], names = (None , "match" )
499
+ ),
500
+ dtype = any_string_dtype ,
501
+ )
502
+ tm .assert_frame_equal (result , expected )
488
503
489
504
490
505
@pytest .mark .parametrize (
@@ -500,78 +515,91 @@ def test_extractall_single_group_with_quantifier():
500
515
(["a3" , "b3" , "d4c2" ], ("i1" , "i2" )),
501
516
],
502
517
)
503
- def test_extractall_no_matches (data , names ):
518
+ def test_extractall_no_matches (data , names , any_string_dtype ):
504
519
# GH19075 extractall with no matches should return a valid MultiIndex
505
520
n = len (data )
506
521
if len (names ) == 1 :
507
- i = Index (range (n ), name = names [0 ])
522
+ index = Index (range (n ), name = names [0 ])
508
523
else :
509
- a = (tuple ([i ] * (n - 1 )) for i in range (n ))
510
- i = MultiIndex .from_tuples (a , names = names )
511
- s = Series (data , name = "series_name" , index = i , dtype = "object" )
512
- ei = MultiIndex .from_tuples ([], names = (names + ("match" ,)))
524
+ tuples = (tuple ([i ] * (n - 1 )) for i in range (n ))
525
+ index = MultiIndex .from_tuples (tuples , names = names )
526
+ s = Series (data , name = "series_name" , index = index , dtype = any_string_dtype )
527
+ expected_index = MultiIndex .from_tuples ([], names = (names + ("match" ,)))
513
528
514
529
# one un-named group.
515
- r = s .str .extractall ("(z)" )
516
- e = DataFrame (columns = [0 ], index = ei )
517
- tm .assert_frame_equal (r , e )
530
+ result = s .str .extractall ("(z)" )
531
+ expected = DataFrame (columns = [0 ], index = expected_index , dtype = any_string_dtype )
532
+ tm .assert_frame_equal (result , expected )
518
533
519
534
# two un-named groups.
520
- r = s .str .extractall ("(z)(z)" )
521
- e = DataFrame (columns = [0 , 1 ], index = ei )
522
- tm .assert_frame_equal (r , e )
535
+ result = s .str .extractall ("(z)(z)" )
536
+ expected = DataFrame (columns = [0 , 1 ], index = expected_index , dtype = any_string_dtype )
537
+ tm .assert_frame_equal (result , expected )
523
538
524
539
# one named group.
525
- r = s .str .extractall ("(?P<first>z)" )
526
- e = DataFrame (columns = ["first" ], index = ei )
527
- tm .assert_frame_equal (r , e )
540
+ result = s .str .extractall ("(?P<first>z)" )
541
+ expected = DataFrame (
542
+ columns = ["first" ], index = expected_index , dtype = any_string_dtype
543
+ )
544
+ tm .assert_frame_equal (result , expected )
528
545
529
546
# two named groups.
530
- r = s .str .extractall ("(?P<first>z)(?P<second>z)" )
531
- e = DataFrame (columns = ["first" , "second" ], index = ei )
532
- tm .assert_frame_equal (r , e )
547
+ result = s .str .extractall ("(?P<first>z)(?P<second>z)" )
548
+ expected = DataFrame (
549
+ columns = ["first" , "second" ], index = expected_index , dtype = any_string_dtype
550
+ )
551
+ tm .assert_frame_equal (result , expected )
533
552
534
553
# one named, one un-named.
535
- r = s .str .extractall ("(z)(?P<second>z)" )
536
- e = DataFrame (columns = [0 , "second" ], index = ei )
537
- tm .assert_frame_equal (r , e )
554
+ result = s .str .extractall ("(z)(?P<second>z)" )
555
+ expected = DataFrame (
556
+ columns = [0 , "second" ], index = expected_index , dtype = any_string_dtype
557
+ )
558
+ tm .assert_frame_equal (result , expected )
538
559
539
560
540
- def test_extractall_stringindex ():
541
- s = Series (["a1a2" , "b1" , "c1" ], name = "xxx" )
542
- res = s .str .extractall (r"[ab](?P<digit>\d)" )
543
- exp_idx = MultiIndex .from_tuples ([(0 , 0 ), (0 , 1 ), (1 , 0 )], names = [None , "match" ])
544
- exp = DataFrame ({"digit" : ["1" , "2" , "1" ]}, index = exp_idx )
545
- tm .assert_frame_equal (res , exp )
561
+ def test_extractall_stringindex (any_string_dtype ):
562
+ s = Series (["a1a2" , "b1" , "c1" ], name = "xxx" , dtype = any_string_dtype )
563
+ result = s .str .extractall (r"[ab](?P<digit>\d)" )
564
+ expected = DataFrame (
565
+ {"digit" : ["1" , "2" , "1" ]},
566
+ index = MultiIndex .from_tuples ([(0 , 0 ), (0 , 1 ), (1 , 0 )], names = [None , "match" ]),
567
+ dtype = any_string_dtype ,
568
+ )
569
+ tm .assert_frame_equal (result , expected )
546
570
547
- # index should return the same result as the default index without name
548
- # thus index.name doesn't affect to the result
549
- for idx in [
550
- Index (["a1a2" , "b1" , "c1" ]),
551
- Index (["a1a2" , "b1" , "c1" ], name = "xxx" ),
552
- ]:
571
+ # index should return the same result as the default index without name thus
572
+ # index.name doesn't affect to the result
573
+ if any_string_dtype == "object" :
574
+ for idx in [
575
+ Index (["a1a2" , "b1" , "c1" ]),
576
+ Index (["a1a2" , "b1" , "c1" ], name = "xxx" ),
577
+ ]:
553
578
554
- res = idx .str .extractall (r"[ab](?P<digit>\d)" )
555
- tm .assert_frame_equal (res , exp )
579
+ result = idx .str .extractall (r"[ab](?P<digit>\d)" )
580
+ tm .assert_frame_equal (result , expected )
556
581
557
582
s = Series (
558
583
["a1a2" , "b1" , "c1" ],
559
584
name = "s_name" ,
560
585
index = Index (["XX" , "yy" , "zz" ], name = "idx_name" ),
586
+ dtype = any_string_dtype ,
561
587
)
562
- res = s .str .extractall (r"[ab](?P<digit>\d)" )
563
- exp_idx = MultiIndex .from_tuples (
564
- [("XX" , 0 ), ("XX" , 1 ), ("yy" , 0 )], names = ["idx_name" , "match" ]
588
+ result = s .str .extractall (r"[ab](?P<digit>\d)" )
589
+ expected = DataFrame (
590
+ {"digit" : ["1" , "2" , "1" ]},
591
+ index = MultiIndex .from_tuples (
592
+ [("XX" , 0 ), ("XX" , 1 ), ("yy" , 0 )], names = ["idx_name" , "match" ]
593
+ ),
594
+ dtype = any_string_dtype ,
565
595
)
566
- exp = DataFrame ({"digit" : ["1" , "2" , "1" ]}, index = exp_idx )
567
- tm .assert_frame_equal (res , exp )
596
+ tm .assert_frame_equal (result , expected )
568
597
569
598
570
- def test_extractall_errors ():
571
- # Does not make sense to use extractall with a regex that has
572
- # no capture groups. (it returns DataFrame with one column for
573
- # each capture group)
574
- s = Series (["a3" , "b3" , "d4c2" ], name = "series_name" )
599
+ def test_extractall_no_capture_groups_raises (any_string_dtype ):
600
+ # Does not make sense to use extractall with a regex that has no capture groups.
601
+ # (it returns DataFrame with one column for each capture group)
602
+ s = Series (["a3" , "b3" , "d4c2" ], name = "series_name" , dtype = any_string_dtype )
575
603
with pytest .raises (ValueError , match = "no capture groups" ):
576
604
s .str .extractall (r"[a-z]" )
577
605
@@ -591,8 +619,8 @@ def test_extract_index_one_two_groups():
591
619
tm .assert_frame_equal (r , e )
592
620
593
621
594
- def test_extractall_same_as_extract ():
595
- s = Series (["a3" , "b3" , "c2" ], name = "series_name" )
622
+ def test_extractall_same_as_extract (any_string_dtype ):
623
+ s = Series (["a3" , "b3" , "c2" ], name = "series_name" , dtype = any_string_dtype )
596
624
597
625
pattern_two_noname = r"([a-z])([0-9])"
598
626
extract_two_noname = s .str .extract (pattern_two_noname , expand = True )
@@ -619,13 +647,13 @@ def test_extractall_same_as_extract():
619
647
tm .assert_frame_equal (extract_one_noname , no_multi_index )
620
648
621
649
622
- def test_extractall_same_as_extract_subject_index ():
650
+ def test_extractall_same_as_extract_subject_index (any_string_dtype ):
623
651
# same as above tests, but s has an MultiIndex.
624
- i = MultiIndex .from_tuples (
652
+ mi = MultiIndex .from_tuples (
625
653
[("A" , "first" ), ("B" , "second" ), ("C" , "third" )],
626
654
names = ("capital" , "ordinal" ),
627
655
)
628
- s = Series (["a3" , "b3" , "c2" ], i , name = "series_name" )
656
+ s = Series (["a3" , "b3" , "c2" ], index = mi , name = "series_name" , dtype = any_string_dtype )
629
657
630
658
pattern_two_noname = r"([a-z])([0-9])"
631
659
extract_two_noname = s .str .extract (pattern_two_noname , expand = True )
0 commit comments