-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Series.str.get_dummies() raise on string type (follow up to PR #59577) #59786
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 42 commits
e6f9527
dafb61d
bb79ef2
24be84f
09b2fad
50ed90c
9e95485
9a47768
0c94bff
9702bf7
8793516
bad1038
163fe09
3d75fdc
d68bece
c2aa7d5
0fd2401
920c865
800f787
d8149e6
6cbc3e8
532e139
cd5c2ab
822b3f4
ba05a8d
37dddb8
6fbe183
87a1ee8
8706af6
9d7d7f8
fa41092
3d20d2b
392782c
870457e
7ec0b5f
8e18ee0
798a8ea
3ac320e
fbdddbb
8eec58e
a912bd8
bd8c059
961eb6c
151316d
f829533
171d381
8b26e8c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2482,12 +2482,16 @@ def get_dummies( | |
1 False False False | ||
2 True False True | ||
""" | ||
from pandas.core.dtypes.common import is_string_dtype | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
from pandas.core.frame import DataFrame | ||
|
||
if is_string_dtype(dtype): | ||
raise ValueError("string dtype not supported, please use a numeric dtype") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if instead of explicitly disallowing specifically string dtype, should we allow any numeric dtype? (so raise an error There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @rhshadrach thought here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed that we can restrict to just numeric here. |
||
# we need to cast to Series of strings as only that has all | ||
# methods available for making the dummies... | ||
result, name = self._data.array._str_get_dummies(sep, dtype) | ||
if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype): | ||
if is_extension_array_dtype(dtype): | ||
return self._wrap_result( | ||
DataFrame(result, columns=name, dtype=dtype), | ||
name=name, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -421,7 +421,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): | |
dummies_dtype = _dtype | ||
else: | ||
dummies_dtype = np.bool_ | ||
dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype) | ||
dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype, order="F") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @aaronchucarroll @rhshadrach do you remember why this change was included? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah I see that is based on a comment from @mroeschke at #59577 (comment) |
||
|
||
def _isin(test_elements: str, element: str) -> bool: | ||
return element in test_elements | ||
|
rhshadrach marked this conversation as resolved.
Show resolved
Hide resolved
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,6 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
from pandas._config import using_string_dtype | ||
|
||
import pandas.util._test_decorators as td | ||
|
||
from pandas import ( | ||
|
@@ -13,11 +11,6 @@ | |
_testing as tm, | ||
) | ||
|
||
try: | ||
import pyarrow as pa | ||
except ImportError: | ||
pa = None | ||
|
||
|
||
def test_get_dummies(any_string_dtype): | ||
s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) | ||
|
@@ -98,30 +91,19 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): | |
|
||
|
||
# GH#47872 | ||
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") | ||
def test_get_dummies_with_str_dtype(any_string_dtype): | ||
s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) | ||
result = s.str.get_dummies("|", dtype=str) | ||
expected = DataFrame( | ||
[["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]], | ||
columns=list("abc"), | ||
dtype=str, | ||
) | ||
tm.assert_frame_equal(result, expected) | ||
with pytest.raises( | ||
ValueError, match="string dtype not supported, please use a numeric dtype" | ||
): | ||
s.str.get_dummies("|", dtype=str) | ||
|
||
|
||
# GH#47872 | ||
@td.skip_if_no("pyarrow") | ||
def test_get_dummies_with_pa_str_dtype(any_string_dtype): | ||
s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) | ||
result = s.str.get_dummies("|", dtype="str[pyarrow]") | ||
expected = DataFrame( | ||
[ | ||
["true", "true", "false"], | ||
["true", "false", "true"], | ||
["false", "false", "false"], | ||
], | ||
columns=list("abc"), | ||
dtype="str[pyarrow]", | ||
) | ||
tm.assert_frame_equal(result, expected) | ||
with pytest.raises( | ||
ValueError, match="string dtype not supported, please use a numeric dtype" | ||
): | ||
s.str.get_dummies("|", dtype="str[pyarrow]") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the test above is sufficient (so keep |
Uh oh!
There was an error while loading. Please reload this page.