-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Add dtype argument to StringMethods get_dummies() #59577
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
e6f9527
dafb61d
bb79ef2
24be84f
09b2fad
50ed90c
9e95485
9a47768
0c94bff
9702bf7
8793516
bad1038
163fe09
3d75fdc
d68bece
c2aa7d5
0fd2401
920c865
800f787
d8149e6
6cbc3e8
532e139
cd5c2ab
822b3f4
ba05a8d
37dddb8
6fbe183
87a1ee8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,9 +51,12 @@ | |
from collections.abc import ( | ||
Callable, | ||
Hashable, | ||
Iterable, | ||
Iterator, | ||
) | ||
|
||
from pandas._typing import NpDtype | ||
|
||
from pandas import ( | ||
DataFrame, | ||
Index, | ||
|
@@ -2398,7 +2401,14 @@ def wrap( | |
return self._wrap_result(result) | ||
|
||
@forbid_nonstring_types(["bytes"]) | ||
def get_dummies(self, sep: str = "|"): | ||
def get_dummies( | ||
self, | ||
sep: str = "|", | ||
prefix: str | Iterable[str] | dict[str, str] | None = None, | ||
prefix_sep: str = "_", | ||
dummy_na: bool = False, | ||
dtype: NpDtype | None = None, | ||
): | ||
""" | ||
Return DataFrame of dummy/indicator variables for Series. | ||
|
||
|
@@ -2409,6 +2419,17 @@ def get_dummies(self, sep: str = "|"): | |
---------- | ||
sep : str, default "|" | ||
String to split on. | ||
prefix : str, list of str, or dict of str, default None | ||
String to append DataFrame column names. | ||
Pass a list with length equal to the number of columns | ||
when calling get_dummies on a DataFrame. Alternatively, `prefix` | ||
can be a dictionary mapping column names to prefixes. | ||
prefix_sep : str, default '_' | ||
If appending prefix, separator/delimiter to use. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can't users just add the separator to their prefix, e.g. |
||
dummy_na : bool, default False | ||
Add a column to indicate NaNs, if False NaNs are ignored. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It seems to me users can already do this in a straightforward manner:
Is this not sufficient? |
||
dtype : dtype, default np.int64 | ||
Data type for new columns. Only a single dtype is allowed. | ||
|
||
Returns | ||
------- | ||
|
@@ -2433,10 +2454,58 @@ def get_dummies(self, sep: str = "|"): | |
0 1 1 0 | ||
1 0 0 0 | ||
2 1 0 1 | ||
|
||
>>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dummy_na=True) | ||
a b c NaN | ||
0 1 1 0 0 | ||
1 0 0 0 1 | ||
2 1 0 1 0 | ||
|
||
>>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(prefix="prefix") | ||
prefix_a prefix_b prefix_c | ||
0 1 1 0 | ||
1 0 0 0 | ||
2 1 0 1 | ||
|
||
>>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies( | ||
... prefix={"a": "alpha", "b": "beta", "c": "gamma"} | ||
... ) | ||
alpha_a beta_b gamma_c | ||
0 1 1 0 | ||
1 0 0 0 | ||
2 1 0 1 | ||
|
||
>>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=bool) | ||
a b c | ||
0 True True False | ||
1 False False False | ||
2 True False True | ||
""" | ||
# we need to cast to Series of strings as only that has all | ||
# methods available for making the dummies... | ||
result, name = self._data.array._str_get_dummies(sep) | ||
result, name = self._data.array._str_get_dummies(sep, dummy_na, dtype) | ||
name = [np.nan if x == "NaN" else x for x in name] | ||
if isinstance(prefix, str): | ||
name = [f"{prefix}{prefix_sep}{col}" for col in name] | ||
elif isinstance(prefix, dict): | ||
if len(prefix) != len(name): | ||
len_msg = ( | ||
f"Length of 'prefix' ({len(prefix)}) did not match the " | ||
"length of the columns being encoded " | ||
f"({len(name)})." | ||
) | ||
raise ValueError(len_msg) | ||
name = [f"{prefix[col]}{prefix_sep}{col}" for col in name] | ||
elif isinstance(prefix, list): | ||
if len(prefix) != len(name): | ||
len_msg = ( | ||
f"Length of 'prefix' ({len(prefix)}) did not match the " | ||
"length of the columns being encoded " | ||
f"({len(name)})." | ||
) | ||
raise ValueError(len_msg) | ||
name = [f"{prefix[i]}{prefix_sep}{col}" for i, col in enumerate(name)] | ||
|
||
return self._wrap_result( | ||
result, | ||
name=name, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can't users just prefix the columns by calling `.rename` after the call to get_dummies?