pandas-dev · rhshadrach · Jan 22, 2025 · Aug 21, 2024 · Aug 21, 2024 · Aug 21, 2024
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -54,9 +54,8 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
-- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
+- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
-- :meth:`str.get_dummies` now accepts a  ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
 - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
 - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
 - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -2409,8 +2409,6 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
         else:
             dummies_dtype = np.bool_
         dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype)
-        if dtype == str:
-            dummies[:] = False
         dummies[indices] = True
         dummies = dummies.reshape((n_rows, n_cols))
         result = type(self)(pa.array(list(dummies)))

diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -2482,12 +2482,16 @@ def get_dummies(
         1   False  False   False
         2   True   False   True
         """
+        from pandas.core.dtypes.common import is_string_dtype
+
         from pandas.core.frame import DataFrame
 
+        if is_string_dtype(dtype):
+            raise ValueError("string dtype not supported, please use a numeric dtype")
         # we need to cast to Series of strings as only that has all
         # methods available for making the dummies...
         result, name = self._data.array._str_get_dummies(sep, dtype)
-        if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype):
+        if is_extension_array_dtype(dtype):
             return self._wrap_result(
                 DataFrame(result, columns=name, dtype=dtype),
                 name=name,

diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
@@ -421,7 +421,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
             dummies_dtype = _dtype
         else:
             dummies_dtype = np.bool_
-        dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype)
+        dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype, order="F")
 
         def _isin(test_elements: str, element: str) -> bool:
             return element in test_elements

diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas.util._test_decorators as td
 
 from pandas import (
@@ -13,11 +11,6 @@
     _testing as tm,
 )
 
-try:
-    import pyarrow as pa
-except ImportError:
-    pa = None
-
 
 def test_get_dummies(any_string_dtype):
     s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
@@ -98,30 +91,19 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype):
 
 
 # GH#47872
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_get_dummies_with_str_dtype(any_string_dtype):
     s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies("|", dtype=str)
-    expected = DataFrame(
-        [["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]],
-        columns=list("abc"),
-        dtype=str,
-    )
-    tm.assert_frame_equal(result, expected)
+    with pytest.raises(
+        ValueError, match="string dtype not supported, please use a numeric dtype"
+    ):
+        s.str.get_dummies("|", dtype=str)
 
 
 # GH#47872
 @td.skip_if_no("pyarrow")
 def test_get_dummies_with_pa_str_dtype(any_string_dtype):
     s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
-    result = s.str.get_dummies("|", dtype="str[pyarrow]")
-    expected = DataFrame(
-        [
-            ["true", "true", "false"],
-            ["true", "false", "true"],
-            ["false", "false", "false"],
-        ],
-        columns=list("abc"),
-        dtype="str[pyarrow]",
-    )
-    tm.assert_frame_equal(result, expected)
+    with pytest.raises(
+        ValueError, match="string dtype not supported, please use a numeric dtype"
+    ):
+        s.str.get_dummies("|", dtype="str[pyarrow]")