Skip to content

Commit ce435df

Browse files
datapythonista authored and jorisvandenbossche committed
DEPR: Changing default of str.extract(expand=False) to str.extract(expand=True) (#19118)
1 parent de39a15 commit ce435df

File tree

4 files changed

+58
-16
lines changed

4 files changed

+58
-16
lines changed

doc/source/text.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,8 @@ Extract first match in each subject (extract)
218218
``DataFrame``, depending on the subject and regular expression
219219
pattern (same behavior as pre-0.18.0). When ``expand=True`` it
220220
always returns a ``DataFrame``, which is more consistent and less
221-
confusing from the perspective of a user.
221+
confusing from the perspective of a user. ``expand=True`` is the
222+
default since version 0.23.0.
222223

223224
The ``extract`` method accepts a `regular expression
224225
<https://docs.python.org/3/library/re.html>`__ with at least one

doc/source/whatsnew/v0.23.0.txt

+47
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,53 @@ Build Changes
296296
- Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`)
297297
- Updated conda recipe to be in compliance with conda-build 3.0+ (:issue:`18002`)
298298

299+
Extraction of matching patterns from strings
300+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
301+
302+
By default, extracting matching patterns from strings with :func:`str.extract` used to return a
303+
``Series`` if a single group was being extracted (a ``DataFrame`` if more than one group was
304+
extracted). As of Pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame``, unless
305+
``expand`` is set to ``False`` (:issue:`11386`).
306+
307+
Also, ``None`` was an accepted value for the ``expand`` parameter (which was equivalent to
308+
``False``), but now raises a ``ValueError``.
309+
310+
Previous Behavior:
311+
312+
.. code-block:: ipython
313+
314+
In [1]: s = pd.Series(['number 10', '12 eggs'])
315+
316+
In [2]: extracted = s.str.extract(r'.*(\d\d).*')
317+
318+
In [3]: extracted
319+
Out[3]:
320+
0 10
321+
1 12
322+
dtype: object
323+
324+
In [4]: type(extracted)
325+
Out[4]:
326+
pandas.core.series.Series
327+
328+
New Behavior:
329+
330+
.. ipython:: python
331+
332+
s = pd.Series(['number 10', '12 eggs'])
333+
extracted = s.str.extract(r'.*(\d\d).*')
334+
extracted
335+
type(extracted)
336+
337+
To restore previous behavior, simply set ``expand`` to ``False``:
338+
339+
.. ipython:: python
340+
341+
s = pd.Series(['number 10', '12 eggs'])
342+
extracted = s.str.extract(r'.*(\d\d).*', expand=False)
343+
extracted
344+
type(extracted)
345+
299346
.. _whatsnew_0230.api:
300347

301348
Other API Changes

pandas/core/strings.py

+3-12
Original file line numberDiff line numberDiff line change
@@ -598,7 +598,7 @@ def _str_extract_frame(arr, pat, flags=0):
598598
dtype=object)
599599

600600

601-
def str_extract(arr, pat, flags=0, expand=None):
601+
def str_extract(arr, pat, flags=0, expand=True):
602602
r"""
603603
For each subject string in the Series, extract groups from the
604604
first match of regular expression pat.
@@ -610,7 +610,7 @@ def str_extract(arr, pat, flags=0, expand=None):
610610
flags : int, default 0 (no flags)
611611
re module flags, e.g. re.IGNORECASE
612612
613-
expand : bool, default False
613+
expand : bool, default True
614614
* If True, return DataFrame.
615615
* If False, return Series/Index/DataFrame.
616616
@@ -676,15 +676,6 @@ def str_extract(arr, pat, flags=0, expand=None):
676676
dtype: object
677677
678678
"""
679-
if expand is None:
680-
warnings.warn(
681-
"currently extract(expand=None) " +
682-
"means expand=False (return Index/Series/DataFrame) " +
683-
"but in a future version of pandas this will be changed " +
684-
"to expand=True (return DataFrame)",
685-
FutureWarning,
686-
stacklevel=3)
687-
expand = False
688679
if not isinstance(expand, bool):
689680
raise ValueError("expand must be True or False")
690681
if expand:
@@ -1739,7 +1730,7 @@ def translate(self, table, deletechars=None):
17391730
findall = _pat_wrapper(str_findall, flags=True)
17401731

17411732
@copy(str_extract)
1742-
def extract(self, pat, flags=0, expand=None):
1733+
def extract(self, pat, flags=0, expand=True):
17431734
return str_extract(self, pat, flags=flags, expand=expand)
17441735

17451736
@copy(str_extractall)

pandas/tests/test_strings.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -612,13 +612,16 @@ def test_match(self):
612612

613613
def test_extract_expand_None(self):
614614
values = Series(['fooBAD__barBAD', NA, 'foo'])
615-
with tm.assert_produces_warning(FutureWarning):
615+
with tm.assert_raises_regex(ValueError,
616+
'expand must be True or False'):
616617
values.str.extract('.*(BAD[_]+).*(BAD)', expand=None)
617618

618619
def test_extract_expand_unspecified(self):
619620
values = Series(['fooBAD__barBAD', NA, 'foo'])
620-
with tm.assert_produces_warning(FutureWarning):
621-
values.str.extract('.*(BAD[_]+).*(BAD)')
621+
result_unspecified = values.str.extract('.*(BAD[_]+).*')
622+
assert isinstance(result_unspecified, DataFrame)
623+
result_true = values.str.extract('.*(BAD[_]+).*', expand=True)
624+
tm.assert_frame_equal(result_unspecified, result_true)
622625

623626
def test_extract_expand_False(self):
624627
# Contains tests like those in test_match and some others.

0 commit comments

Comments
 (0)