DEPR: Changing default of str.extract(expand=False) to str.extract(expand=True) (#19118)

datapythonista · jorisvandenbossche · commit ce435dfefaec · 2018-02-05T09:39:43.000+01:00
diff --git a/doc/source/text.rst b/doc/source/text.rst
@@ -218,7 +218,8 @@ Extract first match in each subject (extract)
    ``DataFrame``, depending on the subject and regular expression
    pattern (same behavior as pre-0.18.0). When ``expand=True`` it
    always returns a ``DataFrame``, which is more consistent and less
-   confusing from the perspective of a user.
+   confusing from the perspective of a user. ``expand=True`` has been
+   the default since version 0.23.0.
 
 The ``extract`` method accepts a `regular expression
 <https://docs.python.org/3/library/re.html>`__ with at least one
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -296,6 +296,53 @@ Build Changes
 - Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`)
 - Updated conda recipe to be in compliance with conda-build 3.0+ (:issue:`18002`)
 
+Extraction of matching patterns from strings
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+By default, extracting matching patterns from strings with :func:`str.extract` used to return a
+``Series`` if a single group was being extracted (a ``DataFrame`` if more than one group was
+extracted). As of Pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame``, unless
+``expand`` is set to ``False`` (:issue:`11386`).
+
+Also, ``None`` used to be an accepted value for the ``expand`` parameter (equivalent to
+``False``), but passing it now raises a ``ValueError``.
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [1]: s = pd.Series(['number 10', '12 eggs'])
+
+    In [2]: extracted = s.str.extract(r'.*(\d\d).*')
+
+    In [3]: extracted
+    Out[3]:
+    0    10
+    1    12
+    dtype: object
+
+    In [4]: type(extracted)
+    Out[4]:
+    pandas.core.series.Series
+
+New Behavior:
+
+.. ipython:: python
+
+    s = pd.Series(['number 10', '12 eggs'])
+    extracted = s.str.extract(r'.*(\d\d).*')
+    extracted
+    type(extracted)
+
+To restore previous behavior, simply set ``expand`` to ``False``:
+
+.. ipython:: python
+
+    s = pd.Series(['number 10', '12 eggs'])
+    extracted = s.str.extract(r'.*(\d\d).*', expand=False)
+    extracted
+    type(extracted)
+
 .. _whatsnew_0230.api:
 
 Other API Changes
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -598,7 +598,7 @@ def _str_extract_frame(arr, pat, flags=0):
         dtype=object)
 
 
-def str_extract(arr, pat, flags=0, expand=None):
+def str_extract(arr, pat, flags=0, expand=True):
     r"""
     For each subject string in the Series, extract groups from the
     first match of regular expression pat.
@@ -610,7 +610,7 @@ def str_extract(arr, pat, flags=0, expand=None):
     flags : int, default 0 (no flags)
         re module flags, e.g. re.IGNORECASE
 
-    expand : bool, default False
+    expand : bool, default True
         * If True, return DataFrame.
         * If False, return Series/Index/DataFrame.
 
@@ -676,15 +676,6 @@ def str_extract(arr, pat, flags=0, expand=None):
     dtype: object
 
     """
-    if expand is None:
-        warnings.warn(
-            "currently extract(expand=None) " +
-            "means expand=False (return Index/Series/DataFrame) " +
-            "but in a future version of pandas this will be changed " +
-            "to expand=True (return DataFrame)",
-            FutureWarning,
-            stacklevel=3)
-        expand = False
     if not isinstance(expand, bool):
         raise ValueError("expand must be True or False")
     if expand:
@@ -1739,7 +1730,7 @@ def translate(self, table, deletechars=None):
     findall = _pat_wrapper(str_findall, flags=True)
 
     @copy(str_extract)
-    def extract(self, pat, flags=0, expand=None):
+    def extract(self, pat, flags=0, expand=True):
         return str_extract(self, pat, flags=flags, expand=expand)
 
     @copy(str_extractall)
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -612,13 +612,16 @@ def test_match(self):
 
     def test_extract_expand_None(self):
         values = Series(['fooBAD__barBAD', NA, 'foo'])
-        with tm.assert_produces_warning(FutureWarning):
+        with tm.assert_raises_regex(ValueError,
+                                    'expand must be True or False'):
             values.str.extract('.*(BAD[_]+).*(BAD)', expand=None)
 
     def test_extract_expand_unspecified(self):
         values = Series(['fooBAD__barBAD', NA, 'foo'])
-        with tm.assert_produces_warning(FutureWarning):
-            values.str.extract('.*(BAD[_]+).*(BAD)')
+        result_unspecified = values.str.extract('.*(BAD[_]+).*')
+        assert isinstance(result_unspecified, DataFrame)
+        result_true = values.str.extract('.*(BAD[_]+).*', expand=True)
+        tm.assert_frame_equal(result_unspecified, result_true)
 
     def test_extract_expand_False(self):
         # Contains tests like those in test_match and some others.