pandas-dev · TomAugspurger · Nov 19, 2019 · Nov 15, 2019 · Nov 15, 2019 · Nov 16, 2019
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
@@ -13,7 +13,7 @@ Text Data Types
 
 .. versionadded:: 1.0.0
 
-There are two main ways to store text data
+There are two ways to store text data in pandas:
 
 1. ``object`` -dtype NumPy array.
 2. :class:`StringDtype` extension type.
@@ -63,7 +63,39 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created
    s
    s.astype("string")
 
-Everything that follows in the rest of this document applies equally to
+.. _text.differences:
+
+Behavior differences
+^^^^^^^^^^^^^^^^^^^^
+
+These are places where the behavior of ``StringDtype`` objects differs from
+``object`` dtype:
+
+1. For ``StringDtype``, :ref:`String accessor methods<api.series.str>`
+   that return **numeric** output will always return a nullable integer dtype,
+   rather than either int or float dtype, depending on the presence of NA values.
+
+   .. ipython:: python
+
+      s = pd.Series(["a", None, "b"], dtype="string")
+      s.str.count("a")
+      s.dropna().str.count("a")
+
+   Both outputs are ``Int64`` dtype. Compare that with object-dtype:
+
+   .. ipython:: python
+
+      s.astype(object).str.count("a")
+      s.astype(object).dropna().str.count("a")
+
+   When NA values are present, the output dtype is float64.
+
+2. Some string methods, like :meth:`Series.str.decode`, are not available
+   on ``StringArray`` because ``StringArray`` only holds strings, not
+   bytes.
+
+
+Everything else that follows in the rest of this document applies equally to
 ``string`` and ``object`` dtype.
 
 .. _text.string_methods:

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -63,7 +63,7 @@ Previously, strings were typically stored in object-dtype NumPy arrays.
    ``StringDtype`` is currently considered experimental. The implementation
    and parts of the API may change without warning.
 
-The text extension type solves several issues with object-dtype NumPy arrays:
+The ``'string'`` extension type solves several issues with object-dtype NumPy arrays:
 
 1. You can accidentally store a *mixture* of strings and non-strings in an
    ``object`` dtype array. A ``StringArray`` can only store strings.

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -2208,9 +2208,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
     return objects
 
 
+_no_default = object()  # sentinel meaning "na_value was not supplied"
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1):
+def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1,
+                   object na_value=_no_default, object dtype=object):
     """
     Substitute for np.vectorize with pandas-friendly dtype inference
 
@@ -2225,14 +2229,17 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1)
     """
     cdef:
         Py_ssize_t i, n
-        ndarray[object] result
+        ndarray result
         object val
 
     n = len(arr)
-    result = np.empty(n, dtype=object)
+    result = np.empty(n, dtype=dtype)
     for i in range(n):
         if mask[i]:
-            val = arr[i]
+            if na_value is _no_default:
+                val = arr[i]
+            else:
+                val = na_value
         else:
             val = f(arr[i])
 

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -2,7 +2,7 @@
 from functools import wraps
 import re
 import textwrap
-from typing import Dict, List
+from typing import TYPE_CHECKING, Any, Callable, Dict, List
 import warnings
 
 import numpy as np
@@ -15,10 +15,14 @@
     ensure_object,
     is_bool_dtype,
     is_categorical_dtype,
+    is_extension_array_dtype,
     is_integer,
+    is_integer_dtype,
     is_list_like,
+    is_object_dtype,
     is_re,
     is_scalar,
+    is_string_dtype,
 )
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
@@ -28,9 +32,14 @@
 )
 from pandas.core.dtypes.missing import isna
 
+from pandas._typing import ArrayLike, Dtype
 from pandas.core.algorithms import take_1d
 from pandas.core.base import NoNewAttributesMixin
 import pandas.core.common as com
+from pandas.core.construction import extract_array
+
+if TYPE_CHECKING:
+    from pandas.arrays import StringArray
 
 _cpython_optimized_encoders = (
     "utf-8",
@@ -109,9 +118,48 @@ def cat_safe(list_of_columns: List, sep: str):
 
 def _na_map(f, arr, na_result=np.nan, dtype=object):
     # should really _check_ for NA
+    if is_extension_array_dtype(arr.dtype):  # extension-dtype input goes to _map_ea
+        arr = extract_array(arr, extract_numpy=True)
+        return _map_ea(f, arr, na_value=na_result, dtype=dtype)
     return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
 
 
+def _map_ea(
+    func: Callable, arr: "StringArray", na_value: Any, dtype: Dtype
+) -> ArrayLike:  # apply ``func`` to each non-missing element of a StringArray
+    from pandas.arrays import IntegerArray, StringArray
+
+    mask = isna(arr)  # True where the input value is missing
+
+    assert isinstance(arr, StringArray)  # only StringArray is handled here
+    arr = arr._ndarray  # continue with the array held in ``_ndarray``
+
+    if is_integer_dtype(dtype):
+        na_value_is_na = isna(na_value)
+        if na_value_is_na:
+            na_value = 1  # placeholder; the mask keeps these slots missing
+        result = lib.map_infer_mask(
+            arr,
+            func,
+            mask.view("uint8"),
+            convert=False,
+            na_value=na_value,
+            dtype=np.dtype("int64"),
+        )
+
+        if not na_value_is_na:
+            mask[:] = False  # a real fill value was supplied: nothing is missing
+
+        return IntegerArray(result, mask)
+
+    elif is_string_dtype(dtype) and not is_object_dtype(dtype):
+        result = lib.map_infer_mask(arr, func, mask.view("uint8"), na_value=na_value)
+        return StringArray(result)
+    # TODO: BooleanArray (until then, boolean results take the fallback below)
+    else:
+        return lib.map_infer_mask(arr, func, mask.view("uint8"))
+
+
 def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
     if not len(arr):
         return np.ndarray(0, dtype=dtype)

diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -731,7 +731,10 @@ def test_count(self):
         tm.assert_series_equal(result, exp)
 
         # mixed
-        mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]
+        mixed = np.array(
+            ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
+            dtype=object,
+        )
         rs = strings.str_count(mixed, "a")
         xp = np.array([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan])
         tm.assert_numpy_array_equal(rs, xp)
@@ -755,14 +758,14 @@ def test_contains(self):
         expected = np.array([False, np.nan, False, False, True], dtype=np.object_)
         tm.assert_numpy_array_equal(result, expected)
 
-        values = ["foo", "xyz", "fooommm__foo", "mmm_"]
+        values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object)
         result = strings.str_contains(values, pat)
         expected = np.array([False, False, True, True])
         assert result.dtype == np.bool_
         tm.assert_numpy_array_equal(result, expected)
 
         # case insensitive using regex
-        values = ["Foo", "xYz", "fOOomMm__fOo", "MMM_"]
+        values = np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object)
         result = strings.str_contains(values, "FOO|mmm", case=False)
         expected = np.array([True, False, True, True])
         tm.assert_numpy_array_equal(result, expected)
@@ -773,7 +776,10 @@ def test_contains(self):
         tm.assert_numpy_array_equal(result, expected)
 
         # mixed
-        mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]
+        mixed = np.array(
+            ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
+            dtype=object,
+        )
         rs = strings.str_contains(mixed, "o")
         xp = np.array(
             [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan],
@@ -869,7 +875,10 @@ def test_endswith(self):
         tm.assert_series_equal(result, exp.fillna(False).astype(bool))
 
         # mixed
-        mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]
+        mixed = np.array(
+            ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
+            dtype=object,
+        )
         rs = strings.str_endswith(mixed, "f")
         xp = np.array(
             [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan],
@@ -3488,10 +3497,13 @@ def test_casefold(self):
 
 
 def test_string_array(any_string_method):
+    method_name, args, kwargs = any_string_method
+    if method_name == "decode":
+        pytest.skip("decode requires bytes.")
+
     data = ["a", "bb", np.nan, "ccc"]
     a = Series(data, dtype=object)
     b = Series(data, dtype="string")
-    method_name, args, kwargs = any_string_method
 
     expected = getattr(a.str, method_name)(*args, **kwargs)
     result = getattr(b.str, method_name)(*args, **kwargs)
@@ -3502,8 +3514,29 @@ def test_string_array(any_string_method):
         ):
             assert result.dtype == "string"
             result = result.astype(object)
+
+        elif expected.dtype == "float" and expected.isna().any():
+            assert result.dtype == "Int64"
+            result = result.astype("float")
+
     elif isinstance(expected, DataFrame):
         columns = expected.select_dtypes(include="object").columns
         assert all(result[columns].dtypes == "string")
         result[columns] = result[columns].astype(object)
     tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method,args,expected",
+    [  # expected values pair up with the two elements of the Series built below
+        ("count", ("a",), [2, None]),
+        ("find", ("a",), [0, None]),
+        ("index", ("a",), [0, None]),
+        ("rindex", ("a",), [2, None]),
+    ],
+)
+def test_string_array_numeric_integer_array(method, args, expected):
+    s = Series(["aba", None], dtype="string")  # one string, one missing value
+    result = getattr(s.str, method)(*args)
+    expected = Series(expected, dtype="Int64")  # result must be Int64 dtype
+    tm.assert_series_equal(result, expected)