pandas-dev · jreback · Jan 24, 2020 · Jan 11, 2020 · Jan 11, 2020 · Jan 13, 2020
diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
@@ -806,7 +806,8 @@ dtype, it will use ``pd.NA``:
 
 Currently, pandas does not yet use those data types by default (when creating
 a DataFrame or Series, or when reading in data), so you need to specify
-the dtype explicitly.
+the dtype explicitly. An easy way to convert to those dtypes is explained in the
+:ref:`Conversion <missing_data.NA.Conversion>` section.
 
 Propagation in arithmetic and comparison operations
 ---------------------------------------------------
@@ -945,3 +946,25 @@ work with ``NA``, and generally return ``NA``:
    in the future.
 
 See :ref:`dsintro.numpy_interop` for more on ufuncs.
+
+.. _missing_data.NA.Conversion:
+
+Conversion
+----------
+
+If you have a DataFrame or Series using traditional types that have missing data
+represented using ``np.nan``, the convenience methods
+:meth:`Series.as_nullable_dtypes` and :meth:`DataFrame.as_nullable_dtypes`
+convert the data to the newer nullable dtypes for integers, strings and
+booleans listed :ref:`here <basics.dtypes>`. This is especially helpful after reading
+in data sets with readers such as :func:`read_csv`, which infer default dtypes.
+
+In this example, all columns are processed, but we only show the dtypes of
+the first 10 columns.
+
+.. ipython:: python
+
+   bb = pd.read_csv('data/baseball.csv', index_col='id')
+   bb[bb.columns[:10]].dtypes
+   bbn = bb.as_nullable_dtypes()
+   bbn[bbn.columns[:10]].dtypes
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -228,6 +228,7 @@ Other enhancements
 - Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`)
 - :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`)
 - :meth:`DataFrame.to_pickle` and :func:`read_pickle` now accept URL (:issue:`30163`)
+- Added :meth:`DataFrame.as_nullable_dtypes` and :meth:`Series.as_nullable_dtypes` to convert data to dtypes that support ``pd.NA`` (:issue:`29752`)
 
 
 Build Changes

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -62,6 +62,7 @@
     is_extension_array_dtype,
     is_float,
     is_integer,
+    is_integer_dtype,
     is_list_like,
     is_number,
     is_numeric_dtype,
@@ -72,6 +73,7 @@
     is_timedelta64_dtype,
     pandas_dtype,
 )
+from pandas.core.dtypes.dtypes import registry
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
 from pandas.core.dtypes.inference import is_hashable
 from pandas.core.dtypes.missing import isna, notna
@@ -5906,6 +5908,129 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries:
             )
         ).__finalize__(self)
 
+    # ----------------------------------------------------------------------
+    # Convert to types that support pd.NA
+
+    def _as_nullable_dtype(self: ABCSeries) -> ABCSeries:
+        """
+        Convert a single Series to a dtype that supports ``pd.NA``, if possible.
+
+        Rules, checked in this order:
+
+        * object dtype: infer the content with ``lib.infer_dtype``. When it is
+          inferred as "string" or "boolean", convert to that dtype; when it is
+          inferred as "integer", convert to "Int64". Anything else is left alone.
+        * integer dtype that is not an extension type: convert to the nullable
+          integer dtype registered for the same numpy scalar type (for example
+          int32 -> "Int32", uint32 -> "UInt32"), falling back to "Int64".
+        * boolean dtype that is not an extension type: convert to "boolean".
+          An already nullable boolean dtype is left alone.
+        * any other numeric dtype: try ``astype("Int64")`` and leave the data
+          alone if that raises ``TypeError``.
+
+        Returns
+        -------
+        Series
+            The result of ``astype`` when a conversion applies, otherwise a new
+            object built from ``self`` via ``self._constructor``.
+        """
+        dtype = self.dtype
+        target_int_dtype = "Int64"
+        # Stays None when none of the rules below selects a conversion.
+        new_dtype = None
+
+        if is_object_dtype(dtype):
+            inferred = lib.infer_dtype(self)
+            if inferred == "integer":
+                # object data carries no integer width to preserve
+                new_dtype = target_int_dtype
+            elif inferred in {"string", "boolean"}:
+                new_dtype = inferred
+        elif is_integer_dtype(dtype):
+            if not is_extension_array_dtype(dtype):
+                # Map numpy scalar type -> name of the matching nullable integer
+                # dtype so that width and signedness are preserved.
+                nullable_int_names = {
+                    sd.type: sd.name
+                    for sd in registry.dtypes
+                    if isinstance(sd.name, str) and "Int" in sd.name
+                }
+                new_dtype = nullable_int_names.get(dtype.type, target_int_dtype)
+        elif dtype.kind == "b":
+            # Must be checked before the generic numeric branch: bool counts as
+            # numeric for is_numeric_dtype, so it would otherwise be cast to
+            # Int64 and come back as 0/1 instead of True/False.
+            if not is_extension_array_dtype(dtype):
+                new_dtype = "boolean"
+        elif is_numeric_dtype(dtype):
+            try:
+                return self.astype(target_int_dtype)
+            except TypeError:
+                # The values cannot be represented as integers (for example
+                # floats with a fractional part), so keep the original dtype.
+                pass
+
+        if new_dtype is not None:
+            return self.astype(new_dtype)
+        return self._constructor(self).__finalize__(self)
+
+    def as_nullable_dtypes(self: FrameOrSeries) -> FrameOrSeries:
+        """
+        Convert columns of DataFrame or a Series to types supporting ``pd.NA``.
+
+        | If the dtype is "object", convert to "string", "boolean" or an appropriate
+          integer type.
+        | If the dtype is "integer", convert to an appropriate integer type.
+        | If the dtype is numeric, and consists of all integers, convert to an
+          appropriate type.
+
+        Columns that match none of these rules keep their dtype. For a
+        DataFrame each column is converted independently.
+
+        Returns
+        -------
+        converted : a copy of the same type as caller
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
+        ...         "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
+        ...         "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
+        ...         "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
+        ...         "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
+        ...         "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
+        ...     }
+        ... )
+
+        >>> df
+           a  b      c    d     e      f
+        0  1  x   True    h  10.0    NaN
+        1  2  y  False    i   NaN  100.5
+        2  3  z    NaN  NaN  20.0  200.0
+
+        >>> df.dtypes
+        a      int32
+        b     object
+        c     object
+        d     object
+        e    float64
+        f    float64
+        dtype: object
+
+        >>> dfn = df.as_nullable_dtypes()
+        >>> dfn
+           a  b      c     d     e      f
+        0  1  x   True     h    10    NaN
+        1  2  y  False     i  <NA>  100.5
+        2  3  z   <NA>  <NA>    20  200.0
+
+        >>> dfn.dtypes
+        a      Int32
+        b     string
+        c    boolean
+        d     string
+        e      Int64
+        f    float64
+        dtype: object
+        """
+        if self.ndim == 1:
+            return self._as_nullable_dtype()
+
+        converted = [col._as_nullable_dtype() for _, col in self.items()]
+        if not converted:
+            # A frame without columns has nothing to convert, and pd.concat
+            # raises ValueError when it is given no objects.
+            return self.copy()
+
+        result = pd.concat(converted, axis=1, copy=False)
+        # Restore the original column index rather than relying on the labels
+        # concat derives from the individual Series names.
+        result.columns = self.columns
+        return result
+
     # ----------------------------------------------------------------------
     # Filling NA's
 

diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
@@ -1072,6 +1072,24 @@ def test_str_to_small_float_conversion_type(self):
         expected = pd.DataFrame(col_data, columns=["A"], dtype=float)
         tm.assert_frame_equal(result, expected)
 
+    def test_as_nullable_dtypes(self):
+        # Smoke test for the DataFrame entry point only; the individual dtype
+        # conversions are exercised in tests/series/test_dtypes.py
+        int_col = pd.Series([1, 2, 3], dtype=np.dtype("int32"))
+        str_col = pd.Series(["x", "y", "z"], dtype=np.dtype("O"))
+        df = pd.DataFrame({"a": int_col, "b": str_col})
+
+        expected = pd.DataFrame(
+            {
+                "a": pd.Series([1, 2, 3], dtype="Int32"),
+                "b": pd.Series(["x", "y", "z"], dtype="string"),
+            }
+        )
+
+        result = df.as_nullable_dtypes()
+        tm.assert_frame_equal(result, expected)
+
 
 class TestDataFrameDatetimeWithTZ:
     def test_interleave(self, timezone_frame):

diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
@@ -487,3 +487,25 @@ def test_reindex_astype_order_consistency(self):
         s1 = s.reindex(new_index).astype(temp_dtype).astype(new_dtype)
         s2 = s.astype(temp_dtype).reindex(new_index).astype(new_dtype)
         tm.assert_series_equal(s1, s2)
+
+    @pytest.mark.parametrize(
+        "stup",
+        [
+            # numpy integers keep their width and signedness
+            (Series([1, 2, 3], dtype=np.dtype("int32")), "Int32"),
+            (Series([1, 2, 3], dtype=np.dtype("int64")), "Int64"),
+            # object columns are converted according to their inferred content
+            (Series(["x", "y", "z"], dtype=np.dtype("O")), pd.StringDtype()),
+            (Series([True, False, np.nan], dtype=np.dtype("O")), pd.BooleanDtype()),
+            (Series(["h", "i", np.nan], dtype=np.dtype("O")), pd.StringDtype()),
+            # floats become Int64 only when every value is integral
+            (Series([10, np.nan, 20], dtype=np.dtype("float")), pd.Int64Dtype()),
+            (Series([np.nan, 100.5, 200], dtype=np.dtype("float")), np.dtype("float")),
+            # already nullable, and non-convertible object data, are unchanged
+            (Series([3, 4, 5], dtype="Int8"), "Int8"),
+            (Series([[1, 2], [3, 4], [5]]), np.dtype("O")),
+            (Series([4, 5, 6], dtype=np.dtype("uint32")), "UInt32"),
+            (Series([-10, 12, 13], dtype=np.dtype("i1")), "Int8"),
+        ],
+    )
+    def test_as_nullable_dtypes(self, stup):
+        # each case is an (input Series, dtype expected after conversion) pair
+        ser, expected_dtype = stup
+        converted = ser.as_nullable_dtypes()
+        assert converted.dtype == expected_dtype