pandas-dev · jreback · Jul 10, 2022 · Jun 19, 2022 · Jun 22, 2022 · Jun 25, 2022
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -808,6 +808,7 @@ Conversion
 - Bug in metaclass of generic abstract dtypes causing :meth:`DataFrame.apply` and :meth:`Series.apply` to raise for the built-in function ``type`` (:issue:`46684`)
 - Bug in :meth:`DataFrame.to_records` returning inconsistent numpy types if the index was a :class:`MultiIndex` (:issue:`47263`)
 - Bug in :meth:`DataFrame.to_dict` for ``orient="list"`` or ``orient="index"`` was not returning native types (:issue:`46751`)
+- Bug when inferring the dtype from an iterable that is *not* a NumPy ``ndarray`` consisting of all NumPy unsigned integer scalars did not result in an unsigned integer dtype (:issue:`47294`)
 
 Strings
 ^^^^^^^

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1283,9 +1283,9 @@ cdef class Seen:
         In addition to setting a flag that an integer was seen, we
         also set two flags depending on the type of integer seen:
 
-        1) sint_ : a negative (signed) number in the
+        1) sint_ : a signed numpy integer type or a negative (signed) number in the
                    range of [-2**63, 0) was encountered
-        2) uint_ : a positive number in the range of
+        2) uint_ : an unsigned numpy integer type or a positive number in the range of
                    [2**63, 2**64) was encountered
 
         Parameters
@@ -1294,8 +1294,16 @@ cdef class Seen:
             Value with which to set the flags.
         """
         self.int_ = True
-        self.sint_ = self.sint_ or (oINT64_MIN <= val < 0)
-        self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX)
+        self.sint_ = (
+            self.sint_
+            or (oINT64_MIN <= val < 0)
+            or util.is_sinteger_object(val)
+        )
+        self.uint_ = (
+            self.uint_
+            or (oINT64_MAX < val <= oUINT64_MAX)
+            or util.is_uinteger_object(val)
+        )
 
     @property
     def numeric_(self):
@@ -2542,7 +2550,6 @@ def maybe_convert_objects(ndarray[object] objects,
             floats[i] = <float64_t>val
             complexes[i] = <double complex>val
             if not seen.null_:
-                val = int(val)
                 seen.saw_int(val)
 
                 if ((seen.uint_ and seen.sint_) or

diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd
@@ -33,6 +33,8 @@ from numpy cimport (
 
 
 cdef extern from "numpy/arrayobject.h":
+    PyTypeObject PySignedIntegerArrType_Type
+    PyTypeObject PyUnsignedIntegerArrType_Type
     PyTypeObject PyFloatingArrType_Type
 
 cdef extern from "numpy/ndarrayobject.h":
@@ -55,6 +57,38 @@ cdef inline int64_t get_nat():
 # --------------------------------------------------------------------
 # Type Checking
 
+cdef inline bint is_sinteger_object(object obj) nogil:
+    """
+    Cython equivalent of
+
+    `isinstance(val, np.signedinteger)`
+
+    Parameters
+    ----------
+    val : object
+
+    Returns
+    -------
+    is_sinteger : bool
+    """
+    return PyObject_TypeCheck(obj, &PySignedIntegerArrType_Type)
+
+cdef inline bint is_uinteger_object(object obj) nogil:
+    """
+    Cython equivalent of
+
+    `isinstance(val, np.unsignedinteger)`
+
+    Parameters
+    ----------
+    val : object
+
+    Returns
+    -------
+    is_uinteger : bool
+    """
+    return PyObject_TypeCheck(obj, &PyUnsignedIntegerArrType_Type)
+
 cdef inline bint is_integer_object(object obj) nogil:
     """
     Cython equivalent of

diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -700,25 +700,32 @@ def test_convert_int_overflow(self, value):
         result = lib.maybe_convert_objects(arr)
         tm.assert_numpy_array_equal(arr, result)
 
-    def test_maybe_convert_objects_uint64(self):
-        # see gh-4471
-        arr = np.array([2**63], dtype=object)
-        exp = np.array([2**63], dtype=np.uint64)
-        tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
-
-        # NumPy bug: can't compare uint64 to int64, as that
-        # results in both casting to float64, so we should
-        # make sure that this function is robust against it
-        arr = np.array([np.uint64(2**63)], dtype=object)
-        exp = np.array([2**63], dtype=np.uint64)
-        tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
-
-        arr = np.array([2, -1], dtype=object)
-        exp = np.array([2, -1], dtype=np.int64)
-        tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
-
-        arr = np.array([2**63, -1], dtype=object)
-        exp = np.array([2**63, -1], dtype=object)
+    @pytest.mark.parametrize(
+        "value, expected_dtype",
+        [
+            # see gh-4471
+            ([2**63], np.uint64),
+            # NumPy bug: can't compare uint64 to int64, as that
+            # results in both casting to float64, so we should
+            # make sure that this function is robust against it
+            ([np.uint64(2**63)], np.uint64),
+            ([2, -1], np.int64),
+            ([2**63, -1], object),
+            # GH#47294
+            ([np.uint8(1)], np.uint8),
+            ([np.uint16(1)], np.uint16),
+            ([np.uint32(1)], np.uint32),
+            ([np.uint64(1)], np.uint64),
+            ([np.uint8(2), np.uint16(1)], np.uint16),
+            ([np.uint32(2), np.uint16(1)], np.uint32),
+            ([np.uint32(2), -1], object),
+            ([np.uint32(2), 1], np.uint64),
+            ([np.uint32(2), np.int32(1)], object),
+        ],
+    )
+    def test_maybe_convert_objects_uint(self, value, expected_dtype):
+        arr = np.array(value, dtype=object)
+        exp = np.array(value, dtype=expected_dtype)
         tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
 
     def test_maybe_convert_objects_datetime(self):

diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
@@ -57,7 +57,9 @@ class mystring(str):
         expected = DataFrame({"a": [1], "b": [2], mystring("c"): [3]}, index=index)
         tm.assert_equal(df, expected)
 
-    @pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"])
+    @pytest.mark.parametrize(
+        "dtype", ["int32", "int64", "uint32", "uint64", "float32", "float64"]
+    )
     def test_setitem_dtype(self, dtype, float_frame):
         arr = np.random.randn(len(float_frame))
 
@@ -210,17 +212,24 @@ def test_setitem_dict_preserves_dtypes(self):
                 "a": Series([0, 1, 2], dtype="int64"),
                 "b": Series([1, 2, 3], dtype=float),
                 "c": Series([1, 2, 3], dtype=float),
+                "d": Series([1, 2, 3], dtype="uint32"),
             }
         )
         df = DataFrame(
             {
                 "a": Series([], dtype="int64"),
                 "b": Series([], dtype=float),
                 "c": Series([], dtype=float),
+                "d": Series([], dtype="uint32"),
             }
         )
         for idx, b in enumerate([1, 2, 3]):
-            df.loc[df.shape[0]] = {"a": int(idx), "b": float(b), "c": float(b)}
+            df.loc[df.shape[0]] = {
+                "a": int(idx),
+                "b": float(b),
+                "c": float(b),
+                "d": np.uint32(b),
+            }
         tm.assert_frame_equal(df, expected)
 
     @pytest.mark.parametrize(

diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -434,6 +434,25 @@ def test_constructor_int_overflow(self, values):
         assert result[0].dtype == object
         assert result[0][0] == value
 
+    @pytest.mark.parametrize(
+        "values",
+        [
+            np.array([1], dtype=np.uint16),
+            np.array([1], dtype=np.uint32),
+            np.array([1], dtype=np.uint64),
+            [np.uint16(1)],
+            [np.uint32(1)],
+            [np.uint64(1)],
+        ],
+    )
+    def test_constructor_numpy_uints(self, values):
+        # GH#47294
+        value = values[0]
+        result = DataFrame(values)
+
+        assert result[0].dtype == value.dtype
+        assert result[0][0] == value
+
     def test_constructor_ordereddict(self):
         import random
 

diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
@@ -540,10 +540,18 @@ def test_union_duplicates(index, request):
     mi1 = MultiIndex.from_arrays([values, [1] * len(values)])
     mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)])
     result = mi1.union(mi2)
-    tm.assert_index_equal(result, mi2.sort_values())
+    expected = mi2.sort_values()
+    if mi2.levels[0].dtype == np.uint64 and (mi2.get_level_values(0) < 2**63).all():
+        # GH#47294 - union uses lib.fast_zip, converting data to Python integers
+        # and loses type information. Result is then unsigned only when values are
+        # sufficiently large to require unsigned dtype.
+        expected = expected.set_levels(
+            [expected.levels[0].astype(int), expected.levels[1]]
+        )
+    tm.assert_index_equal(result, expected)
 
     result = mi2.union(mi1)
-    tm.assert_index_equal(result, mi2.sort_values())
+    tm.assert_index_equal(result, expected)
 
 
 @pytest.mark.parametrize(

diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -745,6 +745,25 @@ def test_constructor_signed_int_overflow_deprecation(self):
         expected = Series([1, 200, 50], dtype="uint8")
         tm.assert_series_equal(ser, expected)
 
+    @pytest.mark.parametrize(
+        "values",
+        [
+            np.array([1], dtype=np.uint16),
+            np.array([1], dtype=np.uint32),
+            np.array([1], dtype=np.uint64),
+            [np.uint16(1)],
+            [np.uint32(1)],
+            [np.uint64(1)],
+        ],
+    )
+    def test_constructor_numpy_uints(self, values):
+        # GH#47294
+        value = values[0]
+        result = Series(values)
+
+        assert result[0].dtype == value.dtype
+        assert result[0] == value
+
     def test_constructor_unsigned_dtype_overflow(self, any_unsigned_int_numpy_dtype):
         # see gh-15832
         msg = "Trying to coerce negative values to unsigned integers"