Skip to content

REGR: maybe_convert_objects ignoring uints #47475

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jul 10, 2022
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -808,6 +808,7 @@ Conversion
- Bug in metaclass of generic abstract dtypes causing :meth:`DataFrame.apply` and :meth:`Series.apply` to raise for the built-in function ``type`` (:issue:`46684`)
- Bug in :meth:`DataFrame.to_records` returning inconsistent numpy types if the index was a :class:`MultiIndex` (:issue:`47263`)
- Bug in :meth:`DataFrame.to_dict` for ``orient="list"`` or ``orient="index"`` was not returning native types (:issue:`46751`)
- Bug where inferring the dtype from an iterable that is *not* a NumPy ``ndarray`` and that consists entirely of NumPy unsigned integer scalars did not result in an unsigned integer dtype (:issue:`47294`)

Strings
^^^^^^^
Expand Down
17 changes: 12 additions & 5 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1283,9 +1283,9 @@ cdef class Seen:
In addition to setting a flag that an integer was seen, we
also set two flags depending on the type of integer seen:

1) sint_ : a negative (signed) number in the
1) sint_ : a signed numpy integer type or a negative (signed) number in the
range of [-2**63, 0) was encountered
2) uint_ : a positive number in the range of
2) uint_ : an unsigned numpy integer type or a positive number in the range of
[2**63, 2**64) was encountered

Parameters
Expand All @@ -1294,8 +1294,16 @@ cdef class Seen:
Value with which to set the flags.
"""
self.int_ = True
self.sint_ = self.sint_ or (oINT64_MIN <= val < 0)
self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX)
self.sint_ = (
self.sint_
or (oINT64_MIN <= val < 0)
or util.is_sinteger_object(val)
)
self.uint_ = (
self.uint_
or (oINT64_MAX < val <= oUINT64_MAX)
or util.is_uinteger_object(val)
)

@property
def numeric_(self):
Expand Down Expand Up @@ -2542,7 +2550,6 @@ def maybe_convert_objects(ndarray[object] objects,
floats[i] = <float64_t>val
complexes[i] = <double complex>val
if not seen.null_:
val = int(val)
seen.saw_int(val)

if ((seen.uint_ and seen.sint_) or
Expand Down
34 changes: 34 additions & 0 deletions pandas/_libs/tslibs/util.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ from numpy cimport (


cdef extern from "numpy/arrayobject.h":
PyTypeObject PySignedIntegerArrType_Type
PyTypeObject PyUnsignedIntegerArrType_Type
PyTypeObject PyFloatingArrType_Type

cdef extern from "numpy/ndarrayobject.h":
Expand All @@ -55,6 +57,38 @@ cdef inline int64_t get_nat():
# --------------------------------------------------------------------
# Type Checking

cdef inline bint is_sinteger_object(object obj) nogil:
"""
Cython equivalent of

`isinstance(obj, np.signedinteger)`

The check is purely type-based: the value of `obj` is never inspected,
so a plain Python ``int`` such as ``-1`` is not reported as a signed
integer object.

Parameters
----------
obj : object

Returns
-------
is_sinteger : bool
"""
return PyObject_TypeCheck(obj, &PySignedIntegerArrType_Type)

cdef inline bint is_uinteger_object(object obj) nogil:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you expect these to be used elsewhere? If not, I think the build might be marginally faster (and maybe even smaller) if they go directly in lib.pyx.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My guess is no. Also, I had reservations about the is_sinteger_object function anyway: it would return False even if, e.g., -1 is passed, which is possibly the wrong answer depending on the use. But every other function there looks only at the type of the object, so it seemed wrong to add a function with value-dependent behavior. Inlining these gets around that.

"""
Cython equivalent of

`isinstance(obj, np.unsignedinteger)`

Parameters
----------
obj : object

Returns
-------
is_uinteger : bool
"""
return PyObject_TypeCheck(obj, &PyUnsignedIntegerArrType_Type)

cdef inline bint is_integer_object(object obj) nogil:
"""
Cython equivalent of
Expand Down
45 changes: 26 additions & 19 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,25 +700,32 @@ def test_convert_int_overflow(self, value):
result = lib.maybe_convert_objects(arr)
tm.assert_numpy_array_equal(arr, result)

def test_maybe_convert_objects_uint64(self):
# see gh-4471
arr = np.array([2**63], dtype=object)
exp = np.array([2**63], dtype=np.uint64)
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)

# NumPy bug: can't compare uint64 to int64, as that
# results in both casting to float64, so we should
# make sure that this function is robust against it
arr = np.array([np.uint64(2**63)], dtype=object)
exp = np.array([2**63], dtype=np.uint64)
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)

arr = np.array([2, -1], dtype=object)
exp = np.array([2, -1], dtype=np.int64)
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)

arr = np.array([2**63, -1], dtype=object)
exp = np.array([2**63, -1], dtype=object)
# Each case pairs a list of input values with the dtype that
# lib.maybe_convert_objects is expected to produce for it.
@pytest.mark.parametrize(
"value, expected_dtype",
[
# see gh-4471
([2**63], np.uint64),
# NumPy bug: can't compare uint64 to int64, as that
# results in both casting to float64, so we should
# make sure that this function is robust against it
([np.uint64(2**63)], np.uint64),
([2, -1], np.int64),
([2**63, -1], object),
# GH#47294
# a single NumPy unsigned scalar is expected to keep its own dtype
([np.uint8(1)], np.uint8),
([np.uint16(1)], np.uint16),
([np.uint32(1)], np.uint32),
([np.uint64(1)], np.uint64),
# mixed unsigned widths are expected to give the wider unsigned dtype
([np.uint8(2), np.uint16(1)], np.uint16),
([np.uint32(2), np.uint16(1)], np.uint32),
# an unsigned scalar next to a negative Python int is expected to stay object
([np.uint32(2), -1], object),
# an unsigned scalar next to a non-negative Python int is expected to give uint64
([np.uint32(2), 1], np.uint64),
# unsigned and signed NumPy scalars together are expected to stay object
([np.uint32(2), np.int32(1)], object),
],
)
def test_maybe_convert_objects_uint(self, value, expected_dtype):
# The input is always an object-dtype array; the expected result holds the
# same values cast to ``expected_dtype``.
arr = np.array(value, dtype=object)
exp = np.array(value, dtype=expected_dtype)
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)

def test_maybe_convert_objects_datetime(self):
Expand Down
13 changes: 11 additions & 2 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ class mystring(str):
expected = DataFrame({"a": [1], "b": [2], mystring("c"): [3]}, index=index)
tm.assert_equal(df, expected)

@pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"])
@pytest.mark.parametrize(
"dtype", ["int32", "int64", "uint32", "uint64", "float32", "float64"]
)
def test_setitem_dtype(self, dtype, float_frame):
arr = np.random.randn(len(float_frame))

Expand Down Expand Up @@ -210,17 +212,24 @@ def test_setitem_dict_preserves_dtypes(self):
"a": Series([0, 1, 2], dtype="int64"),
"b": Series([1, 2, 3], dtype=float),
"c": Series([1, 2, 3], dtype=float),
"d": Series([1, 2, 3], dtype="uint32"),
}
)
df = DataFrame(
{
"a": Series([], dtype="int64"),
"b": Series([], dtype=float),
"c": Series([], dtype=float),
"d": Series([], dtype="uint32"),
}
)
for idx, b in enumerate([1, 2, 3]):
df.loc[df.shape[0]] = {"a": int(idx), "b": float(b), "c": float(b)}
df.loc[df.shape[0]] = {
"a": int(idx),
"b": float(b),
"c": float(b),
"d": np.uint32(b),
}
tm.assert_frame_equal(df, expected)

@pytest.mark.parametrize(
Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,25 @@ def test_constructor_int_overflow(self, values):
assert result[0].dtype == object
assert result[0][0] == value

# The inputs cover both unsigned-integer ndarrays and plain lists holding a
# single NumPy unsigned scalar.
@pytest.mark.parametrize(
"values",
[
np.array([1], dtype=np.uint16),
np.array([1], dtype=np.uint32),
np.array([1], dtype=np.uint64),
[np.uint16(1)],
[np.uint32(1)],
[np.uint64(1)],
],
)
def test_constructor_numpy_uints(self, values):
# GH#47294
value = values[0]
result = DataFrame(values)

# Column 0 of the resulting frame must keep the dtype and the value of the
# first input element.
assert result[0].dtype == value.dtype
assert result[0][0] == value

def test_constructor_ordereddict(self):
import random

Expand Down
12 changes: 10 additions & 2 deletions pandas/tests/indexes/multi/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,10 +540,18 @@ def test_union_duplicates(index, request):
mi1 = MultiIndex.from_arrays([values, [1] * len(values)])
mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)])
result = mi1.union(mi2)
tm.assert_index_equal(result, mi2.sort_values())
expected = mi2.sort_values()
if mi2.levels[0].dtype == np.uint64 and (mi2.get_level_values(0) < 2**63).all():
# GH#47294 - union uses lib.fast_zip, converting data to Python integers
# and loses type information. Result is then unsigned only when values are
# sufficiently large to require unsigned dtype.
expected = expected.set_levels(
[expected.levels[0].astype(int), expected.levels[1]]
)
tm.assert_index_equal(result, expected)

result = mi2.union(mi1)
tm.assert_index_equal(result, mi2.sort_values())
tm.assert_index_equal(result, expected)


@pytest.mark.parametrize(
Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,25 @@ def test_constructor_signed_int_overflow_deprecation(self):
expected = Series([1, 200, 50], dtype="uint8")
tm.assert_series_equal(ser, expected)

@pytest.mark.parametrize(
"values",
[
np.array([1], dtype=np.uint16),
np.array([1], dtype=np.uint32),
np.array([1], dtype=np.uint64),
[np.uint16(1)],
[np.uint32(1)],
[np.uint64(1)],
],
)
def test_constructor_numpy_uints(self, values):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are pd.Index or pd.array affected? What about pd.NumericIndex?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pd.array and pd.NumericIndex are not impacted; I've added tests for Index.

arr = pd.array([np.uint16(1)])
print(arr)

# <IntegerArray>
# [1]
# Length: 1, dtype: Int64

index = NumericIndex([np.uint16(1)])
print(index)

# NumericIndex([1], dtype='uint16')

Both of these behave the same as in 1.4.3.

# GH#47294
value = values[0]
result = Series(values)

assert result[0].dtype == value.dtype
assert result[0] == value

def test_constructor_unsigned_dtype_overflow(self, any_unsigned_int_numpy_dtype):
# see gh-15832
msg = "Trying to coerce negative values to unsigned integers"
Expand Down