Open
Description
While cleaning up some string tests, I noticed that the setitem validation error message differs between the pyarrow and python storage backends for StringDtype (I will open a PR to make that consistent), which made me wonder what the situation is in general. I am creating an overview here, similar to #59580 (error messages in reduction operations).
dtype | val | exception | message |
---|---|---|---|
string | int | TypeError | Scalar must be NA or str |
datetime | int | TypeError | value should be a 'Timestamp', 'NaT', or array of those. Got 'int' instead. |
datetime-tz | int | TypeError | value should be a 'Timestamp', 'NaT', or array of those. Got 'int' instead. |
datetime-tz | timestamp | TypeError | Cannot compare tz-naive and tz-aware datetime-like objects |
period | timestamp | TypeError | value should be a 'Period', 'NaT', or array of those. Got 'Timestamp' instead. |
timedelta | timestamp | TypeError | value should be a 'Timedelta', 'NaT', or array of those. Got 'Timestamp' instead. |
range | str | ValueError | invalid literal for int() with base 10: 'str' |
range | timestamp | TypeError | int() argument must be a string, a bytes-like object or a real number, not 'Timestamp' |
int8 | str | ValueError | invalid literal for int() with base 10: 'str' |
int8 | timestamp | TypeError | int() argument must be a string, a bytes-like object or a real number, not 'Timestamp' |
int8 | interval | TypeError | int() argument must be a string, a bytes-like object or a real number, not 'pandas._libs.interval.Interval' |
int16 | str | ValueError | invalid literal for int() with base 10: 'str' |
int16 | timestamp | TypeError | int() argument must be a string, a bytes-like object or a real number, not 'Timestamp' |
int32 | str | ValueError | invalid literal for int() with base 10: 'str' |
int32 | timestamp | TypeError | int() argument must be a string, a bytes-like object or a real number, not 'Timestamp' |
int64 | str | ValueError | invalid literal for int() with base 10: 'str' |
int64 | timestamp | TypeError | int() argument must be a string, a bytes-like object or a real number, not 'Timestamp' |
uint8 | str | ValueError | invalid literal for int() with base 10: 'str' |
uint8 | timestamp | TypeError | int() argument must be a string, a bytes-like object or a real number, not 'Timestamp' |
uint16 | str | ValueError | invalid literal for int() with base 10: 'str' |
uint16 | timestamp | TypeError | int() argument must be a string, a bytes-like object or a real number, not 'Timestamp' |
uint32 | str | ValueError | invalid literal for int() with base 10: 'str' |
uint32 | timestamp | TypeError | int() argument must be a string, a bytes-like object or a real number, not 'Timestamp' |
uint64 | str | ValueError | invalid literal for int() with base 10: 'str' |
uint64 | timestamp | TypeError | int() argument must be a string, a bytes-like object or a real number, not 'Timestamp' |
float32 | str | ValueError | could not convert string to float: 'str' |
float32 | timestamp | TypeError | float() argument must be a string or a real number, not 'Timestamp' |
float64 | str | ValueError | could not convert string to float: 'str' |
float64 | timestamp | TypeError | float() argument must be a string or a real number, not 'Timestamp' |
complex64 | str | ValueError | complex() arg is a malformed string |
complex64 | timestamp | TypeError | must be real number, not Timestamp |
complex128 | str | ValueError | complex() arg is a malformed string |
complex128 | timestamp | TypeError | must be real number, not Timestamp |
categorical | int | TypeError | Cannot setitem on a Categorical with a new category (1), set the categories first |
categorical | timestamp | TypeError | Cannot setitem on a Categorical with a new category (2020-01-01 00:00:00), set the categories first |
interval | int | TypeError | 'value' should be an interval type, got <class 'int'> instead. |
nullable_int | str | TypeError | Invalid value 'str' for dtype Int64 |
nullable_int | timestamp | TypeError | Invalid value '2020-01-01 00:00:00' for dtype Int64 |
nullable_int | interval | TypeError | Invalid value '(0, 1]' for dtype Int64 |
nullable_uint | str | TypeError | Invalid value 'str' for dtype UInt16 |
nullable_float | str | TypeError | Invalid value 'str' for dtype Float32 |
nullable_bool | int | TypeError | Invalid value '1' for dtype boolean |
nullable_bool | str | TypeError | Invalid value 'str' for dtype boolean |
string-python | int | TypeError | Cannot set non-string value '1' into a StringArray. |
string-python | timestamp | TypeError | Cannot set non-string value '2020-01-01 00:00:00' into a StringArray. |
string-pyarrow | int | TypeError | Scalar must be NA or str |
string-pyarrow | timestamp | TypeError | Scalar must be NA or str |
The code used to generate the table above (the table is a trimmed version of the output, with some rows that had identical results removed):
import numpy as np
import pandas as pd
from pandas import Index, CategoricalIndex, IntervalIndex
# from conftest.py
indices_dict = {
"object": Index([f"pandas_{i}" for i in range(10)], dtype=object),
"string": Index([f"pandas_{i}" for i in range(10)], dtype="str"),
"datetime": pd.date_range("2020-01-01", periods=10),
"datetime-tz": pd.date_range("2020-01-01", periods=10, tz="US/Pacific"),
"period": pd.period_range("2020-01-01", periods=10, freq="D"),
"timedelta": pd.timedelta_range(start="1 day", periods=10, freq="D"),
"range": pd.RangeIndex(10),
"int8": Index(np.arange(10), dtype="int8"),
"int16": Index(np.arange(10), dtype="int16"),
"int32": Index(np.arange(10), dtype="int32"),
"int64": Index(np.arange(10), dtype="int64"),
"uint8": Index(np.arange(10), dtype="uint8"),
"uint16": Index(np.arange(10), dtype="uint16"),
"uint32": Index(np.arange(10), dtype="uint32"),
"uint64": Index(np.arange(10), dtype="uint64"),
"float32": Index(np.arange(10), dtype="float32"),
"float64": Index(np.arange(10), dtype="float64"),
"bool-object": Index([True, False] * 5, dtype=object),
"bool-dtype": Index([True, False] * 5, dtype=bool),
"complex64": Index(
np.arange(10, dtype="complex64") + 1.0j * np.arange(10, dtype="complex64")
),
"complex128": Index(
np.arange(10, dtype="complex128") + 1.0j * np.arange(10, dtype="complex128")
),
"categorical": CategoricalIndex(list("abcd") * 2),
"interval": IntervalIndex.from_breaks(np.linspace(0, 100, num=11)),
# "empty": Index([]),
"nullable_int": Index(np.arange(10), dtype="Int64"),
"nullable_uint": Index(np.arange(10), dtype="UInt16"),
"nullable_float": Index(np.arange(10), dtype="Float32"),
"nullable_bool": Index(np.arange(10).astype(bool), dtype="boolean"),
"string-python": Index(
pd.array([f"pandas_{i}" for i in range(10)], dtype="string[python]")
),
"string-pyarrow": Index(pd.array([f"pandas_{i}" for i in range(10)], dtype="string[pyarrow]"))
}
# Probe values paired with the label used for them in the "val" column.
probes = [
    (1, "int"),
    ("str", "str"),
    (pd.Timestamp("2020-01-01"), "timestamp"),
    (pd.Interval(0, 1), "interval"),
]

# Try to assign every probe into the first slot of each index's backing
# array; only assignments that raise are recorded.
# NOTE(review): a successful assignment presumably writes through to the
# index's own data, so later probes may see a modified first element --
# confirm whether that matters.
results = []
for label, index in indices_dict.items():
    for value, value_label in probes:
        try:
            index.array[0] = value
        except Exception as err:  # broad on purpose: every failure is collected
            row = (label, value_label, type(err).__name__, str(err))
            results.append(row)

# One row per failed assignment, printed plain and then as a markdown table.
df = pd.DataFrame(results, columns=["dtype", "val", "exception", "message"])
print(df)
markdown_table = df.set_index("dtype").to_markdown()
print(markdown_table)