Skip to content

ENH: Implement convert_dtypes on block level #55341

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Nov 17, 2023
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1133,7 +1133,7 @@ def convert_dtypes(
base_dtype = inferred_dtype
if (
base_dtype.kind == "O" # type: ignore[union-attr]
and len(input_array) > 0
and input_array.size > 0
and isna(input_array).all()
):
import pyarrow as pa
Expand Down
40 changes: 10 additions & 30 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6940,36 +6940,16 @@ def convert_dtypes(
dtype: string
"""
check_dtype_backend(dtype_backend)
if self.ndim == 1:
return self._convert_dtypes(
infer_objects,
convert_string,
convert_integer,
convert_boolean,
convert_floating,
dtype_backend=dtype_backend,
)
else:
results = [
col._convert_dtypes(
infer_objects,
convert_string,
convert_integer,
convert_boolean,
convert_floating,
dtype_backend=dtype_backend,
)
for col_name, col in self.items()
]
if len(results) > 0:
result = concat(results, axis=1, copy=False, keys=self.columns)
cons = cast(type["DataFrame"], self._constructor)
result = cons(result)
result = result.__finalize__(self, method="convert_dtypes")
# https://github.com/python/mypy/issues/8354
return cast(Self, result)
else:
return self.copy(deep=None)
new_mgr = self._mgr.convert_dtypes( # type: ignore[union-attr]
infer_objects=infer_objects,
convert_string=convert_string,
convert_integer=convert_integer,
convert_boolean=convert_boolean,
convert_floating=convert_floating,
dtype_backend=dtype_backend,
)
res = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
return res.__finalize__(self, method="convert_dtypes")

# ----------------------------------------------------------------------
# Filling NA's
Expand Down
55 changes: 55 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from pandas._typing import (
ArrayLike,
AxisInt,
DtypeBackend,
DtypeObj,
F,
FillnaOptions,
Expand All @@ -55,6 +56,7 @@
from pandas.core.dtypes.cast import (
LossySetitemError,
can_hold_element,
convert_dtypes,
find_result_type,
maybe_downcast_to_dtype,
np_can_hold_element,
Expand Down Expand Up @@ -636,6 +638,52 @@ def convert(
res_values = maybe_coerce_values(res_values)
return [self.make_block(res_values, refs=refs)]

def convert_dtypes(
    self,
    copy: bool,
    using_cow: bool,
    infer_objects: bool = True,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
    convert_floating: bool = True,
    dtype_backend: DtypeBackend = "numpy_nullable",
) -> list[Block]:
    """
    Cast this block to the dtypes selected by ``convert_dtypes``.

    Parameters
    ----------
    copy : bool
        Forwarded as ``deep=`` to ``Block.copy`` and as ``copy=`` to
        ``Block.astype`` for the returned blocks.
    using_cow : bool
        Forwarded to ``self.convert`` when object inference is performed.
    infer_objects : bool, default True
        If True and this is an object block, run ``self.convert`` first and
        operate on the blocks it returns.
    convert_string, convert_integer, convert_boolean, convert_floating : bool
        Forwarded to ``convert_dtypes``. If all four are False, no target
        dtype is computed and only copies are returned.
    dtype_backend : DtypeBackend, default "numpy_nullable"
        Forwarded to ``convert_dtypes``.

    Returns
    -------
    list[Block]
    """
    if infer_objects and self.is_object:
        candidates = self.convert(copy=False, using_cow=using_cow)
    else:
        candidates = [self]

    wants_conversion = (
        convert_floating or convert_integer or convert_boolean or convert_string
    )
    if not wants_conversion:
        # Nothing to cast: hand back (possibly inferred) copies only.
        return [candidate.copy(deep=copy) for candidate in candidates]

    result = []
    for blk in candidates:
        # The target dtype is determined column by column, so a
        # multi-column 2D block is split into single-column pieces first.
        if blk.ndim == 1 or self.shape[0] == 1:
            pieces = [blk]
        else:
            pieces = blk._split()

        targets = [
            convert_dtypes(
                piece.values,
                convert_string,
                convert_integer,
                convert_boolean,
                convert_floating,
                infer_objects,
                dtype_backend,
            )
            for piece in pieces
        ]

        if all(target == self.dtype for target in targets):
            # No dtype changes: keep the block whole instead of emitting
            # one block per column.
            result.append(blk.copy(deep=copy))
        else:
            result.extend(
                piece.astype(dtype=target, copy=copy, squeeze=piece.ndim != 1)
                for target, piece in zip(targets, pieces)
            )
    return result

# ---------------------------------------------------------------------
# Array-Like Methods

Expand All @@ -651,6 +699,7 @@ def astype(
copy: bool = False,
errors: IgnoreRaise = "raise",
using_cow: bool = False,
squeeze: bool = False,
) -> Block:
"""
Coerce to the new dtype.
Expand All @@ -665,12 +714,18 @@ def astype(
- ``ignore`` : suppress exceptions. On error return original object
using_cow: bool, default False
Signaling if copy on write copy logic is used.
squeeze : bool, default False
squeeze values to ndim=1 if only one column is given

Returns
-------
Block
"""
values = self.values
if squeeze and values.ndim == 2:
if values.shape[0] != 1:
raise ValueError("Can not squeeze with more than one column.")
values = values[0, :] # type: ignore[call-overload]

new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)

Expand Down
10 changes: 10 additions & 0 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,16 @@ def convert(self, copy: bool | None) -> Self:

return self.apply("convert", copy=copy, using_cow=using_copy_on_write())

def convert_dtypes(self, **kwargs):
    """
    Run the ``"convert_dtypes"`` operation through ``self.apply``.

    ``copy`` is passed as the negation of the copy-on-write setting and
    ``using_cow`` as the setting itself; all other keyword arguments are
    forwarded unchanged.
    """
    cow_enabled = using_copy_on_write()
    return self.apply(
        "convert_dtypes", copy=not cow_enabled, using_cow=cow_enabled, **kwargs
    )

def get_values_for_csv(
self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None
) -> Self:
Expand Down
35 changes: 0 additions & 35 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@
from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.cast import (
LossySetitemError,
convert_dtypes,
maybe_box_native,
maybe_cast_pointwise_result,
)
Expand Down Expand Up @@ -167,7 +166,6 @@
CorrelationMethod,
DropKeep,
Dtype,
DtypeBackend,
DtypeObj,
FilePath,
Frequency,
Expand Down Expand Up @@ -5556,39 +5554,6 @@ def between(

return lmask & rmask

# ----------------------------------------------------------------------
# Convert to types that support pd.NA

def _convert_dtypes(
self,
infer_objects: bool = True,
convert_string: bool = True,
convert_integer: bool = True,
convert_boolean: bool = True,
convert_floating: bool = True,
dtype_backend: DtypeBackend = "numpy_nullable",
) -> Series:
input_series = self
if infer_objects:
input_series = input_series.infer_objects()
if is_object_dtype(input_series.dtype):
input_series = input_series.copy(deep=None)

if convert_string or convert_integer or convert_boolean or convert_floating:
inferred_dtype = convert_dtypes(
input_series._values,
convert_string,
convert_integer,
convert_boolean,
convert_floating,
infer_objects,
dtype_backend,
)
result = input_series.astype(inferred_dtype)
else:
result = input_series.copy(deep=None)
return result

# error: Cannot determine type of 'isna'
@doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type]
def isna(self) -> Series:
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/frame/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,17 @@ def test_convert_dtypes_pyarrow_timestamp(self):
expected = ser.astype("timestamp[ms][pyarrow]")
result = expected.convert_dtypes(dtype_backend="pyarrow")
tm.assert_series_equal(result, expected)

def test_convert_dtypes_avoid_block_splitting(self):
# GH#55341
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"})
result = df.convert_dtypes(convert_integer=False)
expected = pd.DataFrame(
{
"a": [1, 2, 3],
"b": [4, 5, 6],
"c": pd.Series(["a"] * 3, dtype="string[python]"),
}
)
tm.assert_frame_equal(result, expected)
assert result._mgr.nblocks == 2