Skip to content

REF: implement make_na_array #43606

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Sep 16, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 46 additions & 52 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@

import numpy as np

from pandas._libs import internals as libinternals
from pandas._libs import (
NaT,
internals as libinternals,
)
from pandas._typing import (
ArrayLike,
DtypeObj,
Expand Down Expand Up @@ -382,59 +385,21 @@ def is_na(self) -> bool:
return True
return False

def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
def get_reindexed_values(self, empty_dtype: DtypeObj) -> ArrayLike:
values: ArrayLike

if upcasted_na is None and not self.is_na:
# No upcasting is necessary
fill_value = self.block.fill_value
values = self.block.get_values()
if self.is_na:
return make_na_array(empty_dtype, self.shape)

else:
fill_value = upcasted_na

if self.is_na:

if is_datetime64tz_dtype(empty_dtype):
i8values = np.full(self.shape, fill_value.value)
return DatetimeArray(i8values, dtype=empty_dtype)

elif is_1d_only_ea_dtype(empty_dtype):
empty_dtype = cast(ExtensionDtype, empty_dtype)
cls = empty_dtype.construct_array_type()

missing_arr = cls._from_sequence([], dtype=empty_dtype)
ncols, nrows = self.shape
assert ncols == 1, ncols
empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
return missing_arr.take(
empty_arr, allow_fill=True, fill_value=fill_value
)
elif isinstance(empty_dtype, ExtensionDtype):
# TODO: no tests get here, a handful would if we disabled
# the dt64tz special-case above (which is faster)
cls = empty_dtype.construct_array_type()
missing_arr = cls._empty(shape=self.shape, dtype=empty_dtype)
missing_arr[:] = fill_value
return missing_arr
else:
# NB: we should never get here with empty_dtype integer or bool;
# if we did, the missing_arr.fill would cast to gibberish
missing_arr = np.empty(self.shape, dtype=empty_dtype)
missing_arr.fill(fill_value)
return missing_arr

if (not self.indexers) and (not self.block._can_consolidate):
# preserve these for validation in concat_compat
return self.block.values

if self.block.is_bool:
# External code requested filling/upcasting, bool values must
# be upcasted to object to avoid being upcasted to numeric.
values = self.block.astype(np.object_).values
else:
# No dtype upcasting is done here, it will be performed during
# concatenation itself.
values = self.block.values
# No dtype upcasting is done here, it will be performed during
# concatenation itself.
values = self.block.values

if not self.indexers:
# If there's no indexing to be done, we want to signal outside
Expand All @@ -449,6 +414,40 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
return values


def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike:
    """
    Build an all-NA array of the requested dtype and shape.

    Parameters
    ----------
    dtype : DtypeObj
        Target dtype of the result.
    shape : Shape
        Shape of the result. For 1D-only extension dtypes only the last
        entry is used (as the length of the returned array).

    Returns
    -------
    np.ndarray or ExtensionArray
        An array in which every element is the missing-value marker for
        ``dtype``.
    """
    # Review note carried over from the PR discussion: the helper at
    # https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/cast.py#L1850
    # is somewhat similar in function (though not NA-specific).

    if is_datetime64tz_dtype(dtype):
        # Fast path for tz-aware datetimes: fill the i8 backing data with
        # NaT's integer value.  NaT plays the role that dtype.na_value plays
        # in the extension-dtype branches below.
        nat_i8 = np.full(shape, NaT.value)
        return DatetimeArray(nat_i8, dtype=dtype)

    if is_1d_only_ea_dtype(dtype):
        ea_dtype = cast(ExtensionDtype, dtype)
        array_type = ea_dtype.construct_array_type()

        # Start from an empty array and "take" position -1 for every row;
        # with allow_fill=True, -1 means "insert fill_value here".
        empty = array_type._from_sequence([], dtype=ea_dtype)
        length = shape[-1]
        indexer = np.full((length,), -1, dtype=np.intp)
        return empty.take(indexer, allow_fill=True, fill_value=ea_dtype.na_value)

    if isinstance(dtype, ExtensionDtype):
        # TODO: no tests get here, a handful would if we disabled
        #  the dt64tz special-case above (which is faster)
        array_type = dtype.construct_array_type()
        result = array_type._empty(shape=shape, dtype=dtype)
        result[:] = dtype.na_value
        return result

    # Plain numpy dtype.
    # NB: we should never get here with dtype integer or bool;
    #  if we did, the fill below would cast to gibberish
    result = np.empty(shape, dtype=dtype)
    result.fill(_dtype_to_na_value(dtype))
    return result


def _concatenate_join_units(
join_units: list[JoinUnit], concat_axis: int, copy: bool
) -> ArrayLike:
Expand All @@ -461,12 +460,7 @@ def _concatenate_join_units(

empty_dtype = _get_empty_dtype(join_units)

upcasted_na = _dtype_to_na_value(empty_dtype)

to_concat = [
ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
for ju in join_units
]
to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype) for ju in join_units]

if len(to_concat) == 1:
# Only one block, nothing to concatenate.
Expand Down