Skip to content

PERF: improve access of .array #31037

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jan 24, 2020
3 changes: 1 addition & 2 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ class PandasDtype(ExtensionDtype):
def __init__(self, dtype):
dtype = np.dtype(dtype)
self._dtype = dtype
self._name = dtype.name
self._type = dtype.type

def __repr__(self) -> str:
Expand All @@ -56,7 +55,7 @@ def numpy_dtype(self):

@property
def name(self):
return self._name
return self._dtype.name

@property
def type(self):
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/arrays/sparse/scipy_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ def _check_is_partition(parts, whole):


def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False):
""" For arbitrary (MultiIndexed) SparseSeries return
""" For arbitrary (MultiIndexed) sparse Series return
(v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for
passing to scipy.sparse.coo constructor. """
# index and column levels must be a partition of the index
_check_is_partition([row_levels, column_levels], range(ss.index.nlevels))

# from the SparseSeries: get the labels and data for non-null entries
values = ss._data.internal_values()._valid_sp_values
# from the sparse Series: get the labels and data for non-null entries
values = ss.array._valid_sp_values

nonnull_labels = ss.dropna()

Expand Down Expand Up @@ -85,7 +85,7 @@ def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):

def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False):
"""
Convert a SparseSeries to a scipy.sparse.coo_matrix using index
Convert a sparse Series to a scipy.sparse.coo_matrix using index
levels row_levels, column_levels as the row and column
labels respectively. Returns the sparse_matrix, row and column labels.
"""
Expand Down
23 changes: 1 addition & 22 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,11 @@
from pandas.core.dtypes.cast import is_nested_object
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_datetime64_ns_dtype,
is_dict_like,
is_extension_array_dtype,
is_list_like,
is_object_dtype,
is_scalar,
is_timedelta64_ns_dtype,
needs_i8_conversion,
)
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
Expand Down Expand Up @@ -747,26 +745,7 @@ def array(self) -> ExtensionArray:
[a, b, a]
Categories (2, object): [a, b]
"""
# As a mixin, we depend on the mixing class having _values.
# Special mixin syntax may be developed in the future:
# https://github.com/python/typing/issues/246
result = self._values # type: ignore

if is_datetime64_ns_dtype(result.dtype):
from pandas.arrays import DatetimeArray

result = DatetimeArray(result)
elif is_timedelta64_ns_dtype(result.dtype):
from pandas.arrays import TimedeltaArray

result = TimedeltaArray(result)

elif not is_extension_array_dtype(result.dtype):
from pandas.core.arrays.numpy_ import PandasArray

result = PandasArray(result)

return result
raise AbstractMethodError(self)

def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs):
"""
Expand Down
21 changes: 16 additions & 5 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3639,26 +3639,37 @@ def values(self):
"""
return self._data.view(np.ndarray)

@Appender(IndexOpsMixin.array.__doc__) # type: ignore
@property
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this could be a cached_property I think?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the Index is immutable but not the underlying, hmm.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the Index is immutable but not the underlying, hmm.

That has always been the case for Index?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We fixed the issue with cache_readonly docs not working right? If so, +1 for using it as long as we're micro-optimizing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

when putting them in the order of

    @cache_readonly
    @Appender(IndexOpsMixin.array.__doc__)  # type: ignore
    def array(self) -> ExtensionArray:

it seems to work

def array(self) -> ExtensionArray:
array = self._data
if isinstance(array, np.ndarray):
from pandas.core.arrays.numpy_ import PandasArray

array = PandasArray(array)
return array

@property
def _values(self) -> Union[ExtensionArray, ABCIndexClass, np.ndarray]:
# TODO(EA): remove index types as they become extension arrays
"""
The best array representation.

This is an ndarray, ExtensionArray, or Index subclass. This differs
This is an ndarray or ExtensionArray. This differs
from ``_ndarray_values``, which always returns an ndarray.

Both ``_values`` and ``_ndarray_values`` are consistent between
``Series`` and ``Index``.
``Series`` and ``Index`` (except for datetime64[ns], which returns
a DatetimeArray for _values on the Index, but ndarray[M8ns] on the
Series).

It may differ from the public '.values' method.

index | values | _values | _ndarray_values |
----------------- | --------------- | ------------- | --------------- |
Index | ndarray | ndarray | ndarray |
CategoricalIndex | Categorical | Categorical | ndarray[int] |
DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] |
DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] |
DatetimeIndex | ndarray[M8ns] | DatetimeArray | ndarray[M8ns] |
DatetimeIndex[tz] | ndarray[M8ns] | DatetimeArray | ndarray[M8ns] |
PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] |
IntervalIndex | IntervalArray | IntervalArray | ndarray[object] |

Expand Down
4 changes: 0 additions & 4 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,10 +405,6 @@ def values(self):
"""
return self._data

@cache_readonly
def _values(self):
return self._data

def __array_wrap__(self, result, context=None):
# we don't want the superclass implementation
return result
Expand Down
45 changes: 37 additions & 8 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,14 @@
)

import pandas.core.algorithms as algos
from pandas.core.arrays import Categorical, DatetimeArray, PandasDtype, TimedeltaArray
from pandas.core.arrays import (
Categorical,
DatetimeArray,
ExtensionArray,
PandasArray,
PandasDtype,
TimedeltaArray,
)
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.construction import extract_array
Expand Down Expand Up @@ -192,16 +199,29 @@ def is_categorical_astype(self, dtype):

return False

def external_values(self, dtype=None):
""" return an outside world format, currently just the ndarray """
def external_values(self):
"""
The array that Series.values returns (public attribute).

This has some historical constraints, and is overridden in block
subclasses to return the correct array (e.g. period returns
object ndarray and datetimetz a datetime64[ns] ndarray instead of
proper extension array).
"""
return self.values

def internal_values(self, dtype=None):
""" return an internal format, currently just the ndarray
this should be the pure internal API format
def internal_values(self):
"""
The array that Series._values returns (internal values).
"""
return self.values

def array_values(self) -> ExtensionArray:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need external_values at all? this is getting very confusing with all of the values

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need external_values at all? this is getting very confusing with all of the values

I clarified the docstrings (in this PR, but split off and already merged in #31103) to make it clear what external_values (-> .values), internal_values (-> _values) and array_values (-> .array) are used for. As long as those all have different behaviour (something that can be gradually cleaned up, but not here), we need a mechanism to create those different results.

"""
The array that Series.array returns. Always an ExtensionArray.
"""
return PandasArray(self.values)

def get_values(self, dtype=None):
"""
return an internal format, currently just the ndarray
Expand Down Expand Up @@ -1771,6 +1791,9 @@ def get_values(self, dtype=None):
values = values.reshape((1,) + values.shape)
return values

def array_values(self) -> ExtensionArray:
return self.values

def to_dense(self):
return np.asarray(self.values)

Expand Down Expand Up @@ -1966,7 +1989,7 @@ class ObjectValuesExtensionBlock(ExtensionBlock):
Series[T].values is an ndarray of objects.
"""

def external_values(self, dtype=None):
def external_values(self):
return self.values.astype(object)


Expand Down Expand Up @@ -2228,6 +2251,9 @@ def set(self, locs, values):
def external_values(self):
return np.asarray(self.values.astype("datetime64[ns]", copy=False))

def array_values(self) -> ExtensionArray:
return DatetimeArray._simple_new(self.values)


class DatetimeTZBlock(ExtensionBlock, DatetimeBlock):
""" implement a datetime64 block with a tz attribute """
Expand Down Expand Up @@ -2482,9 +2508,12 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs):
)
return rvalues

def external_values(self, dtype=None):
def external_values(self):
return np.asarray(self.values.astype("timedelta64[ns]", copy=False))

def array_values(self) -> ExtensionArray:
return TimedeltaArray._simple_new(self.values)


class BoolBlock(NumericBlock):
__slots__ = ()
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1560,9 +1560,11 @@ def get_dtypes(self):
return np.array([self._block.dtype])

def external_values(self):
"""The array that Series.values returns"""
return self._block.external_values()

def internal_values(self):
"""The array that Series._values returns"""
return self._block.internal_values()

def get_values(self):
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,11 @@ def _values(self):
"""
return self._data.internal_values()

@Appender(base.IndexOpsMixin.array.__doc__) # type: ignore
@property
def array(self) -> ExtensionArray:
return self._data._block.array_values()

def _internal_get_values(self):
"""
Same as values (but handles sparseness conversions); is a view.
Expand Down