Skip to content

REF: NDFrame describe #36833

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 12 commits into from
159 changes: 10 additions & 149 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,7 @@
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError, InvalidIndexError
from pandas.util._decorators import doc, rewrite_axis_style_signature
from pandas.util._validators import (
validate_bool_kwarg,
validate_fillna_kwargs,
validate_percentile,
)
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs

from pandas.core.dtypes.common import (
ensure_int64,
Expand Down Expand Up @@ -109,11 +105,8 @@
from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window

from pandas.io.formats import format as fmt
from pandas.io.formats.format import (
DataFrameFormatter,
DataFrameRenderer,
format_percentiles,
)
from pandas.io.formats.describe import describe_ndframe
from pandas.io.formats.format import DataFrameFormatter, DataFrameRenderer
from pandas.io.formats.printing import pprint_thing

if TYPE_CHECKING:
Expand Down Expand Up @@ -10237,145 +10230,13 @@ def describe(
75% NaN 2.5
max NaN 3.0
"""
if self.ndim == 2 and self.columns.size == 0:
raise ValueError("Cannot describe a DataFrame without columns")

if percentiles is not None:
# explicit conversion of `percentiles` to list
percentiles = list(percentiles)

# get them all to be in [0, 1]
validate_percentile(percentiles)

# median should always be included
if 0.5 not in percentiles:
percentiles.append(0.5)
percentiles = np.asarray(percentiles)
else:
percentiles = np.array([0.25, 0.5, 0.75])

# sort and check for duplicates
unique_pcts = np.unique(percentiles)
if len(unique_pcts) < len(percentiles):
raise ValueError("percentiles cannot contain duplicates")
percentiles = unique_pcts

formatted_percentiles = format_percentiles(percentiles)

def describe_numeric_1d(series) -> "Series":
    """
    Summarise a numeric Series.

    The result holds count, mean, std and min, followed by one entry per
    requested percentile, followed by max, and carries the name of the
    input Series.
    """
    # ``percentiles`` and ``formatted_percentiles`` are taken from the
    # enclosing scope: the former are the values handed to ``quantile``,
    # the latter the matching row labels.
    labels = ["count", "mean", "std", "min", *formatted_percentiles, "max"]
    values = [
        series.count(),
        series.mean(),
        series.std(),
        series.min(),
        *series.quantile(percentiles).tolist(),
        series.max(),
    ]
    return pd.Series(values, index=labels, name=series.name)

# Summarise non-numeric data as a Series with rows "count", "unique",
# "top" and "freq"; datetime data additionally gets "first" and "last"
# (and triggers a FutureWarning, see below).
def describe_categorical_1d(data) -> "Series":
names = ["count", "unique"]
objcounts = data.value_counts()
# Only values that actually occur (non-zero count) are treated as unique.
count_unique = len(objcounts[objcounts != 0])
result = [data.count(), count_unique]
# dtype stays None (inferred by the Series constructor) unless the
# "no unique values" branch below forces "object".
dtype = None
if result[1] > 0:
# "top"/"freq" are the first entry of value_counts() and its count
# NOTE(review): relies on value_counts() returning the most frequent
# value first -- confirm against the value_counts documentation.
top, freq = objcounts.index[0], objcounts.iloc[0]
if is_datetime64_any_dtype(data.dtype):
# The stacklevel differs between the Series (ndim == 1) and the
# DataFrame call path.
# NOTE(review): presumably chosen so the warning is attributed to the
# user's ``.describe`` call; any change to the call depth of this
# helper would require adjusting these numbers.
if self.ndim == 1:
stacklevel = 4
else:
stacklevel = 5
warnings.warn(
"Treating datetime data as categorical rather than numeric in "
"`.describe` is deprecated and will be removed in a future "
"version of pandas. Specify `datetime_is_numeric=True` to "
"silence this warning and adopt the future behavior now.",
FutureWarning,
stacklevel=stacklevel,
)
tz = data.dt.tz
# Non-null values viewed as int64 so that min/max can be taken on the
# raw integers and turned back into Timestamps below.
asint = data.dropna().values.view("i8")
top = Timestamp(top)
if top.tzinfo is not None and tz is not None:
# Don't tz_localize(None) if key is already tz-aware
top = top.tz_convert(tz)
else:
top = top.tz_localize(tz)
names += ["top", "freq", "first", "last"]
result += [
top,
freq,
Timestamp(asint.min(), tz=tz),
Timestamp(asint.max(), tz=tz),
]
else:
names += ["top", "freq"]
result += [top, freq]

# If there are no unique values (e.g. the data is empty), set 'top'
# and 'freq' to NaN to maintain output shape consistency
else:
names += ["top", "freq"]
result += [np.nan, np.nan]
# Force object dtype for this all-placeholder result.
dtype = "object"

return pd.Series(result, index=names, name=data.name, dtype=dtype)

def describe_timestamp_1d(data) -> "Series":
    """
    Summarise datetime data the numeric way (GH-30164).

    The result holds count, mean and min, followed by one entry per
    requested percentile, followed by max. Unlike the numeric summary
    there is no "std" row.
    """
    # Percentile values and their labels come from the enclosing scope.
    labels = ["count", "mean", "min", *formatted_percentiles, "max"]
    values = [
        data.count(),
        data.mean(),
        data.min(),
        *data.quantile(percentiles).tolist(),
        data.max(),
    ]
    return pd.Series(values, index=labels, name=data.name)

def describe_1d(data) -> "Series":
    """
    Pick the summary routine that matches the dtype of ``data``.

    Checks run in this order:

    * bool        -> categorical summary
    * numeric     -> numeric summary
    * datetime    -> timestamp summary, but only when the enclosing
      ``datetime_is_numeric`` flag is set
    * timedelta   -> numeric summary
    * anything else (including datetimes when the flag is unset)
      -> categorical summary
    """
    # Bool is tested before the numeric check so that boolean data is
    # always routed to the categorical summary.
    if is_bool_dtype(data.dtype):
        return describe_categorical_1d(data)
    if is_numeric_dtype(data):
        return describe_numeric_1d(data)
    if is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
        return describe_timestamp_1d(data)
    if is_timedelta64_dtype(data.dtype):
        return describe_numeric_1d(data)
    return describe_categorical_1d(data)

if self.ndim == 1:
# Incompatible return value type
# (got "Series", expected "FrameOrSeries") [return-value]
return describe_1d(self) # type:ignore[return-value]
elif (include is None) and (exclude is None):
# when some numerics are found, keep only numerics
default_include = [np.number]
if datetime_is_numeric:
default_include.append("datetime")
data = self.select_dtypes(include=default_include)
if len(data.columns) == 0:
data = self
elif include == "all":
if exclude is not None:
msg = "exclude must be None when include is 'all'"
raise ValueError(msg)
data = self
else:
data = self.select_dtypes(include=include, exclude=exclude)

ldesc = [describe_1d(s) for _, s in data.items()]
# set a convenient order for rows
names: List[Label] = []
ldesc_indexes = sorted((x.index for x in ldesc), key=len)
for idxnames in ldesc_indexes:
for name in idxnames:
if name not in names:
names.append(name)

d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
d.columns = data.columns.copy()
return d
return describe_ndframe(
data=self,
include=include,
exclude=exclude,
datetime_is_numeric=datetime_is_numeric,
percentiles=percentiles,
)

@final
def pct_change(
Expand Down
Loading