Skip to content

Add mean keyword to dpnp.std and dpnp.var #2271

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/build-sphinx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ jobs:
PR_NUM: ${{ github.event.number }}
uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2.8.2
with:
message-id: url_to_docs
message: |
View rendered docs @ https://intelpython.github.io/dpnp/pull/${{ env.PR_NUM }}/index.html
allow-repeats: false
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/conda-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -600,9 +600,9 @@ jobs:
if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork }}
uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2.8.2
with:
message-id: array_api_results
message: |
${{ env.MESSAGE }}
refresh-message-position: true

cleanup_packages:
name: Clean up anaconda packages
Expand Down
10 changes: 8 additions & 2 deletions dpnp/dpnp_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1731,6 +1731,7 @@ def std(
keepdims=False,
*,
where=True,
mean=None,
):
"""
Returns the standard deviation of the array elements, along given axis.
Expand All @@ -1739,7 +1740,9 @@ def std(

"""

return dpnp.std(self, axis, dtype, out, ddof, keepdims, where=where)
return dpnp.std(
self, axis, dtype, out, ddof, keepdims, where=where, mean=mean
)

@property
def strides(self):
Expand Down Expand Up @@ -1938,6 +1941,7 @@ def var(
keepdims=False,
*,
where=True,
mean=None,
):
"""
Returns the variance of the array elements, along given axis.
Expand All @@ -1946,7 +1950,9 @@ def var(

"""

return dpnp.var(self, axis, dtype, out, ddof, keepdims, where=where)
return dpnp.var(
self, axis, dtype, out, ddof, keepdims, where=where, mean=mean
)


# 'view'
197 changes: 141 additions & 56 deletions dpnp/dpnp_iface_nanfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@

"""

# pylint: disable=duplicate-code

import warnings

import dpnp
Expand Down Expand Up @@ -955,7 +957,15 @@ def nansum(


def nanstd(
a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, *, where=True
a,
axis=None,
dtype=None,
out=None,
ddof=0,
keepdims=False,
*,
where=True,
mean=None,
):
"""
Compute the standard deviation along the specified axis,
Expand All @@ -969,40 +979,52 @@ def nanstd(
Input array.
axis : {None, int, tuple of ints}, optional
Axis or axes along which the standard deviations must be computed.
If a tuple of unique integers is given, the standard deviations
are computed over multiple axes. If ``None``, the standard deviation
is computed over the entire array.
If a tuple of unique integers is given, the standard deviations are
computed over multiple axes. If ``None``, the standard deviation is
computed over the entire array.

Default: ``None``.
dtype : {None, dtype}, optional
Type to use in computing the standard deviation. By default,
if `a` has a floating-point data type, the returned array
will have the same data type as `a`.
If `a` has a boolean or integral data type, the returned array
will have the default floating point data type for the device
Type to use in computing the standard deviation. By default, if `a` has
a floating-point data type, the returned array will have the same data
type as `a`. If `a` has a boolean or integral data type, the returned
array will have the default floating point data type for the device
where input array `a` is allocated.

Default: ``None``.
out : {None, dpnp.ndarray, usm_ndarray}, optional
Alternative output array in which to place the result. It must have
the same shape as the expected output but the type (of the calculated
values) will be cast if necessary.

Default: ``None``.
ddof : {int, float}, optional
Means Delta Degrees of Freedom. The divisor used in calculations
is ``N - ddof``, where ``N`` the number of non-NaN elements.
Default: `0.0`.
Means Delta Degrees of Freedom. The divisor used in calculations is
``N - ddof``, where ``N`` is the number of non-NaN elements.

Default: ``0.0``.
keepdims : {None, bool}, optional
If ``True``, the reduced axes (dimensions) are included in the result
as singleton dimensions, so that the returned array remains
compatible with the input array according to Array Broadcasting
rules. Otherwise, if ``False``, the reduced axes are not included in
the returned array. Default: ``False``.
as singleton dimensions, so that the returned array remains compatible
with the input array according to Array Broadcasting rules. Otherwise,
if ``False``, the reduced axes are not included in the returned array.

Default: ``False``.
mean : {dpnp.ndarray, usm_ndarray}, optional
Provide the mean to prevent its recalculation. The mean should have
a shape as if it was calculated with ``keepdims=True``.
The axis for the calculation of the mean should be the same as used in
the call to this `nanstd` function.

Default: ``None``.

Returns
-------
out : dpnp.ndarray
An array containing the standard deviations. If the standard
deviation was computed over the entire array, a zero-dimensional
array is returned. If `ddof` is >= the number of non-NaN elements
in a slice or the slice contains only NaNs, then the result for
that slice is NaN.
An array containing the standard deviations. If the standard deviation
was computed over the entire array, a zero-dimensional array is
returned. If `ddof` is >= the number of non-NaN elements in a slice or
the slice contains only NaNs, then the result for that slice is NaN.

Limitations
-----------
Expand All @@ -1011,6 +1033,19 @@ def nanstd(

Notes
-----
The standard deviation is the square root of the average of the squared
deviations from the mean: ``std = sqrt(mean(abs(x - x.mean())**2))``.

The average squared deviation is normally calculated as ``x.sum() / N``,
where ``N = len(x)``. If, however, `ddof` is specified, the divisor
``N - ddof`` is used instead. In standard statistical practice, ``ddof=1``
provides an unbiased estimator of the variance of the infinite population.
``ddof=0`` provides a maximum likelihood estimate of the variance for
normally distributed variables.
The standard deviation computed in this function is the square root of
the estimated variance, so even with ``ddof=1``, it will not be an unbiased
estimate of the standard deviation per se.

Note that, for complex numbers, the absolute value is taken before
squaring, so that the result is always real and non-negative.

Expand All @@ -1029,11 +1064,18 @@ def nanstd(
>>> import dpnp as np
>>> a = np.array([[1, np.nan], [3, 4]])
>>> np.nanstd(a)
array(1.247219128924647)
array(1.24721913)
>>> np.nanstd(a, axis=0)
array([1., 0.])
array([1., 0.])
>>> np.nanstd(a, axis=1)
array([0., 0.5]) # may vary
array([0. , 0.5]) # may vary

Using the mean keyword to save computation time:

>>> a = np.array([[14, 8, np.nan, 10], [7, 9, 10, 11], [np.nan, 15, 5, 10]])
>>> mean = np.nanmean(a, axis=1, keepdims=True)
>>> np.nanstd(a, axis=1, mean=mean)
array([2.49443826, 1.47901995, 4.0824829 ])

"""

Expand All @@ -1051,13 +1093,21 @@ def nanstd(
ddof=ddof,
keepdims=keepdims,
where=where,
mean=mean,
)
dpnp.sqrt(res, out=res)
return res
return dpnp.sqrt(res, out=res)


def nanvar(
a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, *, where=True
a,
axis=None,
dtype=None,
out=None,
ddof=0,
keepdims=False,
*,
where=True,
mean=None,
):
"""
Compute the variance along the specified axis, while ignoring NaNs.
Expand All @@ -1069,39 +1119,52 @@ def nanvar(
a : {dpnp.ndarray, usm_ndarray}
Input array.
axis : {None, int, tuple of ints}, optional
axis or axes along which the variances must be computed. If a tuple
Axis or axes along which the variances must be computed. If a tuple
of unique integers is given, the variances are computed over multiple
axes. If ``None``, the variance is computed over the entire array.

Default: ``None``.
dtype : {None, dtype}, optional
Type to use in computing the variance. By default, if `a` has a
floating-point data type, the returned array will have
the same data type as `a`.
If `a` has a boolean or integral data type, the returned array
will have the default floating point data type for the device
where input array `a` is allocated.
the same data type as `a`. If `a` has a boolean or integral data type,
the returned array will have the default floating point data type for
the device where input array `a` is allocated.

Default: ``None``.
out : {None, dpnp.ndarray, usm_ndarray}, optional
Alternative output array in which to place the result. It must have
the same shape as the expected output but the type (of the calculated
values) will be cast if necessary.

Default: ``None``.
ddof : {int, float}, optional
Means Delta Degrees of Freedom. The divisor used in calculations
is ``N - ddof``, where ``N`` represents the number of non-NaN elements.
Default: `0.0`.
Means Delta Degrees of Freedom. The divisor used in calculations is
``N - ddof``, where ``N`` represents the number of non-NaN elements.

Default: ``0.0``.
keepdims : {None, bool}, optional
If ``True``, the reduced axes (dimensions) are included in the result
as singleton dimensions, so that the returned array remains
compatible with the input array according to Array Broadcasting
rules. Otherwise, if ``False``, the reduced axes are not included in
the returned array. Default: ``False``.
as singleton dimensions, so that the returned array remains compatible
with the input array according to Array Broadcasting rules. Otherwise,
if ``False``, the reduced axes are not included in the returned array.

Default: ``False``.
mean : {dpnp.ndarray, usm_ndarray}, optional
Provide the mean to prevent its recalculation. The mean should have
a shape as if it was calculated with ``keepdims=True``.
The axis for the calculation of the mean should be the same as used in
the call to this `nanvar` function.

Default: ``None``.

Returns
-------
out : dpnp.ndarray
An array containing the variances. If the variance was computed
over the entire array, a zero-dimensional array is returned.
If `ddof` is >= the number of non-NaN elements in a slice or the
slice contains only NaNs, then the result for that slice is NaN.
An array containing the variances. If the variance was computed over
the entire array, a zero-dimensional array is returned. If `ddof` is >=
the number of non-NaN elements in a slice or the slice contains only
NaNs, then the result for that slice is NaN.

Limitations
-----------
Expand All @@ -1110,6 +1173,16 @@ def nanvar(

Notes
-----
The variance is the average of the squared deviations from the mean,
that is ``var = mean(abs(x - x.mean())**2)``.

The mean is normally calculated as ``x.sum() / N``, where ``N = len(x)``.
If, however, `ddof` is specified, the divisor ``N - ddof`` is used instead.
In standard statistical practice, ``ddof=1`` provides an unbiased estimator
of the variance of a hypothetical infinite population. ``ddof=0`` provides
a maximum likelihood estimate of the variance for normally distributed
variables.

Note that, for complex numbers, the absolute value is taken before squaring,
so that the result is always real and non-negative.

Expand All @@ -1127,11 +1200,18 @@ def nanvar(
>>> import dpnp as np
>>> a = np.array([[1, np.nan], [3, 4]])
>>> np.nanvar(a)
array(1.5555555555555554)
array(1.55555556)
>>> np.nanvar(a, axis=0)
array([1., 0.])
array([1., 0.])
>>> np.nanvar(a, axis=1)
array([0., 0.25]) # may vary
array([0. , 0.25]) # may vary

Using the mean keyword to save computation time:

>>> a = np.array([[14, 8, np.nan, 10], [7, 9, 10, 11], [np.nan, 15, 5, 10]])
>>> mean = np.nanmean(a, axis=1, keepdims=True)
>>> np.nanvar(a, axis=1, mean=mean)
array([ 6.22222222, 2.1875 , 16.66666667])

"""

Expand All @@ -1157,46 +1237,51 @@ def nanvar(
dtype = dpnp.dtype(dtype)
if not dpnp.issubdtype(dtype, dpnp.inexact):
raise TypeError("If input is inexact, then dtype must be inexact.")

if out is not None:
dpnp.check_supported_arrays_type(out)
if not dpnp.issubdtype(out.dtype, dpnp.inexact):
raise TypeError("If input is inexact, then out must be inexact.")

# Compute mean
var_dtype = a.real.dtype if dtype is None else dtype
cnt = dpnp.sum(
~mask, axis=axis, dtype=var_dtype, keepdims=True, where=where
~mask, axis=axis, dtype=dpnp.intp, keepdims=True, where=where
)
avg = dpnp.sum(arr, axis=axis, dtype=dtype, keepdims=True, where=where)
avg = dpnp.divide(avg, cnt, out=avg)

# Compute squared deviation from mean.
if mean is not None:
avg = mean
else:
avg = dpnp.sum(arr, axis=axis, dtype=dtype, keepdims=True, where=where)
avg = dpnp.divide(avg, cnt, out=avg)

# Compute squared deviation from mean
if arr.dtype == avg.dtype:
arr = dpnp.subtract(arr, avg, out=arr)
else:
arr = dpnp.subtract(arr, avg)
dpnp.copyto(arr, 0.0, where=mask)

if dpnp.issubdtype(arr.dtype, dpnp.complexfloating):
sqr = dpnp.multiply(arr, arr.conj(), out=arr).real
else:
sqr = dpnp.multiply(arr, arr, out=arr)
sqr = dpnp.square(arr, out=arr)

# Compute variance
var = dpnp.sum(
sqr,
axis=axis,
dtype=var_dtype,
dtype=dtype,
out=out,
keepdims=keepdims,
where=where,
)

if var.ndim < cnt.ndim:
cnt = cnt.squeeze(axis)
cnt -= ddof
dpnp.divide(var, cnt, out=var)
dof = cnt - ddof
dpnp.divide(var, dof, out=var)

isbad = cnt <= 0
isbad = dof <= 0
if dpnp.any(isbad):
# NaN, inf, or negative numbers are all possible bad
# values, so explicitly replace them with NaN.
Expand Down
Loading
Loading