
PERF: optimized median func when bottleneck not present #16509

Merged: 18 commits, Jan 22, 2018
Changes from 15 commits
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.23.0.txt
@@ -379,6 +379,8 @@ Performance Improvements
- Improved performance of :func:`IntervalIndex.symmetric_difference()` (:issue:`18475`)
- Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`)
- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)


.. _whatsnew_0230.docs:

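To make the whatsnew entry concrete, here is a minimal benchmark sketch; the frame shape and repeat count are arbitrary illustrative choices, not numbers taken from this PR.

import numpy as np
import pandas as pd
from timeit import timeit

# Illustrative only: any moderately wide float frame will do.
df = pd.DataFrame(np.random.randn(10000, 100))

# Without bottleneck installed, the skipna median along axis=1 now goes
# through np.nanmedian on the whole block instead of a per-row helper.
elapsed = timeit(lambda: df.median(axis=1), number=20)
print("median(axis=1), 20 runs: %.3fs" % elapsed)
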
13 changes: 0 additions & 13 deletions pandas/_libs/algos.pxd

This file was deleted.

51 changes: 1 addition & 50 deletions pandas/_libs/algos.pyx
@@ -8,8 +8,6 @@ from cython cimport Py_ssize_t

np.import_array()

cdef float64_t FP_ERR = 1e-13

cimport util

from libc.stdlib cimport malloc, free
@@ -24,6 +22,7 @@ from numpy cimport (ndarray,
double_t)


cdef float64_t FP_ERR = 1e-13
cdef double NaN = <double> np.NaN
cdef double nan = NaN

@@ -166,54 +165,6 @@ def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups):
return result, counts


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil:
cdef:
Py_ssize_t i, j, l, m, n = a.shape[0]
numeric x

with nogil:
l = 0
m = n - 1

while l < m:
x = a[k]
i = l
j = m

while 1:
while a[i] < x: i += 1
while x < a[j]: j -= 1
if i <= j:
swap(&a[i], &a[j])
i += 1; j -= 1

if i > j: break

if j < k: l = i
if k < i: m = j
return a[k]


cpdef numeric median(numeric[:] arr):
"""
A faster median
"""
cdef Py_ssize_t n = arr.size

if n == 0:
return np.NaN

arr = arr.copy()

if n % 2:
return kth_smallest(arr, n // 2)
else:
return (kth_smallest(arr, n // 2) +
kth_smallest(arr, n // 2 - 1)) / 2


# ----------------------------------------------------------------------
# Pairwise correlation/covariance

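For readers following the deletion above: the removed ``median`` relied on Wirth-style quickselect via ``kth_smallest``. Below is a rough pure-Python sketch of the same selection idea, purely illustrative and not the deleted Cython code; after this PR the per-row work is handled by np.nanmedian in nanops.py instead.

import numpy as np

def kth_smallest(a, k):
    # In-place Hoare-style partitioning around the value at position k,
    # mirroring the deleted Cython helper; operates on a float copy.
    a = np.asarray(a, dtype=float).copy()
    lo, hi = 0, len(a) - 1
    while lo < hi:
        x = a[k]
        i, j = lo, hi
        while True:
            while a[i] < x:
                i += 1
            while x < a[j]:
                j -= 1
            if i <= j:
                a[i], a[j] = a[j], a[i]
                i += 1
                j -= 1
            if i > j:
                break
        if j < k:
            lo = i
        if k < i:
            hi = j
    return a[k]

def quickselect_median(arr):
    # Median of a 1-D sequence via one or two selections, NaN for empty input.
    n = len(arr)
    if n == 0:
        return np.nan
    if n % 2:
        return kth_smallest(arr, n // 2)
    return (kth_smallest(arr, n // 2) + kth_smallest(arr, n // 2 - 1)) / 2
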
11 changes: 10 additions & 1 deletion pandas/_libs/groupby.pyx
@@ -16,7 +16,6 @@ from numpy cimport (ndarray,
from libc.stdlib cimport malloc, free

from util cimport numeric, get_nat
from algos cimport swap
from algos import take_2d_axis1_float64_float64, groupsort_indexer

cdef int64_t iNaT = get_nat()
@@ -25,6 +24,16 @@ cdef double NaN = <double> np.NaN
cdef double nan = NaN


cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
cdef numeric t

# cython doesn't allow pointer dereference so use array syntax
t = a[0]
a[0] = b[0]
b[0] = t
return 0


# TODO: aggregate multiple columns in single pass
# ----------------------------------------------------------------------
# first, nth, last
16 changes: 10 additions & 6 deletions pandas/core/nanops.py
@@ -6,7 +6,7 @@

import numpy as np
from pandas import compat
from pandas._libs import tslib, algos, lib
from pandas._libs import tslib, lib
from pandas.core.dtypes.common import (
_get_dtype,
is_float, is_scalar,
@@ -370,14 +370,13 @@ def nanmean(values, axis=None, skipna=True):
@bottleneck_switch()
def nanmedian(values, axis=None, skipna=True):

values, mask, dtype, dtype_max = _get_values(values, skipna)

def get_median(x):
[Review comment from a contributor] and this will break in numpy < 1.9, you need to do this conditionally.

mask = notna(x)
if not skipna and not mask.all():
return np.nan
return algos.median(com._values_from_object(x[mask]))
return np.nanmedian(x[mask])

values, mask, dtype, dtype_max = _get_values(values, skipna)
if not is_float_dtype(values):
values = values.astype('f8')
values[mask] = np.nan
@@ -389,10 +388,15 @@ def get_median(x):

# an array from a frame
if values.ndim > 1:

# there's a non-empty array to apply over otherwise numpy raises
if notempty:
return _wrap_results(
np.apply_along_axis(get_median, axis, values), dtype)
if not skipna:
return _wrap_results(
np.apply_along_axis(get_median, axis, values), dtype)

# fastpath for the skipna case
return _wrap_results(np.nanmedian(values, axis), dtype)

# must return the correct shape, but median is not defined for the
# empty set so return nans of shape "everything but the passed axis"
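The review comment above flags that np.nanmedian only exists on NumPy >= 1.9, so the fastpath would need a conditional on older NumPy. A hedged sketch of one possible guard; the helper name is illustrative, not necessarily what was eventually merged:

import numpy as np

_HAS_NANMEDIAN = hasattr(np, 'nanmedian')  # np.nanmedian was added in NumPy 1.9

def _median_along_axis(values, axis, get_median):
    # values: float ndarray with NaNs already written into masked positions
    if _HAS_NANMEDIAN:
        # fast path: reduce whole axes at once
        return np.nanmedian(values, axis)
    # fallback for NumPy < 1.9: apply the row-wise helper, as before this PR
    return np.apply_along_axis(get_median, axis, values)
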
4 changes: 2 additions & 2 deletions setup.py
@@ -460,11 +460,11 @@ def pxd(name):
ext_data = {
'_libs.algos': {
'pyxfile': '_libs/algos',
'pxdfiles': ['_libs/src/util', '_libs/algos', '_libs/hashtable'],
'pxdfiles': ['_libs/src/util', '_libs/hashtable'],
'depends': _pxi_dep['algos']},
'_libs.groupby': {
'pyxfile': '_libs/groupby',
'pxdfiles': ['_libs/src/util', '_libs/algos'],
'pxdfiles': ['_libs/src/util'],
'depends': _pxi_dep['groupby']},
'_libs.hashing': {
'pyxfile': '_libs/hashing'},