Skip to content

Replace (private) _checknull with (public) is_nan #22146

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Aug 7, 2018
9 changes: 4 additions & 5 deletions pandas/_libs/hashing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,13 @@
# at https://github.com/veorq/SipHash

import cython
from cpython cimport PyBytes_Check, PyUnicode_Check
from libc.stdlib cimport malloc, free

import numpy as np
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t

from util cimport _checknull
from cpython cimport (PyBytes_Check,
PyUnicode_Check)
from libc.stdlib cimport malloc, free
from util cimport is_nan

DEF cROUNDS = 2
DEF dROUNDS = 4
Expand Down Expand Up @@ -65,7 +64,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
data = <bytes>val
elif PyUnicode_Check(val):
data = <bytes>val.encode(encoding)
elif _checknull(val):
elif val is None or is_nan(val):
# null, stringify and encode
data = <bytes>str(val).encode(encoding)

Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/interval.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ cpdef intervals_to_interval_bounds(ndarray intervals,

for i in range(len(intervals)):
interval = intervals[i]
if util._checknull(interval):
if interval is None or util.is_nan(interval):
left[i] = np.nan
right[i] = np.nan
continue
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ from missing cimport checknull

cimport util
cdef int64_t NPY_NAT = util.get_nat()
from util cimport is_array, _checknull
from util cimport is_array, is_nan


def values_from_object(object o):
Expand Down Expand Up @@ -429,7 +429,7 @@ cpdef bint array_equivalent_object(object[:] left, object[:] right):
# we are either not equal or both nan
# I think None == None will be true here
if not (PyObject_RichCompareBool(x, y, Py_EQ) or
_checknull(x) and _checknull(y)):
(x is None or is_nan(x)) and (y is None or is_nan(y))):
return False
return True

Expand Down
10 changes: 5 additions & 5 deletions pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ cpdef bint checknull(object val):
elif util.is_array(val):
return False
else:
return util._checknull(val)
return val is None or util.is_nan(val)


cpdef bint checknull_old(object val):
Expand Down Expand Up @@ -113,7 +113,7 @@ cpdef bint checknull_old(object val):
elif util.is_array(val):
return False
else:
return util._checknull(val)
return val is None or util.is_nan(val)


cdef inline bint _check_none_nan_inf_neginf(object val):
Expand Down Expand Up @@ -297,7 +297,7 @@ cpdef bint isneginf_scalar(object val):
cdef inline bint is_null_datetime64(v):
# determine if we have a null for a datetime (or integer versions),
# excluding np.timedelta64('nat')
if util._checknull(v):
if v is None or util.is_nan(v):
return True
elif v is NaT:
return True
Expand All @@ -309,7 +309,7 @@ cdef inline bint is_null_datetime64(v):
cdef inline bint is_null_timedelta64(v):
# determine if we have a null for a timedelta (or integer versions),
# excluding np.datetime64('nat')
if util._checknull(v):
if v is None or util.is_nan(v):
return True
elif v is NaT:
return True
Expand All @@ -321,7 +321,7 @@ cdef inline bint is_null_timedelta64(v):
cdef inline bint is_null_period(v):
# determine if we have a null for a Period (or integer versions),
# excluding np.datetime64('nat') and np.timedelta64('nat')
if util._checknull(v):
if v is None or util.is_nan(v):
return True
elif v is NaT:
return True
Expand Down
10 changes: 5 additions & 5 deletions pandas/_libs/ops.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import numpy as np
from numpy cimport ndarray, uint8_t


from util cimport UINT8_MAX, _checknull
from util cimport UINT8_MAX, is_nan

from missing cimport checknull

Expand Down Expand Up @@ -190,13 +190,13 @@ def scalar_binop(ndarray[object] values, object val, object op):
object x

result = np.empty(n, dtype=object)
if _checknull(val):
if val is None or is_nan(val):
result.fill(val)
return result

for i in range(n):
x = values[i]
if _checknull(x):
if x is None or is_nan(x):
result[i] = x
else:
result[i] = op(x, val)
Expand Down Expand Up @@ -237,9 +237,9 @@ def vec_binop(ndarray[object] left, ndarray[object] right, object op):
try:
result[i] = op(x, y)
except TypeError:
if _checknull(x):
if x is None or is_nan(x):
result[i] = x
elif _checknull(y):
elif y is None or is_nan(y):
result[i] = y
else:
raise
Expand Down
9 changes: 5 additions & 4 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ from pandas.core.dtypes.common import (
pandas_dtype)
from pandas.core.arrays import Categorical
from pandas.core.dtypes.concat import union_categoricals
import pandas.io.common as com
import pandas.io.common as icom

from pandas.errors import (ParserError, DtypeWarning,
EmptyDataError, ParserWarning)
Expand Down Expand Up @@ -665,7 +665,8 @@ cdef class TextReader:
if b'utf-16' in (self.encoding or b''):
# we need to read utf-16 through UTF8Recoder.
# if source is utf-16, convert source to utf-8 by UTF8Recoder.
source = com.UTF8Recoder(source, self.encoding.decode('utf-8'))
source = icom.UTF8Recoder(source,
self.encoding.decode('utf-8'))
self.encoding = b'utf-8'
self.c_encoding = <char*> self.encoding

Expand Down Expand Up @@ -1356,7 +1357,7 @@ cdef asbytes(object o):
# common NA values
# no longer excluding inf representations
# '1.#INF','-1.#INF', '1.#INF000000',
_NA_VALUES = _ensure_encoded(list(com._NA_VALUES))
_NA_VALUES = _ensure_encoded(list(icom._NA_VALUES))


def _maybe_upcast(arr):
Expand Down Expand Up @@ -2247,7 +2248,7 @@ def sanitize_objects(ndarray[object] values, set na_values,
n = len(values)
onan = np.nan

for i from 0 <= i < n:
for i in range(n):
val = values[i]
if (convert_empty and val == '') or (val in na_values):
values[i] = onan
Expand Down
5 changes: 3 additions & 2 deletions pandas/_libs/skiplist.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Link: http://code.activestate.com/recipes/576930/

# Cython version: Wes McKinney
from random import random

from libc.math cimport log

Expand All @@ -17,8 +18,6 @@ cdef double Log2(double x):
return log(x) / log(2.)


from random import random

# TODO: optimize this, make less messy

cdef class Node:
Expand All @@ -32,9 +31,11 @@ cdef class Node:
self.next = next
self.width = width


# Singleton terminator node
NIL = Node(np.inf, [], [])


cdef class IndexableSkiplist:
"""
Sorted collection supporting O(lg n) insertion, removal, and
Expand Down
8 changes: 4 additions & 4 deletions pandas/_libs/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ def infer_dtype(object value, bint skipna=False):

# do not use is_nul_datetimelike to keep
# np.datetime64('nat') and np.timedelta64('nat')
if util._checknull(val):
if val is None or util.is_nan(val):
pass
elif val is NaT:
seen_pdnat = True
Expand Down Expand Up @@ -522,7 +522,7 @@ cpdef object infer_datetimelike_array(object arr):
if len(objs) == 3:
break

elif util._checknull(v):
elif v is None or util.is_nan(v):
# nan or None
pass
elif v is NaT:
Expand Down Expand Up @@ -660,7 +660,7 @@ cdef class Validator:
)

cdef bint is_valid_null(self, object value) except -1:
return util._checknull(value)
return value is None or util.is_nan(value)

cdef bint is_array_typed(self) except -1:
return False
Expand Down Expand Up @@ -828,7 +828,7 @@ cdef class TemporalValidator(Validator):
cdef inline bint is_valid_skipna(self, object value) except -1:
cdef:
bint is_typed_null = self.is_valid_null(value)
bint is_generic_null = util._checknull(value)
bint is_generic_null = value is None or util.is_nan(value)
self.generic_null_count += is_typed_null and is_generic_null
return self.is_value_typed(value) or is_typed_null or is_generic_null

Expand Down
5 changes: 2 additions & 3 deletions pandas/_libs/tslibs/nattype.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -586,8 +586,7 @@ NaT = NaTType()

cdef inline bint checknull_with_nat(object val):
""" utility to check if a value is a nat or not """
return val is None or (
PyFloat_Check(val) and val != val) or val is NaT
return val is None or util.is_nan(val) or val is NaT


cdef inline bint is_null_datetimelike(object val):
Expand All @@ -602,7 +601,7 @@ cdef inline bint is_null_datetimelike(object val):
-------
null_datetimelike : bool
"""
if util._checknull(val):
if val is None or util.is_nan(val):
return True
elif val is NaT:
return True
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1368,7 +1368,7 @@ class Timedelta(_Timedelta):
'{op}'.format(dtype=other.dtype,
op='__floordiv__'))

elif is_float_object(other) and util._checknull(other):
elif is_float_object(other) and util.is_nan(other):
# i.e. np.nan
return NotImplemented

Expand Down
15 changes: 13 additions & 2 deletions pandas/_libs/tslibs/util.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -228,5 +228,16 @@ cdef inline bint is_offset_object(object val):
return getattr(val, '_typ', None) == "dateoffset"


cdef inline bint _checknull(object val):
return val is None or (PyFloat_Check(val) and val != val)
cdef inline bint is_nan(object val):
"""
Check if val is a Not-A-Number float, including float('NaN') and np.nan.

Parameters
----------
val : object

Returns
-------
is_nan : bool
"""
return isinstance(val, float) and val != val
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be is_float_object (e.g. this excludes compex)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is functionally equivalent to the current implementation.

(The current version uses PyFloatCheck(val) instead of isinstance(val, float), but cython converts the latter to the former)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this handle complex as well? I would rather use the same check as we do elsewhere, as this is confusing when to use one vs the other.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this handle complex as well?

No. This also only checks for python float, as opposed to is_float_object which also checks for numpy float (the examples I've checked with isinstance(np.float64(2.3), float) have come back True, not sure what the Falsey case is).

_libs.missing.checknull begins with the check you might have in mind:

if is_float_object(val) or is_complex_object(val):
    return val != val
[...]

ATM _checknull is used in exactly 19 places, at least one of which is in a float-only branch. Based on a quick look it isn't obvious which usages (at least one) are specifically-for-float. I'll take a look.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After looking these over, I'm inclined to think it makes sense to use is_float_object but continue excluding complex. At least two of the usages are floating-specific, and complex is sufficiently corner-case-y that I'd rather check for it explicitly-or-not-at-all.

It also looks like checknull_with_nat and is_null_period may be unnecessary, will look at that in an upcoming pass.

2 changes: 1 addition & 1 deletion pandas/_libs/writers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def string_array_replace_from_nan_rep(
if replace is None:
replace = np.nan

for i from 0 <= i < length:
for i in range(length):
if arr[i] == nan_rep:
arr[i] = replace

Expand Down