Skip to content

Commit b48d1ff

Browse files
anmyachevjreback
authored and committed
PERF: cythonizing _concat_date_cols; conversion to float without exceptions in _does_string_look_like_datetime (#25754)
1 parent bc75a72 commit b48d1ff

File tree

8 files changed

+262
-39
lines changed

8 files changed

+262
-39
lines changed

asv_bench/benchmarks/io/csv.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,35 @@ def time_read_csv(self, infer_datetime_format, format):
9696
infer_datetime_format=infer_datetime_format)
9797

9898

99+
class ReadCSVConcatDatetime(StringIORewind):
100+
101+
iso8601 = '%Y-%m-%d %H:%M:%S'
102+
103+
def setup(self):
104+
rng = date_range('1/1/2000', periods=50000, freq='S')
105+
self.StringIO_input = StringIO('\n'.join(
106+
rng.strftime(self.iso8601).tolist()))
107+
108+
def time_read_csv(self):
109+
read_csv(self.data(self.StringIO_input),
110+
header=None, names=['foo'], parse_dates=['foo'],
111+
infer_datetime_format=False)
112+
113+
114+
class ReadCSVConcatDatetimeBadDateValue(StringIORewind):
115+
116+
params = (['nan', '0', ''],)
117+
param_names = ['bad_date_value']
118+
119+
def setup(self, bad_date_value):
120+
self.StringIO_input = StringIO(('%s,\n' % bad_date_value) * 50000)
121+
122+
def time_read_csv(self, bad_date_value):
123+
read_csv(self.data(self.StringIO_input),
124+
header=None, names=['foo', 'bar'], parse_dates=['foo'],
125+
infer_datetime_format=False)
126+
127+
99128
class ReadCSVSkipRows(BaseIO):
100129

101130
fname = '__test__.csv'

asv_bench/benchmarks/io/parsers.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import numpy as np
2+
3+
from pandas._libs.tslibs.parsing import (
4+
_concat_date_cols, _does_string_look_like_datetime)
5+
6+
7+
class DoesStringLookLikeDatetime(object):
8+
9+
params = (['2Q2005', '0.0', '10000'],)
10+
param_names = ['value']
11+
12+
def setup(self, value):
13+
self.objects = [value] * 1000000
14+
15+
def time_check_datetimes(self, value):
16+
for obj in self.objects:
17+
_does_string_look_like_datetime(obj)
18+
19+
20+
class ConcatDateCols(object):
21+
22+
params = ([1234567890, 'AAAA'], [1, 2])
23+
param_names = ['value', 'dim']
24+
25+
def setup(self, value, dim):
26+
count_elem = 10000
27+
if dim == 1:
28+
self.object = (np.array([value] * count_elem),)
29+
if dim == 2:
30+
self.object = (np.array([value] * count_elem),
31+
np.array([value] * count_elem))
32+
33+
def time_check_concat(self, value, dim):
34+
_concat_date_cols(self.object)

doc/source/whatsnew/v0.25.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,8 @@ Performance Improvements
252252
- Improved performance of :meth:`read_csv` by much faster parsing of ``MM/YYYY`` and ``DD/MM/YYYY`` datetime formats (:issue:`25922`)
253253
- Improved performance of nanops for dtypes that cannot store NaNs. Speedup is particularly prominent for :meth:`Series.all` and :meth:`Series.any` (:issue:`25070`)
254254
- Improved performance of :meth:`Series.map` for dictionary mappers on categorical series by mapping the categories instead of mapping all values (:issue:`23785`)
255+
- Improved performance of :meth:`read_csv` by faster concatenating date columns without extra conversion to string for integer/float zero
256+
and float NaN; by faster checking the string for the possibility of being a date (:issue:`25754`)
255257

256258
.. _whatsnew_0250.bug_fixes:
257259

pandas/_libs/lib.pyx

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,9 @@ import warnings
88
import cython
99
from cython import Py_ssize_t
1010

11-
from cpython cimport (Py_INCREF, PyTuple_SET_ITEM,
12-
PyTuple_New,
13-
Py_EQ,
14-
PyObject_RichCompareBool)
11+
from cpython cimport (Py_INCREF, PyTuple_SET_ITEM, PyTuple_New, PyObject_Str,
12+
Py_EQ, Py_SIZE, PyObject_RichCompareBool,
13+
PyUnicode_Join, PyList_New)
1514

1615
from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
1716
PyTime_Check, PyDelta_Check,
@@ -23,10 +22,8 @@ cimport numpy as cnp
2322
from numpy cimport (ndarray, PyArray_GETITEM,
2423
PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew,
2524
flatiter, NPY_OBJECT,
26-
int64_t,
27-
float32_t, float64_t,
28-
uint8_t, uint64_t,
29-
complex128_t)
25+
int64_t, float32_t, float64_t,
26+
uint8_t, uint64_t, complex128_t)
3027
cnp.import_array()
3128

3229
cdef extern from "numpy/arrayobject.h":

pandas/_libs/tslibs/parsing.pyx

Lines changed: 168 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,19 @@ from io import StringIO
77

88
from libc.string cimport strchr
99

10+
import cython
11+
12+
from cpython cimport PyObject_Str, PyUnicode_Join
13+
1014
from cpython.datetime cimport datetime, datetime_new, import_datetime
1115
from cpython.version cimport PY_VERSION_HEX
1216
import_datetime()
1317

1418
import numpy as np
19+
cimport numpy as cnp
20+
from numpy cimport (PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT,
21+
PyArray_IterNew, flatiter, float64_t)
22+
cnp.import_array()
1523

1624
# dateutil compat
1725
from dateutil.tz import (tzoffset,
@@ -26,11 +34,16 @@ from pandas._config import get_option
2634

2735
from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS
2836
from pandas._libs.tslibs.nattype import nat_strings, NaT
29-
from pandas._libs.tslibs.util cimport get_c_string_buf_and_size
37+
from pandas._libs.tslibs.util cimport is_array, get_c_string_buf_and_size
3038

3139
cdef extern from "../src/headers/portable.h":
3240
int getdigit_ascii(char c, int default) nogil
3341

42+
cdef extern from "../src/parser/tokenizer.h":
43+
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
44+
int skip_trailing, int *error, int *maybe_int)
45+
46+
3447
# ----------------------------------------------------------------------
3548
# Constants
3649

@@ -302,20 +315,48 @@ cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False,
302315
return parsed, parsed, reso
303316

304317

305-
cpdef bint _does_string_look_like_datetime(object date_string):
306-
if date_string.startswith('0'):
307-
# Strings starting with 0 are more consistent with a
308-
# date-like string than a number
309-
return True
318+
cpdef bint _does_string_look_like_datetime(object py_string):
319+
"""
320+
Checks whether given string is a datetime: it has to start with '0' or
321+
be greater than or equal to 1000.
310322
311-
try:
312-
if float(date_string) < 1000:
313-
return False
314-
except ValueError:
315-
pass
323+
Parameters
324+
----------
325+
py_string: object
316326
317-
if date_string in _not_datelike_strings:
318-
return False
327+
Returns
328+
-------
329+
whether given string is a datetime
330+
"""
331+
cdef:
332+
const char *buf
333+
char *endptr = NULL
334+
Py_ssize_t length = -1
335+
double converted_date
336+
char first
337+
int error = 0
338+
339+
buf = get_c_string_buf_and_size(py_string, &length)
340+
if length >= 1:
341+
first = buf[0]
342+
if first == b'0':
343+
# Strings starting with 0 are more consistent with a
344+
# date-like string than a number
345+
return True
346+
elif py_string in _not_datelike_strings:
347+
return False
348+
else:
349+
# xstrtod with such parameters copies behavior of python `float`
350+
# cast; for example, " 35.e-1 " is valid string for this cast so,
351+
# for a correct xstrtod call it is necessary to pass these params:
352+
# b'.' - a dot is used as separator, b'e' - an exponential form of
353+
# a float number can be used, b'\0' - not to use a thousand
354+
# separator, 1 - skip extra spaces before and after,
355+
converted_date = xstrtod(buf, &endptr,
356+
b'.', b'e', b'\0', 1, &error, NULL)
357+
# if there were no errors and the whole line was parsed, then ...
358+
if error == 0 and endptr == buf + length:
359+
return converted_date >= 1000
319360

320361
return True
321362

@@ -857,3 +898,117 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse,
857898
return guessed_format
858899
else:
859900
return None
901+
902+
903+
@cython.wraparound(False)
904+
@cython.boundscheck(False)
905+
cdef inline object convert_to_unicode(object item,
906+
bint keep_trivial_numbers):
907+
"""
908+
Convert `item` to str.
909+
910+
Parameters
911+
----------
912+
item : object
913+
keep_trivial_numbers : bool
914+
if True, then conversion (to string from integer/float zero)
915+
is not performed
916+
917+
Returns
918+
-------
919+
str or int or float
920+
"""
921+
cdef:
922+
float64_t float_item
923+
924+
if keep_trivial_numbers:
925+
if isinstance(item, int):
926+
if <int>item == 0:
927+
return item
928+
elif isinstance(item, float):
929+
float_item = item
930+
if float_item == 0.0 or float_item != float_item:
931+
return item
932+
933+
if not isinstance(item, str):
934+
item = PyObject_Str(item)
935+
936+
return item
937+
938+
939+
@cython.wraparound(False)
940+
@cython.boundscheck(False)
941+
def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True):
942+
"""
943+
Concatenates elements from numpy arrays in `date_cols` into strings.
944+
945+
Parameters
946+
----------
947+
date_cols : tuple of numpy arrays
948+
keep_trivial_numbers : bool, default True
949+
if True and len(date_cols) == 1, then
950+
conversion (to string from integer/float zero) is not performed
951+
952+
Returns
953+
-------
954+
arr_of_rows : ndarray (dtype=object)
955+
956+
Examples
957+
--------
958+
>>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object)
959+
>>> times=np.array(['11:20', '10:45'], dtype=object)
960+
>>> result = _concat_date_cols((dates, times))
961+
>>> result
962+
array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object)
963+
"""
964+
cdef:
965+
Py_ssize_t rows_count = 0, col_count = len(date_cols)
966+
Py_ssize_t col_idx, row_idx
967+
list list_to_join
968+
cnp.ndarray[object] iters
969+
object[::1] iters_view
970+
flatiter it
971+
cnp.ndarray[object] result
972+
object[:] result_view
973+
974+
if col_count == 0:
975+
return np.zeros(0, dtype=object)
976+
977+
if not all(is_array(array) for array in date_cols):
978+
raise ValueError("not all elements from date_cols are numpy arrays")
979+
980+
rows_count = min(len(array) for array in date_cols)
981+
result = np.zeros(rows_count, dtype=object)
982+
result_view = result
983+
984+
if col_count == 1:
985+
array = date_cols[0]
986+
it = <flatiter>PyArray_IterNew(array)
987+
for row_idx in range(rows_count):
988+
item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
989+
result_view[row_idx] = convert_to_unicode(item,
990+
keep_trivial_numbers)
991+
PyArray_ITER_NEXT(it)
992+
else:
993+
# create fixed size list - more efficient memory allocation
994+
list_to_join = [None] * col_count
995+
iters = np.zeros(col_count, dtype=object)
996+
997+
# create memoryview of iters ndarray, that will contain some
998+
# flatiter's for each array in `date_cols` - more efficient indexing
999+
iters_view = iters
1000+
for col_idx, array in enumerate(date_cols):
1001+
iters_view[col_idx] = PyArray_IterNew(array)
1002+
1003+
# array elements that are on the same line are converted to one string
1004+
for row_idx in range(rows_count):
1005+
for col_idx, array in enumerate(date_cols):
1006+
# this cast is needed, because we did not find a way
1007+
# to efficiently store `flatiter` type objects in ndarray
1008+
it = <flatiter>iters_view[col_idx]
1009+
item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
1010+
list_to_join[col_idx] = convert_to_unicode(item, False)
1011+
PyArray_ITER_NEXT(it)
1012+
result_view[row_idx] = PyUnicode_Join(' ', list_to_join)
1013+
1014+
return result

pandas/io/parsers.py

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3186,7 +3186,7 @@ def _make_date_converter(date_parser=None, dayfirst=False,
31863186
infer_datetime_format=False, cache_dates=True):
31873187
def converter(*date_cols):
31883188
if date_parser is None:
3189-
strs = _concat_date_cols(date_cols)
3189+
strs = parsing._concat_date_cols(date_cols)
31903190

31913191
try:
31923192
return tools.to_datetime(
@@ -3216,10 +3216,10 @@ def converter(*date_cols):
32163216
except Exception:
32173217
try:
32183218
return tools.to_datetime(
3219-
parsing.try_parse_dates(_concat_date_cols(date_cols),
3220-
parser=date_parser,
3221-
dayfirst=dayfirst),
3222-
cache=cache_dates,
3219+
parsing.try_parse_dates(
3220+
parsing._concat_date_cols(date_cols),
3221+
parser=date_parser,
3222+
dayfirst=dayfirst),
32233223
errors='ignore')
32243224
except Exception:
32253225
return generic_parser(date_parser, *date_cols)
@@ -3511,15 +3511,6 @@ def _get_col_names(colspec, columns):
35113511
return colnames
35123512

35133513

3514-
def _concat_date_cols(date_cols):
3515-
if len(date_cols) == 1:
3516-
return np.array([str(x) for x in date_cols[0]], dtype=object)
3517-
3518-
rs = np.array([' '.join(str(y) for y in x)
3519-
for x in zip(*date_cols)], dtype=object)
3520-
return rs
3521-
3522-
35233514
class FixedWidthReader(BaseIterator):
35243515
"""
35253516
A reader of fixed-width lines.

0 commit comments

Comments
 (0)