@@ -7,11 +7,19 @@ from io import StringIO
7
7
8
8
from libc.string cimport strchr
9
9
10
+ import cython
11
+
12
+ from cpython cimport PyObject_Str, PyUnicode_Join
13
+
10
14
from cpython.datetime cimport datetime, datetime_new, import_datetime
11
15
from cpython.version cimport PY_VERSION_HEX
12
16
import_datetime()
13
17
14
18
import numpy as np
19
+ cimport numpy as cnp
20
+ from numpy cimport (PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT,
21
+ PyArray_IterNew, flatiter, float64_t)
22
+ cnp.import_array()
15
23
16
24
# dateutil compat
17
25
from dateutil.tz import (tzoffset,
@@ -26,11 +34,16 @@ from pandas._config import get_option
26
34
27
35
from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS
28
36
from pandas._libs.tslibs.nattype import nat_strings, NaT
29
- from pandas._libs.tslibs.util cimport get_c_string_buf_and_size
37
+ from pandas._libs.tslibs.util cimport is_array, get_c_string_buf_and_size
30
38
31
39
cdef extern from " ../src/headers/portable.h" :
32
40
int getdigit_ascii(char c, int default) nogil
33
41
42
+ cdef extern from " ../src/parser/tokenizer.h" :
43
+ double xstrtod(const char * p, char ** q, char decimal, char sci, char tsep,
44
+ int skip_trailing, int * error, int * maybe_int)
45
+
46
+
34
47
# ----------------------------------------------------------------------
35
48
# Constants
36
49
@@ -302,20 +315,48 @@ cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False,
302
315
return parsed, parsed, reso
303
316
304
317
305
cpdef bint _does_string_look_like_datetime(object py_string):
    """
    Check whether the given string looks like a datetime rather than a number.

    The string is considered datetime-like when it starts with '0', or when
    it is not listed in ``_not_datelike_strings`` and does not parse, as a
    whole, to a float below 1000.

    Parameters
    ----------
    py_string : object

    Returns
    -------
    bint
        Whether the given string looks like a datetime.
    """
    cdef:
        const char *buf
        char *endptr = NULL
        Py_ssize_t length = -1
        double as_number
        char lead_char
        int error = 0

    buf = get_c_string_buf_and_size(py_string, &length)
    if length < 1:
        # no characters to inspect: fall back to "datetime-like"
        return True

    lead_char = buf[0]
    if lead_char == b'0':
        # Strings starting with 0 are more consistent with a
        # date-like string than a number
        return True

    if py_string in _not_datelike_strings:
        return False

    # xstrtod is called with parameters intended to mirror the python
    # `float` cast (for example, " 35.e-1 " is a valid string for it):
    # b'.' - a dot is the decimal separator, b'e' - exponential notation
    # is allowed, b'\0' - no thousands separator, 1 - skip surrounding
    # whitespace, NULL - the `maybe_int` output is not requested
    as_number = xstrtod(buf, &endptr,
                        b'.', b'e', b'\0', 1, &error, NULL)

    # only trust the numeric value when parsing succeeded and consumed
    # the whole buffer; otherwise treat the string as datetime-like
    if error != 0 or endptr != buf + length:
        return True

    return as_number >= 1000
321
362
@@ -857,3 +898,117 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse,
857
898
return guessed_format
858
899
else :
859
900
return None
901
+
902
+
903
@cython.wraparound(False)
@cython.boundscheck(False)
cdef inline object convert_to_unicode(object item,
                                      bint keep_trivial_numbers):
    """
    Convert `item` to str.

    Parameters
    ----------
    item : object
    keep_trivial_numbers : bool
        if True, then an integer zero, a float zero or a float NaN is
        returned unchanged instead of being converted to a string

    Returns
    -------
    str or int or float
        `item` itself when it is already a str (or a kept trivial number),
        otherwise the result of ``PyObject_Str(item)``.
    """
    cdef:
        float64_t float_item

    if keep_trivial_numbers:
        if isinstance(item, int):
            # Compare as Python objects: converting with `<int>item` raises
            # OverflowError for integers that do not fit into a C int.
            if item == 0:
                return item
        elif isinstance(item, float):
            float_item = item
            # float zero, or NaN (the only value not equal to itself)
            if float_item == 0.0 or float_item != float_item:
                return item

    if not isinstance(item, str):
        item = PyObject_Str(item)

    return item
937
+
938
+
939
@cython.wraparound(False)
@cython.boundscheck(False)
def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True):
    """
    Join the elements of the numpy arrays in `date_cols` row by row.

    Parameters
    ----------
    date_cols : tuple of numpy arrays
    keep_trivial_numbers : bool, default True
        if True and len(date_cols) == 1, then
        conversion (to string from integer/float zero) is not performed

    Returns
    -------
    arr_of_rows : ndarray (dtype=object)

    Raises
    ------
    ValueError
        If any element of `date_cols` is not a numpy array.

    Examples
    --------
    >>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object)
    >>> times=np.array(['11:20', '10:45'], dtype=object)
    >>> result = _concat_date_cols((dates, times))
    >>> result
    array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object)
    """
    cdef:
        Py_ssize_t n_rows = 0, n_cols = len(date_cols)
        Py_ssize_t col, row
        list parts
        cnp.ndarray[object] col_iters
        object[::1] col_iters_view
        flatiter it
        cnp.ndarray[object] result
        object[:] result_view

    if n_cols == 0:
        return np.zeros(0, dtype=object)

    for column in date_cols:
        if not is_array(column):
            raise ValueError("not all elements from date_cols are numpy arrays")

    # the shortest column determines how many rows are produced
    n_rows = min(len(column) for column in date_cols)
    result = np.zeros(n_rows, dtype=object)
    result_view = result

    if n_cols == 1:
        # single column: convert each element on its own, optionally
        # leaving trivial numbers untouched
        column = date_cols[0]
        it = <flatiter>PyArray_IterNew(column)
        for row in range(n_rows):
            item = PyArray_GETITEM(column, PyArray_ITER_DATA(it))
            result_view[row] = convert_to_unicode(item,
                                                  keep_trivial_numbers)
            PyArray_ITER_NEXT(it)
        return result

    # fixed-size list that is reused for every row
    parts = [None] * n_cols
    col_iters = np.zeros(n_cols, dtype=object)

    # memoryview over `col_iters`, which holds one flatiter per array in
    # `date_cols`, used for indexing inside the row loop
    col_iters_view = col_iters
    for col in range(n_cols):
        col_iters_view[col] = PyArray_IterNew(date_cols[col])

    # elements that share a row are converted and joined into one string
    for row in range(n_rows):
        for col in range(n_cols):
            column = date_cols[col]
            # the iterators are stored as plain objects, so cast back to
            # `flatiter` before use
            it = <flatiter>col_iters_view[col]
            item = PyArray_GETITEM(column, PyArray_ITER_DATA(it))
            parts[col] = convert_to_unicode(item, False)
            PyArray_ITER_NEXT(it)
        result_view[row] = PyUnicode_Join(' ', parts)

    return result
0 commit comments