Skip to content

Commit 2bb0ed5

Browse files
authored
Merge branch 'main' into standard-library-imports-style-check
2 parents 7597832 + 99859e4 commit 2bb0ed5

19 files changed

+335
-112
lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,8 @@ repos:
6363
'--extensions=c,h',
6464
'--headers=h',
6565
--recursive,
66-
'--filter=-readability/casting,-runtime/int,-build/include_subdir'
66+
--linelength=88,
67+
'--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size'
6768
]
6869
- repo: https://github.com/PyCQA/flake8
6970
rev: 6.0.0

doc/source/user_guide/io.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3833,7 +3833,7 @@ OpenDocument Spreadsheets
38333833
The io methods for `Excel files`_ also support reading and writing OpenDocument spreadsheets
38343834
using the `odfpy <https://pypi.org/project/odfpy/>`__ module. The semantics and features for reading and writing
38353835
OpenDocument spreadsheets match what can be done for `Excel files`_ using
3836-
``engine='odf'``.
3836+
``engine='odf'``. The optional dependency 'odfpy' needs to be installed.
38373837

38383838
The :func:`~pandas.read_excel` method can read OpenDocument spreadsheets
38393839

doc/source/whatsnew/v2.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ Other enhancements
103103
- :meth:`DataFrame.plot.hist` now recognizes ``xlabel`` and ``ylabel`` arguments (:issue:`49793`)
104104
- Improved error message in :func:`to_datetime` for non-ISO8601 formats, informing users about the position of the first error (:issue:`50361`)
105105
- Improved error message when trying to align :class:`DataFrame` objects (for example, in :func:`DataFrame.compare`) to clarify that "identically labelled" refers to both index and columns (:issue:`50083`)
106+
- Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`)
106107
-
107108

108109
.. ---------------------------------------------------------------------------

pandas/_libs/tslibs/np_datetime.pxd

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,3 +120,9 @@ cdef int64_t convert_reso(
120120
NPY_DATETIMEUNIT to_reso,
121121
bint round_ok,
122122
) except? -1
123+
124+
cdef extern from "src/datetime/np_datetime_strings.h":
125+
ctypedef enum FormatRequirement:
126+
PARTIAL_MATCH
127+
EXACT_MATCH
128+
INFER_FORMAT

pandas/_libs/tslibs/np_datetime.pyx

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ cdef extern from "src/datetime/np_datetime_strings.h":
5353
npy_datetimestruct *out,
5454
NPY_DATETIMEUNIT *out_bestunit,
5555
int *out_local, int *out_tzoffset,
56-
const char *format, int format_len, int exact)
56+
const char *format, int format_len,
57+
FormatRequirement exact)
5758

5859

5960
# ----------------------------------------------------------------------
@@ -286,17 +287,20 @@ cdef int string_to_dts(
286287
const char* buf
287288
Py_ssize_t format_length
288289
const char* format_buf
290+
FormatRequirement format_requirement
289291

290292
buf = get_c_string_buf_and_size(val, &length)
291293
if format is None:
292294
format_buf = b""
293295
format_length = 0
294-
exact = False
296+
format_requirement = INFER_FORMAT
295297
else:
296298
format_buf = get_c_string_buf_and_size(format, &format_length)
299+
format_requirement = <FormatRequirement>exact
297300
return parse_iso_8601_datetime(buf, length, want_exc,
298301
dts, out_bestunit, out_local, out_tzoffset,
299-
format_buf, format_length, exact)
302+
format_buf, format_length,
303+
format_requirement)
300304

301305

302306
cpdef ndarray astype_overflowsafe(

pandas/_libs/tslibs/src/datetime/np_datetime_strings.c

Lines changed: 108 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -67,49 +67,62 @@ This file implements string parsing and creation for NumPy datetime.
6767
* Returns 0 on success, -1 on failure.
6868
*/
6969

70+
// Result of one compare_format() call, i.e. of checking one datetime part
// (or separator) against the `format` string.
typedef enum {
71+
COMPARISON_SUCCESS,  // part matched `format`, or no check was needed (INFER_FORMAT)
72+
COMPLETED_PARTIAL_MATCH,  // PARTIAL_MATCH and `format` is exhausted; callers `goto finish`
73+
COMPARISON_ERROR  // mismatch or too few format characters; callers `goto parse_error`
74+
} DatetimePartParseResult;
7075
// This function will advance the pointer on format
7176
// and decrement characters_remaining by n on success
72-
// On failure will return -1 without incrementing
73-
static int compare_format(const char **format, int *characters_remaining,
74-
const char *compare_to, int n, const int exact) {
77+
// On failure will return COMPARISON_ERROR without incrementing
78+
// If `format_requirement` is PARTIAL_MATCH, and the `format` string has
79+
// been exhausted, then return COMPLETED_PARTIAL_MATCH.
// If `format_requirement` is INFER_FORMAT, nothing is compared: the function
// returns COMPARISON_SUCCESS and leaves `format` / `characters_remaining` as is.
80+
static DatetimePartParseResult compare_format(
81+
const char **format,
82+
int *characters_remaining,
83+
const char *compare_to,
84+
int n,
85+
const FormatRequirement format_requirement
86+
) {
87+
if (format_requirement == INFER_FORMAT) {  // no format to check against
88+
return COMPARISON_SUCCESS;
89+
}
90+
if (*characters_remaining < 0) {  // a negative remaining length is an error
91+
return COMPARISON_ERROR;
92+
}
93+
if (format_requirement == PARTIAL_MATCH && *characters_remaining == 0) {
94+
return COMPLETED_PARTIAL_MATCH;
95+
}
7596
if (*characters_remaining < n) {  // fewer than n format characters are left
76-
if (exact) {
77-
// TODO(pandas-dev): in the future we should set a PyErr here
78-
// to be very clear about what went wrong
79-
return -1;
80-
} else if (*characters_remaining) {
81-
// TODO(pandas-dev): same return value in this function as
82-
// above branch, but stub out a future where
83-
// we have a better error message
84-
return -1;
85-
} else {
86-
return 0;
87-
}
97+
// TODO(pandas-dev): PyErr to differentiate what went wrong
98+
return COMPARISON_ERROR;
8899
} else {
89100
if (strncmp(*format, compare_to, n)) {  // nonzero: next n characters differ
90101
// TODO(pandas-dev): PyErr to differentiate what went wrong
91-
return -1;
102+
return COMPARISON_ERROR;
92103
} else {
93104
*format += n;  // consume the n matched format characters
94105
*characters_remaining -= n;
95-
return 0;
106+
return COMPARISON_SUCCESS;
96107
}
97108
}
98-
return 0;
109+
return COMPARISON_SUCCESS;  // not reached: both branches above return
99110
}
100111

101112
int parse_iso_8601_datetime(const char *str, int len, int want_exc,
102113
npy_datetimestruct *out,
103114
NPY_DATETIMEUNIT *out_bestunit,
104115
int *out_local, int *out_tzoffset,
105-
const char* format, int format_len, int exact) {
116+
const char* format, int format_len,
117+
FormatRequirement format_requirement) {
106118
if (len < 0 || format_len < 0)
107119
goto parse_error;
108120
int year_leap = 0;
109121
int i, numdigits;
110122
const char *substr;
111123
int sublen;
112124
NPY_DATETIMEUNIT bestunit = NPY_FR_GENERIC;
125+
DatetimePartParseResult comparison;
113126

114127
/* If year-month-day are separated by a valid separator,
115128
* months/days without leading zeroes will be parsed
@@ -139,8 +152,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
139152
while (sublen > 0 && isspace(*substr)) {
140153
++substr;
141154
--sublen;
142-
if (compare_format(&format, &format_len, " ", 1, exact)) {
155+
comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
156+
if (comparison == COMPARISON_ERROR) {
143157
goto parse_error;
158+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
159+
goto finish;
144160
}
145161
}
146162

@@ -155,8 +171,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
155171
}
156172

157173
/* PARSE THE YEAR (4 digits) */
158-
if (compare_format(&format, &format_len, "%Y", 2, exact)) {
174+
comparison = compare_format(&format, &format_len, "%Y", 2, format_requirement);
175+
if (comparison == COMPARISON_ERROR) {
159176
goto parse_error;
177+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
178+
goto finish;
160179
}
161180

162181
out->year = 0;
@@ -202,8 +221,12 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
202221
++substr;
203222
--sublen;
204223

205-
if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) {
224+
comparison = compare_format(&format, &format_len, &ymd_sep, 1,
225+
format_requirement);
226+
if (comparison == COMPARISON_ERROR) {
206227
goto parse_error;
228+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
229+
goto finish;
207230
}
208231
/* Cannot have trailing separator */
209232
if (sublen == 0 || !isdigit(*substr)) {
@@ -212,8 +235,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
212235
}
213236

214237
/* PARSE THE MONTH */
215-
if (compare_format(&format, &format_len, "%m", 2, exact)) {
238+
comparison = compare_format(&format, &format_len, "%m", 2, format_requirement);
239+
if (comparison == COMPARISON_ERROR) {
216240
goto parse_error;
241+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
242+
goto finish;
217243
}
218244
/* First digit required */
219245
out->month = (*substr - '0');
@@ -258,14 +284,21 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
258284
}
259285
++substr;
260286
--sublen;
261-
if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) {
287+
comparison = compare_format(&format, &format_len, &ymd_sep, 1,
288+
format_requirement);
289+
if (comparison == COMPARISON_ERROR) {
262290
goto parse_error;
291+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
292+
goto finish;
263293
}
264294
}
265295

266296
/* PARSE THE DAY */
267-
if (compare_format(&format, &format_len, "%d", 2, exact)) {
297+
comparison = compare_format(&format, &format_len, "%d", 2, format_requirement);
298+
if (comparison == COMPARISON_ERROR) {
268299
goto parse_error;
300+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
301+
goto finish;
269302
}
270303
/* First digit required */
271304
if (!isdigit(*substr)) {
@@ -306,15 +339,21 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
306339
if ((*substr != 'T' && *substr != ' ') || sublen == 1) {
307340
goto parse_error;
308341
}
309-
if (compare_format(&format, &format_len, substr, 1, exact)) {
310-
goto parse_error;
311-
}
342+
comparison = compare_format(&format, &format_len, substr, 1, format_requirement);
343+
if (comparison == COMPARISON_ERROR) {
344+
goto parse_error;
345+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
346+
goto finish;
347+
}
312348
++substr;
313349
--sublen;
314350

315351
/* PARSE THE HOURS */
316-
if (compare_format(&format, &format_len, "%H", 2, exact)) {
352+
comparison = compare_format(&format, &format_len, "%H", 2, format_requirement);
353+
if (comparison == COMPARISON_ERROR) {
317354
goto parse_error;
355+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
356+
goto finish;
318357
}
319358
/* First digit required */
320359
if (!isdigit(*substr)) {
@@ -359,8 +398,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
359398
if (sublen == 0 || !isdigit(*substr)) {
360399
goto parse_error;
361400
}
362-
if (compare_format(&format, &format_len, ":", 1, exact)) {
401+
comparison = compare_format(&format, &format_len, ":", 1, format_requirement);
402+
if (comparison == COMPARISON_ERROR) {
363403
goto parse_error;
404+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
405+
goto finish;
364406
}
365407
} else if (!isdigit(*substr)) {
366408
if (!hour_was_2_digits) {
@@ -370,8 +412,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
370412
}
371413

372414
/* PARSE THE MINUTES */
373-
if (compare_format(&format, &format_len, "%M", 2, exact)) {
415+
comparison = compare_format(&format, &format_len, "%M", 2, format_requirement);
416+
if (comparison == COMPARISON_ERROR) {
374417
goto parse_error;
418+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
419+
goto finish;
375420
}
376421
/* First digit required */
377422
out->min = (*substr - '0');
@@ -405,8 +450,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
405450
/* If we make it through this condition block, then the next
406451
* character is a digit. */
407452
if (has_hms_sep && *substr == ':') {
408-
if (compare_format(&format, &format_len, ":", 1, exact)) {
453+
comparison = compare_format(&format, &format_len, ":", 1, format_requirement);
454+
if (comparison == COMPARISON_ERROR) {
409455
goto parse_error;
456+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
457+
goto finish;
410458
}
411459
++substr;
412460
--sublen;
@@ -420,8 +468,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
420468
}
421469

422470
/* PARSE THE SECONDS */
423-
if (compare_format(&format, &format_len, "%S", 2, exact)) {
471+
comparison = compare_format(&format, &format_len, "%S", 2, format_requirement);
472+
if (comparison == COMPARISON_ERROR) {
424473
goto parse_error;
474+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
475+
goto finish;
425476
}
426477
/* First digit required */
427478
out->sec = (*substr - '0');
@@ -448,17 +499,23 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
448499
if (sublen > 0 && *substr == '.') {
449500
++substr;
450501
--sublen;
451-
if (compare_format(&format, &format_len, ".", 1, exact)) {
502+
comparison = compare_format(&format, &format_len, ".", 1, format_requirement);
503+
if (comparison == COMPARISON_ERROR) {
452504
goto parse_error;
505+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
506+
goto finish;
453507
}
454508
} else {
455509
bestunit = NPY_FR_s;
456510
goto parse_timezone;
457511
}
458512

459513
/* PARSE THE MICROSECONDS (0 to 6 digits) */
460-
if (compare_format(&format, &format_len, "%f", 2, exact)) {
514+
comparison = compare_format(&format, &format_len, "%f", 2, format_requirement);
515+
if (comparison == COMPARISON_ERROR) {
461516
goto parse_error;
517+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
518+
goto finish;
462519
}
463520
numdigits = 0;
464521
for (i = 0; i < 6; ++i) {
@@ -524,8 +581,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
524581
while (sublen > 0 && isspace(*substr)) {
525582
++substr;
526583
--sublen;
527-
if (compare_format(&format, &format_len, " ", 1, exact)) {
584+
comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
585+
if (comparison == COMPARISON_ERROR) {
528586
goto parse_error;
587+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
588+
goto finish;
529589
}
530590
}
531591

@@ -539,8 +599,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
539599

540600
/* UTC specifier */
541601
if (*substr == 'Z') {
542-
if (compare_format(&format, &format_len, "%z", 2, exact)) {
602+
comparison = compare_format(&format, &format_len, "%z", 2, format_requirement);
603+
if (comparison == COMPARISON_ERROR) {
543604
goto parse_error;
605+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
606+
goto finish;
544607
}
545608
/* "Z" should be equivalent to tz offset "+00:00" */
546609
if (out_local != NULL) {
@@ -561,8 +624,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
561624
--sublen;
562625
}
563626
} else if (*substr == '-' || *substr == '+') {
564-
if (compare_format(&format, &format_len, "%z", 2, exact)) {
627+
comparison = compare_format(&format, &format_len, "%z", 2, format_requirement);
628+
if (comparison == COMPARISON_ERROR) {
565629
goto parse_error;
630+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
631+
goto finish;
566632
}
567633
/* Time zone offset */
568634
int offset_neg = 0, offset_hour = 0, offset_minute = 0;
@@ -647,8 +713,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
647713
while (sublen > 0 && isspace(*substr)) {
648714
++substr;
649715
--sublen;
650-
if (compare_format(&format, &format_len, " ", 1, exact)) {
716+
comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
717+
if (comparison == COMPARISON_ERROR) {
651718
goto parse_error;
719+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
720+
goto finish;
652721
}
653722
}
654723

0 commit comments

Comments
 (0)