Skip to content

Commit 2f52956

Browse files
committed
Optimize stripos/stristr
Closes GH-7847. Closes GH-7852. Previously, stripos/stristr would lowercase both the haystack and the needle in order to reuse strpos. The approach in this PR is similar to strpos: memchr is highly optimized, so we use it to search for the first character of the needle in the haystack. If we find it, we compare the remaining characters of the needle manually. The new implementation seems to perform about half as well as strpos (as two memchr calls are necessary to find the next candidate).
1 parent 70f712c commit 2f52956

File tree

9 files changed

+79
-46
lines changed

9 files changed

+79
-46
lines changed

NEWS

+2
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ PHP NEWS
2020
. net_get_interfaces() also reports wireless network interfaces on Windows.
2121
(Yurun)
2222
. Finished AVIF support in getimagesize(). (Yannis Guyon)
23+
. Fixed bug GH-7847 (stripos with large haystack has bad performance).
24+
(ilutov)
2325

2426
- Zip:
2527
. add ZipArchive::clearError() method

UPGRADING.INTERNALS

+3
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ PHP 8.2 INTERNALS UPGRADE NOTES
1717
zend_binary_str(n)casecmp() as one would expect. Call the appropriate
1818
wrapped function directly instead.
1919
* Removed the (ZEND_)WRONG_PARAM_COUNT_WITH_RETVAL() macros.
20+
* php_stristr() no longer lowercases the haystack and needle as a side effect.
21+
Call zend_str_tolower() yourself if necessary. You no longer need to copy
22+
the haystack and needle before passing them to php_stristr().
2023

2124
========================
2225
2. Build system changes

Zend/zend_operators.h

+61
Original file line numberDiff line numberDiff line change
@@ -919,6 +919,67 @@ static zend_always_inline void zend_unwrap_reference(zval *op) /* {{{ */
919919
}
920920
/* }}} */
921921

922+
/* Compare the first `num` bytes of `ptr1` and `ptr2`, folding each byte with
 * zend_tolower_ascii() before comparing. Returns true when all `num` byte
 * pairs are equal after folding (trivially true for num == 0), false on the
 * first mismatch. */
static zend_always_inline bool zend_strnieq(const char *ptr1, const char *ptr2, size_t num)
{
	for (size_t i = 0; i < num; i++) {
		const char lhs = zend_tolower_ascii(ptr1[i]);
		const char rhs = zend_tolower_ascii(ptr2[i]);
		if (lhs != rhs) {
			return false;
		}
	}
	return true;
}
932+
933+
/* Case-insensitive counterpart of zend_memnstr(): find the first occurrence of
 * `needle` (needle_len bytes) inside the byte range [haystack, end).
 *
 * Case folding is done with zend_tolower_ascii()/zend_toupper_ascii(); neither
 * input buffer is modified.
 *
 * Returns a pointer to the first match inside the haystack, `haystack` itself
 * when needle_len == 0, or NULL when there is no match (including when the
 * needle is longer than the haystack).
 *
 * Strategy: memchr() is used to locate candidates for the first needle byte,
 * once for its lowercase and once for its uppercase form; the earlier of the
 * two hits is the next candidate. For each candidate the last needle byte is
 * checked first as a cheap filter, then the bytes in between are compared
 * with zend_strnieq(). */
static zend_always_inline const char *
zend_memnistr(const char *haystack, const char *needle, size_t needle_len, const char *end)
{
	ZEND_ASSERT(end >= haystack);

	if (UNEXPECTED(needle_len == 0)) {
		return haystack;
	}

	/* end >= haystack is asserted above, so the cast to size_t is safe and
	 * avoids a signed/unsigned comparison. */
	if (UNEXPECTED(needle_len > (size_t)(end - haystack))) {
		return NULL;
	}

	const char first_lower = zend_tolower_ascii(*needle);
	const char first_upper = zend_toupper_ascii(*needle);
	const char *p_lower = (const char *)memchr(haystack, first_lower, end - haystack);
	const char *p_upper = NULL;
	if (first_lower != first_upper) {
		/* If the needle length is 1 we don't need to look beyond p_lower as it
		 * is a guaranteed match: only an uppercase hit located *before*
		 * p_lower can win. The scan starts at haystack, so the length to
		 * search is the distance from haystack to p_lower, not from p_lower
		 * to end (which would skip uppercase hits preceding p_lower). */
		size_t upper_search_length = (needle_len == 1 && p_lower != NULL)
			? (size_t)(p_lower - haystack)
			: (size_t)(end - haystack);
		p_upper = (const char *)memchr(haystack, first_upper, upper_search_length);
	}
	/* Pick whichever hit comes first; either pointer may be NULL. */
	const char *p = !p_upper || (p_lower && p_lower < p_upper) ? p_lower : p_upper;

	if (needle_len == 1) {
		return p;
	}

	const char needle_end_lower = zend_tolower_ascii(needle[needle_len - 1]);
	const char needle_end_upper = zend_toupper_ascii(needle[needle_len - 1]);
	/* From here on `end` is the last position at which a full needle still
	 * fits into the haystack. */
	end -= needle_len;

	while (p && p <= end) {
		/* The first byte matches by construction; test the last byte before
		 * paying for the comparison of the needle_len - 2 bytes in between. */
		if (needle_end_lower == p[needle_len - 1] || needle_end_upper == p[needle_len - 1]) {
			if (zend_strnieq(needle + 1, p + 1, needle_len - 2)) {
				return p;
			}
		}
		/* Advance only the cursor(s) that produced this candidate. The search
		 * length `end - cursor` covers cursor + 1 through `end` inclusive. */
		if (p_lower == p) {
			p_lower = (const char *)memchr(p_lower + 1, first_lower, end - p_lower);
		}
		if (p_upper == p) {
			p_upper = (const char *)memchr(p_upper + 1, first_upper, end - p_upper);
		}
		p = !p_upper || (p_lower && p_lower < p_upper) ? p_lower : p_upper;
	}

	return NULL;
}
982+
922983

923984
END_EXTERN_C()
924985

ext/libxml/libxml.c

+2-3
Original file line numberDiff line numberDiff line change
@@ -378,9 +378,9 @@ php_libxml_input_buffer_create_filename(const char *URI, xmlCharEncoding enc)
378378
const char buf[] = "Content-Type:";
379379
if (Z_TYPE_P(header) == IS_STRING &&
380380
!zend_binary_strncasecmp(Z_STRVAL_P(header), Z_STRLEN_P(header), buf, sizeof(buf)-1, sizeof(buf)-1)) {
381-
char *needle = estrdup("charset=");
381+
char needle[] = "charset=";
382382
char *haystack = estrndup(Z_STRVAL_P(header), Z_STRLEN_P(header));
383-
char *encoding = php_stristr(haystack, needle, Z_STRLEN_P(header), sizeof("charset=")-1);
383+
char *encoding = php_stristr(haystack, needle, Z_STRLEN_P(header), strlen(needle));
384384

385385
if (encoding) {
386386
char *end;
@@ -408,7 +408,6 @@ php_libxml_input_buffer_create_filename(const char *URI, xmlCharEncoding enc)
408408
}
409409
}
410410
efree(haystack);
411-
efree(needle);
412411
break; /* found content-type */
413412
}
414413
} ZEND_HASH_FOREACH_END();

ext/phar/phar.c

+1-6
Original file line numberDiff line numberDiff line change
@@ -2531,7 +2531,6 @@ int phar_flush(phar_archive_data *phar, char *user_stub, zend_long len, int conv
25312531
{
25322532
char halt_stub[] = "__HALT_COMPILER();";
25332533
zend_string *newstub;
2534-
char *tmp;
25352534
phar_entry_info *entry, *newentry;
25362535
size_t halt_offset;
25372536
int restore_alias_len, global_flags = 0, closeoldfile;
@@ -2635,9 +2634,7 @@ int phar_flush(phar_archive_data *phar, char *user_stub, zend_long len, int conv
26352634
} else {
26362635
free_user_stub = 0;
26372636
}
2638-
tmp = estrndup(user_stub, len);
2639-
if ((pos = php_stristr(tmp, halt_stub, len, sizeof(halt_stub) - 1)) == NULL) {
2640-
efree(tmp);
2637+
if ((pos = php_stristr(user_stub, halt_stub, len, sizeof(halt_stub) - 1)) == NULL) {
26412638
if (closeoldfile) {
26422639
php_stream_close(oldfile);
26432640
}
@@ -2650,8 +2647,6 @@ int phar_flush(phar_archive_data *phar, char *user_stub, zend_long len, int conv
26502647
}
26512648
return EOF;
26522649
}
2653-
pos = user_stub + (pos - tmp);
2654-
efree(tmp);
26552650
len = pos - user_stub + 18;
26562651
if ((size_t)len != php_stream_write(newfile, user_stub, len)
26572652
|| 5 != php_stream_write(newfile, " ?>\r\n", 5)) {

ext/phar/tar.c

+2-6
Original file line numberDiff line numberDiff line change
@@ -967,7 +967,7 @@ int phar_tar_flush(phar_archive_data *phar, char *user_stub, zend_long len, int
967967
int closeoldfile, free_user_stub;
968968
size_t signature_length;
969969
struct _phar_pass_tar_info pass;
970-
char *buf, *signature, *tmp, sigbuf[8];
970+
char *buf, *signature, sigbuf[8];
971971
char halt_stub[] = "__HALT_COMPILER();";
972972

973973
entry.flags = PHAR_ENT_PERM_DEF_FILE;
@@ -1063,9 +1063,7 @@ int phar_tar_flush(phar_archive_data *phar, char *user_stub, zend_long len, int
10631063
free_user_stub = 0;
10641064
}
10651065

1066-
tmp = estrndup(user_stub, len);
1067-
if ((pos = php_stristr(tmp, halt_stub, len, sizeof(halt_stub) - 1)) == NULL) {
1068-
efree(tmp);
1066+
if ((pos = php_stristr(user_stub, halt_stub, len, sizeof(halt_stub) - 1)) == NULL) {
10691067
if (error) {
10701068
spprintf(error, 0, "illegal stub for tar-based phar \"%s\"", phar->fname);
10711069
}
@@ -1074,8 +1072,6 @@ int phar_tar_flush(phar_archive_data *phar, char *user_stub, zend_long len, int
10741072
}
10751073
return EOF;
10761074
}
1077-
pos = user_stub + (pos - tmp);
1078-
efree(tmp);
10791075

10801076
len = pos - user_stub + 18;
10811077
entry.fp = php_stream_fopen_tmpfile();

ext/phar/zip.c

+1-6
Original file line numberDiff line numberDiff line change
@@ -1202,7 +1202,6 @@ int phar_zip_flush(phar_archive_data *phar, char *user_stub, zend_long len, int
12021202
char *pos;
12031203
static const char newstub[] = "<?php // zip-based phar archive stub file\n__HALT_COMPILER();";
12041204
char halt_stub[] = "__HALT_COMPILER();";
1205-
char *tmp;
12061205

12071206
php_stream *stubfile, *oldfile;
12081207
int free_user_stub, closeoldfile = 0;
@@ -1305,9 +1304,7 @@ int phar_zip_flush(phar_archive_data *phar, char *user_stub, zend_long len, int
13051304
free_user_stub = 0;
13061305
}
13071306

1308-
tmp = estrndup(user_stub, len);
1309-
if ((pos = php_stristr(tmp, halt_stub, len, sizeof(halt_stub) - 1)) == NULL) {
1310-
efree(tmp);
1307+
if ((pos = php_stristr(user_stub, halt_stub, len, sizeof(halt_stub) - 1)) == NULL) {
13111308
if (error) {
13121309
spprintf(error, 0, "illegal stub for zip-based phar \"%s\"", phar->fname);
13131310
}
@@ -1316,8 +1313,6 @@ int phar_zip_flush(phar_archive_data *phar, char *user_stub, zend_long len, int
13161313
}
13171314
return EOF;
13181315
}
1319-
pos = user_stub + (pos - tmp);
1320-
efree(tmp);
13211316

13221317
len = pos - user_stub + 18;
13231318
entry.fp = php_stream_fopen_tmpfile();

ext/standard/string.c

+6-25
Original file line numberDiff line numberDiff line change
@@ -1687,9 +1687,7 @@ PHP_FUNCTION(pathinfo)
16871687
case insensitive strstr */
16881688
PHPAPI char *php_stristr(char *s, char *t, size_t s_len, size_t t_len)
16891689
{
1690-
zend_str_tolower(s, s_len);
1691-
zend_str_tolower(t, t_len);
1692-
return (char*)php_memnstr(s, t, t_len, s + s_len);
1690+
return (char*)php_memnistr(s, t, t_len, s + s_len);
16931691
}
16941692
/* }}} */
16951693

@@ -1735,8 +1733,6 @@ PHP_FUNCTION(stristr)
17351733
zend_string *haystack, *needle;
17361734
const char *found = NULL;
17371735
size_t found_offset;
1738-
char *haystack_dup;
1739-
char *orig_needle;
17401736
bool part = 0;
17411737

17421738
ZEND_PARSE_PARAMETERS_START(2, 3)
@@ -1746,13 +1742,10 @@ PHP_FUNCTION(stristr)
17461742
Z_PARAM_BOOL(part)
17471743
ZEND_PARSE_PARAMETERS_END();
17481744

1749-
haystack_dup = estrndup(ZSTR_VAL(haystack), ZSTR_LEN(haystack));
1750-
orig_needle = estrndup(ZSTR_VAL(needle), ZSTR_LEN(needle));
1751-
found = php_stristr(haystack_dup, orig_needle, ZSTR_LEN(haystack), ZSTR_LEN(needle));
1752-
efree(orig_needle);
1745+
found = php_stristr(ZSTR_VAL(haystack), ZSTR_VAL(needle), ZSTR_LEN(haystack), ZSTR_LEN(needle));
17531746

17541747
if (found) {
1755-
found_offset = found - haystack_dup;
1748+
found_offset = found - ZSTR_VAL(haystack);
17561749
if (part) {
17571750
RETVAL_STRINGL(ZSTR_VAL(haystack), found_offset);
17581751
} else {
@@ -1761,8 +1754,6 @@ PHP_FUNCTION(stristr)
17611754
} else {
17621755
RETVAL_FALSE;
17631756
}
1764-
1765-
efree(haystack_dup);
17661757
}
17671758
/* }}} */
17681759

@@ -1890,7 +1881,6 @@ PHP_FUNCTION(stripos)
18901881
const char *found = NULL;
18911882
zend_string *haystack, *needle;
18921883
zend_long offset = 0;
1893-
zend_string *needle_dup = NULL, *haystack_dup;
18941884

18951885
ZEND_PARSE_PARAMETERS_START(2, 3)
18961886
Z_PARAM_STR(haystack)
@@ -1907,23 +1897,14 @@ PHP_FUNCTION(stripos)
19071897
RETURN_THROWS();
19081898
}
19091899

1910-
if (ZSTR_LEN(needle) > ZSTR_LEN(haystack)) {
1911-
RETURN_FALSE;
1912-
}
1913-
1914-
haystack_dup = zend_string_tolower(haystack);
1915-
needle_dup = zend_string_tolower(needle);
1916-
found = (char*)php_memnstr(ZSTR_VAL(haystack_dup) + offset,
1917-
ZSTR_VAL(needle_dup), ZSTR_LEN(needle_dup), ZSTR_VAL(haystack_dup) + ZSTR_LEN(haystack));
1900+
found = (char*)php_memnistr(ZSTR_VAL(haystack) + offset,
1901+
ZSTR_VAL(needle), ZSTR_LEN(needle), ZSTR_VAL(haystack) + ZSTR_LEN(haystack));
19181902

19191903
if (found) {
1920-
RETVAL_LONG(found - ZSTR_VAL(haystack_dup));
1904+
RETVAL_LONG(found - ZSTR_VAL(haystack));
19211905
} else {
19221906
RETVAL_FALSE;
19231907
}
1924-
1925-
zend_string_release_ex(haystack_dup, 0);
1926-
zend_string_release_ex(needle_dup, 0);
19271908
}
19281909
/* }}} */
19291910

main/php.h

+1
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,7 @@ END_EXTERN_C()
344344
#define phpin zendin
345345

346346
#define php_memnstr zend_memnstr
347+
#define php_memnistr zend_memnistr
347348

348349
/* functions */
349350
BEGIN_EXTERN_C()

0 commit comments

Comments
 (0)