Skip to content

Commit 2b9acd3

Browse files
committed
Fixed bug #72685
We currently have a large performance problem when implementing lexers working on UTF-8 strings in PHP. This kind of code tends to perform a large number of matches at different offsets on a single string. This is generally fast. However, if /u mode is used, the full string will be UTF-8 validated on each match. This results in quadratic runtime. This patch fixes the issue by adding an IS_STR_VALID_UTF8 flag, which is set once we have determined that the string is valid UTF-8, so that further validation is skipped. A limitation of this approach is that we can't set the flag for interned strings. I think this is not a problem for this use case, which will generally work on dynamic data. If we want to use this flag for other purposes as well (mbstring?) then it might be worthwhile to UTF-8 validate strings during interning. But right now this doesn't seem useful.
1 parent 8c9d8c3 commit 2b9acd3

File tree

6 files changed

+38
-5
lines changed

6 files changed

+38
-5
lines changed

NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ PHP NEWS
6161
. openssl_random_pseudo_bytes() now throws in error conditions.
6262
(Sammy Kaye Powers)
6363

64+
- PCRE:
65+
. Fixed bug #72685 (Repeated UTF-8 validation of same string in UTF-8 mode).
66+
(Nikita)
67+
6468
- PDO_OCI:
6569
. Support Oracle Database tracing attributes ACTION, MODULE,
6670
CLIENT_INFO, and CLIENT_IDENTIFIER. (Cameron Porter)

UPGRADING

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -415,3 +415,8 @@ The following extensions are affected:
415415
which improves performance of this function if it can be statically
416416
resolved. In namespaced code, this may require writing \array_key_exists()
417417
or explicitly importing the function.
418+
419+
- PCRE:
420+
. When preg_match() in UTF-8 mode ("u" modifier) is repeatedly called on the
421+
same string (but possibly different offsets), it will only be checked for
422+
UTF-8 validity once.

Zend/zend_string.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ END_EXTERN_C()
7979
(str) = (zend_string *)do_alloca(ZEND_MM_ALIGNED_SIZE_EX(_ZSTR_STRUCT_SIZE(_len), 8), (use_heap)); \
8080
GC_SET_REFCOUNT(str, 1); \
8181
GC_TYPE_INFO(str) = IS_STRING; \
82-
zend_string_forget_hash_val(str); \
82+
ZSTR_H(str) = 0; \
8383
ZSTR_LEN(str) = _len; \
8484
} while (0)
8585

@@ -101,6 +101,7 @@ static zend_always_inline zend_ulong zend_string_hash_val(zend_string *s)
101101
static zend_always_inline void zend_string_forget_hash_val(zend_string *s)
102102
{
103103
ZSTR_H(s) = 0;
104+
GC_DEL_FLAGS(s, IS_STR_VALID_UTF8);
104105
}
105106

106107
static zend_always_inline uint32_t zend_string_refcount(const zend_string *s)
@@ -133,7 +134,7 @@ static zend_always_inline zend_string *zend_string_alloc(size_t len, int persist
133134

134135
GC_SET_REFCOUNT(ret, 1);
135136
GC_TYPE_INFO(ret) = IS_STRING | ((persistent ? IS_STR_PERSISTENT : 0) << GC_FLAGS_SHIFT);
136-
zend_string_forget_hash_val(ret);
137+
ZSTR_H(ret) = 0;
137138
ZSTR_LEN(ret) = len;
138139
return ret;
139140
}
@@ -144,7 +145,7 @@ static zend_always_inline zend_string *zend_string_safe_alloc(size_t n, size_t m
144145

145146
GC_SET_REFCOUNT(ret, 1);
146147
GC_TYPE_INFO(ret) = IS_STRING | ((persistent ? IS_STR_PERSISTENT : 0) << GC_FLAGS_SHIFT);
147-
zend_string_forget_hash_val(ret);
148+
ZSTR_H(ret) = 0;
148149
ZSTR_LEN(ret) = (n * m) + l;
149150
return ret;
150151
}

Zend/zend_types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,7 @@ static zend_always_inline uint32_t zval_gc_info(uint32_t gc_type_info) {
579579
#define IS_STR_INTERNED GC_IMMUTABLE /* interned string */
580580
#define IS_STR_PERSISTENT GC_PERSISTENT /* allocated using malloc */
581581
#define IS_STR_PERMANENT (1<<8) /* relives request boundary */
582+
#define IS_STR_VALID_UTF8 (1<<9) /* valid UTF-8 according to PCRE */
582583

583584
/* array flags */
584585
#define IS_ARRAY_IMMUTABLE GC_IMMUTABLE

ext/pcre/php_pcre.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1104,7 +1104,8 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
11041104
}
11051105
}
11061106

1107-
options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
1107+
options = (pce->compile_options & PCRE2_UTF) && !(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)
1108+
? 0 : PCRE2_NO_UTF_CHECK;
11081109

11091110
/* Execute the regular expression. */
11101111
#ifdef HAVE_PCRE_JIT_SUPPORT
@@ -1403,8 +1404,12 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
14031404
efree(subpat_names);
14041405
}
14051406

1406-
/* Did we encounter an error? */
14071407
if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
1408+
/* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */
1409+
if ((pce->compile_options & PCRE2_UTF) && !ZSTR_IS_INTERNED(subject_str)) {
1410+
GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8);
1411+
}
1412+
14081413
RETVAL_LONG(matched);
14091414
} else {
14101415
RETVAL_FALSE;

ext/pcre/tests/bug72685.phpt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
--TEST--
2+
Bug #72685: Same string is UTF-8 validated repeatedly
3+
--FILE--
4+
<?php
5+
6+
$input_size = 64 * 1024;
7+
$str = str_repeat('a', $input_size);
8+
9+
$start = microtime(true);
10+
$pos = 0;
11+
while (preg_match('/\G\w/u', $str, $m, 0, $pos)) ++$pos;
12+
$end = microtime(true);
13+
var_dump(($end - $start) < 0.5); // large margin, more like 0.05 in debug build
14+
15+
?>
16+
--EXPECT--
17+
bool(true)

0 commit comments

Comments
 (0)