Skip to content

Commit 8f9322b

Browse files
committed
UTF-8 validate strings before interning
1 parent bdd782e commit 8f9322b

File tree

6 files changed

+62
-15
lines changed

6 files changed

+62
-15
lines changed

Zend/zend_string.c

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,10 @@ static zend_always_inline zend_string *zend_add_interned_string(zend_string *str
180180
GC_SET_REFCOUNT(str, 1);
181181
GC_ADD_FLAGS(str, IS_STR_INTERNED | flags);
182182

183+
if (!ZSTR_IS_VALID_UTF8(str) && zend_string_validate_utf8(str)) {
184+
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
185+
}
186+
183187
ZVAL_INTERNED_STR(&val, str);
184188

185189
zend_hash_add_new(interned_strings, str, &val);
@@ -493,3 +497,45 @@ ZEND_API zend_string *zend_string_concat3(
493497

494498
return res;
495499
}
500+
501+
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
502+
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
503+
// https://stackoverflow.com/a/22135005/1320374
504+
505+
// Distinguished states of the UTF-8 validation automaton.
// UTF8_ACCEPT is both the start state and the state reached after a complete,
// well-formed sequence; UTF8_REJECT is the error state that stops validation.
// The numeric values are used directly as row numbers when indexing the
// transition part of utf8d, so they must not be changed independently of it.
enum {
506+
UTF8_ACCEPT = 0,
507+
UTF8_REJECT = 1,
508+
};
509+
510+
// Lookup table driving the UTF-8 validation automaton. It has two parts:
//
//  - Entries 0..255 map an input byte to its character class (rows tagged
//    "00..1f" through "f0..ff" below).
//  - Entries from 256 onwards are the transition table, 16 entries per state
//    (rows tagged "s0" through "s8"). The next state is read from
//    utf8d[256 + state * 16 + class].
//
// A transition value of 0 is UTF8_ACCEPT and 1 is UTF8_REJECT.
static const uint8_t utf8d[] = {
511+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
512+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
513+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
514+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
515+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
516+
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
517+
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
518+
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
519+
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
520+
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
521+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
522+
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
523+
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
524+
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
525+
};
526+
527+
/* Checks whether the bytes of `string` form well-formed UTF-8.
 *
 * Each of the ZSTR_LEN(string) bytes is classified through the first 256
 * entries of utf8d and then fed to the transition table stored after them.
 * Scanning stops as soon as the automaton enters UTF8_REJECT.
 *
 * Returns true only when the automaton is in UTF8_ACCEPT after the last byte.
 * An invalid byte and a multi-byte sequence cut off by the end of the string
 * both return false; an empty string returns true.
 *
 * The string is only read: no flag is set on it here. */
ZEND_API bool zend_string_validate_utf8(zend_string *string) {
	const uint8_t *cursor = (const uint8_t *) ZSTR_VAL(string);
	const uint8_t *const end = cursor + ZSTR_LEN(string);
	uint32_t state = UTF8_ACCEPT;

	while (cursor < end) {
		/* Byte -> character class, then (state, class) -> next state. */
		const uint32_t byte_class = utf8d[*cursor++];
		state = utf8d[256 + state * 16 + byte_class];

		if (state == UTF8_REJECT) {
			/* The error state is never left, so the answer is already known. */
			return false;
		}
	}

	/* Any other non-accepting state here means a truncated sequence. */
	return state == UTF8_ACCEPT;
}

Zend/zend_string.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ ZEND_API extern zend_string *zend_empty_string;
7878
ZEND_API extern zend_string *zend_one_char_string[256];
7979
ZEND_API extern zend_string **zend_known_strings;
8080

81+
/* Returns true if the whole content of `string` (all ZSTR_LEN bytes) is
 * well-formed UTF-8, false otherwise. The function only inspects the bytes;
 * it does not set IS_STR_VALID_UTF8 itself, so callers that want to cache a
 * positive result must add that flag on their own. */
ZEND_API bool zend_string_validate_utf8(zend_string *string);
82+
8183
END_EXTERN_C()
8284

8385
/* Shortcuts */

ext/mbstring/mbstring.c

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1804,7 +1804,7 @@ static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
18041804
unsigned int char_len = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
18051805
if (char_len) {
18061806
return ZSTR_LEN(string) / char_len;
1807-
} else if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && GC_FLAGS(string) & IS_STR_VALID_UTF8) {
1807+
} else if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && ZSTR_IS_VALID_UTF8(string)) {
18081808
return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
18091809
}
18101810

@@ -2254,7 +2254,7 @@ PHP_FUNCTION(mb_substr_count)
22542254
if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
22552255
/* No need to do any conversion if haystack/needle are already known-valid UTF-8
22562256
* (If they are not valid, then not passing them through conversion filters could affect output) */
2257-
if (GC_FLAGS(haystack) & IS_STR_VALID_UTF8) {
2257+
if (ZSTR_IS_VALID_UTF8(haystack)) {
22582258
haystack_u8 = haystack;
22592259
} else {
22602260
unsigned int num_errors = 0;
@@ -2264,7 +2264,7 @@ PHP_FUNCTION(mb_substr_count)
22642264
}
22652265
}
22662266

2267-
if (GC_FLAGS(needle) & IS_STR_VALID_UTF8) {
2267+
if (ZSTR_IS_VALID_UTF8(needle)) {
22682268
needle_u8 = needle;
22692269
} else {
22702270
unsigned int num_errors = 0;
@@ -3152,7 +3152,7 @@ PHP_FUNCTION(mb_detect_encoding)
31523152
strict = MBSTRG(strict_detection);
31533153
}
31543154

3155-
if (size == 1 && *elist == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) {
3155+
if (size == 1 && *elist == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
31563156
ret = &mbfl_encoding_utf8;
31573157
} else {
31583158
ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict);
@@ -5172,11 +5172,13 @@ static bool mb_fast_check_utf8_avx2(zend_string *str)
51725172
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
51735173
{
51745174
if (encoding == &mbfl_encoding_utf8) {
5175-
if (GC_FLAGS(str) & IS_STR_VALID_UTF8) {
5175+
if (ZSTR_IS_VALID_UTF8(str)) {
51765176
return true;
5177+
} else if (ZSTR_IS_INTERNED(str)) {
5178+
return false;
51775179
}
51785180
bool result = mb_fast_check_utf8(str);
5179-
if (result && !ZSTR_IS_INTERNED(str)) {
5181+
if (result) {
51805182
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
51815183
}
51825184
return result;
@@ -5439,7 +5441,7 @@ PHP_FUNCTION(mb_scrub)
54395441
RETURN_THROWS();
54405442
}
54415443

5442-
if (enc == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) {
5444+
if (enc == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
54435445
/* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
54445446
RETURN_STR_COPY(str);
54455447
}

ext/opcache/ZendAccelerator.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,9 @@ zend_string* ZEND_FASTCALL accel_new_interned_string(zend_string *str)
550550
*hash_slot = STRTAB_STR_TO_POS(&ZCSG(interned_strings), s);
551551
GC_SET_REFCOUNT(s, 2);
552552
GC_TYPE_INFO(s) = GC_STRING | ((IS_STR_INTERNED | IS_STR_PERMANENT) << GC_FLAGS_SHIFT)| (ZSTR_IS_VALID_UTF8(str) ? IS_STR_VALID_UTF8 : 0);
553+
if (!ZSTR_IS_VALID_UTF8(s) && zend_string_validate_utf8(str)) {
554+
GC_ADD_FLAGS(s, IS_STR_VALID_UTF8);
555+
}
553556
ZSTR_H(s) = h;
554557
ZSTR_LEN(s) = ZSTR_LEN(str);
555558
memcpy(ZSTR_VAL(s), ZSTR_VAL(str), ZSTR_LEN(s) + 1);

ext/pcre/php_pcre.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1125,7 +1125,7 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ *
11251125

11261126
static zend_always_inline bool is_known_valid_utf8(
11271127
zend_string *subject_str, PCRE2_SIZE start_offset) {
1128-
if (!(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)) {
1128+
if (!ZSTR_IS_VALID_UTF8(subject_str)) {
11291129
/* We don't know whether the string is valid UTF-8 or not. */
11301130
return 0;
11311131
}

ext/zend_test/tests/strings_marked_as_utf8.phpt

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,6 @@ $s = "f" . "o";
4747
var_dump($s);
4848
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
4949

50-
// The "foo" string matches with a "Foo" class which is registered by the zend_test extension.
51-
// That class name does not have the "valid UTF-8" flag because class names in general
52-
// don't have to be UTF-8. As the "foo" string here goes through the interning logic,
53-
// the string gets replaced by the "foo" string from the class, which does
54-
// not have the "valid UTF-8" flag. We therefore choose a different test case: "fxo".
55-
// The previous "foo" test case works because it is not interned.
5650
echo "Multiple concatenation known valid UTF-8 in assignment:\n";
5751
$s = "f" . "o" . "o";
5852
var_dump($s);
@@ -167,7 +161,7 @@ string(2) "fo"
167161
bool(true)
168162
Multiple concatenation known valid UTF-8 in assignment:
169163
string(3) "foo"
170-
bool(false)
164+
bool(true)
171165
string(3) "fxo"
172166
bool(true)
173167
Concatenation known valid UTF-8 string with empty string in variables:

0 commit comments

Comments
 (0)