Skip to content

UTF-8 validate strings before interning #10870

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions Zend/zend_string.c
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,10 @@ static zend_always_inline zend_string *zend_add_interned_string(zend_string *str
GC_SET_REFCOUNT(str, 1);
GC_ADD_FLAGS(str, IS_STR_INTERNED | flags);

if (!ZSTR_IS_VALID_UTF8(str) && zend_string_validate_utf8(str)) {
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
}

ZVAL_INTERNED_STR(&val, str);

zend_hash_add_new(interned_strings, str, &val);
Expand Down Expand Up @@ -493,3 +497,45 @@ ZEND_API zend_string *zend_string_concat3(

return res;
}

// Copyright (c) 2008-2009 Bjoern Hoehrmann <[email protected]>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
// https://stackoverflow.com/a/22135005/1320374
Comment on lines +501 to +503
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is under BSD licence so it should be pointed here as well or ideally new header created so it doesn't mix licenses in the single file. I have been actually using this in jsond for some time and I separated the headers for that.


enum {
UTF8_ACCEPT = 0,
UTF8_REJECT = 1,
};

static const uint8_t utf8d[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

ZEND_API bool zend_string_validate_utf8(zend_string *string) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't seem to modify the pointer

Suggested change
ZEND_API bool zend_string_validate_utf8(zend_string *string) {
ZEND_API bool zend_string_validate_utf8(const zend_string *string) {

char *str = ZSTR_VAL(string);
size_t len = ZSTR_LEN(string);
uint32_t state = UTF8_ACCEPT;

for (size_t i = 0; i < len; i++) {
uint32_t type = utf8d[(uint8_t)str[i]];
state = utf8d[256 + state * 16 + type];

if (state == UTF8_REJECT)
break;
Comment on lines +536 to +537
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CS Nit:

Suggested change
if (state == UTF8_REJECT)
break;
if (state == UTF8_REJECT) {
break;
}

}

return state == UTF8_ACCEPT;
}
2 changes: 2 additions & 0 deletions Zend/zend_string.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ ZEND_API extern zend_string *zend_empty_string;
ZEND_API extern zend_string *zend_one_char_string[256];
ZEND_API extern zend_string **zend_known_strings;

ZEND_API bool zend_string_validate_utf8(zend_string *string);

END_EXTERN_C()

/* Shortcuts */
Expand Down
16 changes: 9 additions & 7 deletions ext/mbstring/mbstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -1804,7 +1804,7 @@ static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
unsigned int char_len = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
if (char_len) {
return ZSTR_LEN(string) / char_len;
} else if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && GC_FLAGS(string) & IS_STR_VALID_UTF8) {
} else if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && ZSTR_IS_VALID_UTF8(string)) {
return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
}

Expand Down Expand Up @@ -2254,7 +2254,7 @@ PHP_FUNCTION(mb_substr_count)
if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
/* No need to do any conversion if haystack/needle are already known-valid UTF-8
* (If they are not valid, then not passing them through conversion filters could affect output) */
if (GC_FLAGS(haystack) & IS_STR_VALID_UTF8) {
if (ZSTR_IS_VALID_UTF8(haystack)) {
haystack_u8 = haystack;
} else {
unsigned int num_errors = 0;
Expand All @@ -2264,7 +2264,7 @@ PHP_FUNCTION(mb_substr_count)
}
}

if (GC_FLAGS(needle) & IS_STR_VALID_UTF8) {
if (ZSTR_IS_VALID_UTF8(needle)) {
needle_u8 = needle;
} else {
unsigned int num_errors = 0;
Expand Down Expand Up @@ -3152,7 +3152,7 @@ PHP_FUNCTION(mb_detect_encoding)
strict = MBSTRG(strict_detection);
}

if (size == 1 && *elist == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) {
if (size == 1 && *elist == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
ret = &mbfl_encoding_utf8;
} else {
ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict);
Expand Down Expand Up @@ -5172,11 +5172,13 @@ static bool mb_fast_check_utf8_avx2(zend_string *str)
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
{
if (encoding == &mbfl_encoding_utf8) {
if (GC_FLAGS(str) & IS_STR_VALID_UTF8) {
if (ZSTR_IS_VALID_UTF8(str)) {
return true;
} else if (ZSTR_IS_INTERNED(str)) {
return false;
}
bool result = mb_fast_check_utf8(str);
if (result && !ZSTR_IS_INTERNED(str)) {
if (result) {
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
}
return result;
Expand Down Expand Up @@ -5439,7 +5441,7 @@ PHP_FUNCTION(mb_scrub)
RETURN_THROWS();
}

if (enc == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) {
if (enc == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
/* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
RETURN_STR_COPY(str);
}
Expand Down
3 changes: 3 additions & 0 deletions ext/opcache/ZendAccelerator.c
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,9 @@ zend_string* ZEND_FASTCALL accel_new_interned_string(zend_string *str)
*hash_slot = STRTAB_STR_TO_POS(&ZCSG(interned_strings), s);
GC_SET_REFCOUNT(s, 2);
GC_TYPE_INFO(s) = GC_STRING | ((IS_STR_INTERNED | IS_STR_PERMANENT) << GC_FLAGS_SHIFT)| (ZSTR_IS_VALID_UTF8(str) ? IS_STR_VALID_UTF8 : 0);
if (!ZSTR_IS_VALID_UTF8(s) && zend_string_validate_utf8(str)) {
GC_ADD_FLAGS(s, IS_STR_VALID_UTF8);
}
ZSTR_H(s) = h;
ZSTR_LEN(s) = ZSTR_LEN(str);
memcpy(ZSTR_VAL(s), ZSTR_VAL(str), ZSTR_LEN(s) + 1);
Expand Down
2 changes: 1 addition & 1 deletion ext/pcre/php_pcre.c
Original file line number Diff line number Diff line change
Expand Up @@ -1125,7 +1125,7 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ *

static zend_always_inline bool is_known_valid_utf8(
zend_string *subject_str, PCRE2_SIZE start_offset) {
if (!(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)) {
if (!ZSTR_IS_VALID_UTF8(subject_str)) {
/* We don't know whether the string is valid UTF-8 or not. */
return 0;
}
Expand Down
8 changes: 1 addition & 7 deletions ext/zend_test/tests/strings_marked_as_utf8.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,6 @@ $s = "f" . "o";
var_dump($s);
var_dump(zend_test_is_string_marked_as_valid_utf8($s));

// The "foo" string matches with a "Foo" class which is registered by the zend_test extension.
// That class name does not have the "valid UTF-8" flag because class names in general
// don't have to be UTF-8. As the "foo" string here goes through the interning logic,
// the string gets replaced by the "foo" string from the class, which does
// not have the "valid UTF-8" flag. We therefore choose a different test case: "fxo".
// The previous "foo" test case works because it is not interned.
echo "Multiple concatenation known valid UTF-8 in assignment:\n";
$s = "f" . "o" . "o";
var_dump($s);
Expand Down Expand Up @@ -167,7 +161,7 @@ string(2) "fo"
bool(true)
Multiple concatenation known valid UTF-8 in assignment:
string(3) "foo"
bool(false)
bool(true)
string(3) "fxo"
bool(true)
Concatenation known valid UTF-8 string with empty string in variables:
Expand Down