php · iluuu1994 · Mar 17, 2023 · bukka · Mar 25, 2023 · mvorisek
@@ -180,6 +180,10 @@ static zend_always_inline zend_string *zend_add_interned_string(zend_string *str
 	GC_SET_REFCOUNT(str, 1);
 	GC_ADD_FLAGS(str, IS_STR_INTERNED | flags);
 
+	if (!ZSTR_IS_VALID_UTF8(str) && zend_string_validate_utf8(str)) {
+		GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
+	}
+
 	ZVAL_INTERNED_STR(&val, str);
 
 	zend_hash_add_new(interned_strings, str, &val);
@@ -493,3 +497,45 @@ ZEND_API zend_string *zend_string_concat3(
 
 	return res;
 }
+
+// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+// https://stackoverflow.com/a/22135005/1320374
+
+/* Decoder states that zend_string_validate_utf8() compares against. */
+enum {
+    UTF8_ACCEPT = 0, /* starting state; also the state required at the end of a valid string */
+    UTF8_REJECT = 1, /* the validation loop stops as soon as this state is reached */
+};
+
+/*
+ * Lookup table driving zend_string_validate_utf8(). It is read in two parts:
+ *  - entries 0..255 map an input byte to a character class
+ *    (rows marked "00..1f" through "f0..ff");
+ *  - entries from index 256 on hold the state transitions, looked up as
+ *    utf8d[256 + state * 16 + class] (rows marked "s0..s0" through "s7..s8").
+ */
+static const uint8_t utf8d[] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+/*
+ * Checks whether the bytes of a string form a valid UTF-8 sequence.
+ *
+ * Every byte is first mapped to a character class through the utf8d table;
+ * the class and the current decoder state then select the next state.
+ *
+ * string: the string to inspect; it is only read, never modified.
+ *
+ * Returns true when all bytes were consumed and the decoder ended in
+ * UTF8_ACCEPT (so an empty string is reported as valid), false otherwise.
+ */
+ZEND_API bool zend_string_validate_utf8(const zend_string *string) {
+	const char *str = ZSTR_VAL(string);
+	size_t len = ZSTR_LEN(string);
+	uint32_t state = UTF8_ACCEPT;
+
+	for (size_t i = 0; i < len; i++) {
+		/* Cast through uint8_t so bytes >= 0x80 never produce a negative index. */
+		uint32_t type = utf8d[(uint8_t)str[i]];
+		state = utf8d[256 + state * 16 + type];
+
+		/* Every transition out of the reject state leads back to it, so stop early. */
+		if (state == UTF8_REJECT) {
+			break;
+		}
+	}
+
+	/* Any state other than ACCEPT here means a rejected or unfinished sequence. */
+	return state == UTF8_ACCEPT;
+}
@@ -78,6 +78,8 @@ ZEND_API extern zend_string  *zend_empty_string;
 ZEND_API extern zend_string  *zend_one_char_string[256];
 ZEND_API extern zend_string **zend_known_strings;
 
+/* Returns true if the bytes of `string` are valid UTF-8; the string is only read. */
+ZEND_API bool zend_string_validate_utf8(const zend_string *string);
+
 END_EXTERN_C()
 
 /* Shortcuts */

@@ -1804,7 +1804,7 @@ static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
 	unsigned int char_len = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
 	if (char_len) {
 		return ZSTR_LEN(string) / char_len;
-	} else if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && GC_FLAGS(string) & IS_STR_VALID_UTF8) {
+	} else if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && ZSTR_IS_VALID_UTF8(string)) {
 		return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
 	}
 
@@ -2254,7 +2254,7 @@ PHP_FUNCTION(mb_substr_count)
 	if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
 		/* No need to do any conversion if haystack/needle are already known-valid UTF-8
 		 * (If they are not valid, then not passing them through conversion filters could affect output) */
-		if (GC_FLAGS(haystack) & IS_STR_VALID_UTF8) {
+		if (ZSTR_IS_VALID_UTF8(haystack)) {
 			haystack_u8 = haystack;
 		} else {
 			unsigned int num_errors = 0;
@@ -2264,7 +2264,7 @@ PHP_FUNCTION(mb_substr_count)
 			}
 		}
 
-		if (GC_FLAGS(needle) & IS_STR_VALID_UTF8) {
+		if (ZSTR_IS_VALID_UTF8(needle)) {
 			needle_u8 = needle;
 		} else {
 			unsigned int num_errors = 0;
@@ -3152,7 +3152,7 @@ PHP_FUNCTION(mb_detect_encoding)
 		strict = MBSTRG(strict_detection);
 	}
 
-	if (size == 1 && *elist == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) {
+	if (size == 1 && *elist == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
 		ret = &mbfl_encoding_utf8;
 	} else {
 		ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict);
@@ -5172,11 +5172,13 @@ static bool mb_fast_check_utf8_avx2(zend_string *str)
 static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
 {
 	if (encoding == &mbfl_encoding_utf8) {
-		if (GC_FLAGS(str) & IS_STR_VALID_UTF8) {
+		if (ZSTR_IS_VALID_UTF8(str)) {
 			return true;
+		} else if (ZSTR_IS_INTERNED(str)) {
+			return false;
 		}
 		bool result = mb_fast_check_utf8(str);
-		if (result && !ZSTR_IS_INTERNED(str)) {
+		if (result) {
 			GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
 		}
 		return result;
@@ -5439,7 +5441,7 @@ PHP_FUNCTION(mb_scrub)
 		RETURN_THROWS();
 	}
 
-	if (enc == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) {
+	if (enc == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
 		/* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
 		RETURN_STR_COPY(str);
 	}

@@ -550,6 +550,9 @@ zend_string* ZEND_FASTCALL accel_new_interned_string(zend_string *str)
 	*hash_slot = STRTAB_STR_TO_POS(&ZCSG(interned_strings), s);
 	GC_SET_REFCOUNT(s, 2);
 	GC_TYPE_INFO(s) = GC_STRING | ((IS_STR_INTERNED | IS_STR_PERMANENT) << GC_FLAGS_SHIFT)| (ZSTR_IS_VALID_UTF8(str) ? IS_STR_VALID_UTF8 : 0);
+	if (!ZSTR_IS_VALID_UTF8(s) && zend_string_validate_utf8(str)) {
+		GC_ADD_FLAGS(s, IS_STR_VALID_UTF8);
+	}
 	ZSTR_H(s) = h;
 	ZSTR_LEN(s) = ZSTR_LEN(str);
 	memcpy(ZSTR_VAL(s), ZSTR_VAL(str), ZSTR_LEN(s) + 1);

diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c
@@ -1125,7 +1125,7 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ *
 
 static zend_always_inline bool is_known_valid_utf8(
 		zend_string *subject_str, PCRE2_SIZE start_offset) {
-	if (!(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)) {
+	if (!ZSTR_IS_VALID_UTF8(subject_str)) {
 		/* We don't know whether the string is valid UTF-8 or not. */
 		return 0;
 	}

diff --git a/ext/zend_test/tests/strings_marked_as_utf8.phpt b/ext/zend_test/tests/strings_marked_as_utf8.phpt
@@ -47,12 +47,6 @@ $s = "f" . "o";
 var_dump($s);
 var_dump(zend_test_is_string_marked_as_valid_utf8($s));
 
-// The "foo" string matches with a "Foo" class which is registered by the zend_test extension.
-// That class name does not have the "valid UTF-8" flag because class names in general
-// don't have to be UTF-8. As the "foo" string here goes through the interning logic,
-// the string gets replaced by the "foo" string from the class, which does
-// not have the "valid UTF-8" flag. We therefore choose a different test case: "fxo".
-// The previous "foo" test case works because it is not interned.
 echo "Multiple concatenation known valid UTF-8 in assignment:\n";
 $s = "f" . "o" . "o";
 var_dump($s);
@@ -167,7 +161,7 @@ string(2) "fo"
 bool(true)
 Multiple concatenation known valid UTF-8 in assignment:
 string(3) "foo"
-bool(false)
+bool(true)
 string(3) "fxo"
 bool(true)
 Concatenation known valid UTF-8 string with empty string in variables: