Skip to content

Commit c3d7610

Browse files
committed
Split out a specialized function to decode multibyte UTF-8 sequences
Decoding purely multibyte UTF-8 is common, for example in the case of JSON. Furthermore, we want to avoid the switch on the character set in such hot code. Finally, we also add UNEXPECTED markers to move code to the cold section, which reduces pressure on the µop and instruction caches.
1 parent 8376904 commit c3d7610

File tree

3 files changed

+97
-62
lines changed

3 files changed

+97
-62
lines changed

UPGRADING.INTERNALS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ PHP 8.5 INTERNALS UPGRADE NOTES
6161
is still valid. This is useful when a GC cycle is collected and the
6262
database object can be destroyed prior to destroying the statement.
6363

64+
- ext/standard
65+
. Added `php_next_utf8_char_mb()` to decode the next UTF-8 multibyte
66+
codepoint (i.e. >= 2 bytes).
67+
6468
========================
6569
4. OpCode changes
6670
========================

ext/standard/html.c

Lines changed: 92 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,16 @@
5353
(all) = (all) && !CHARSET_PARTIAL_SUPPORT((charset)) && ((doctype) != ENT_HTML_DOC_XML1); \
5454
} while (0)
5555

56-
#define MB_FAILURE(pos, advance) do { \
56+
#define MB_FAILURE_NO_STATUS(pos, advance) do { \
5757
*cursor = pos + (advance); \
58-
*status = FAILURE; \
5958
return 0; \
6059
} while (0)
6160

61+
#define MB_FAILURE(pos, advance) do { \
62+
*status = FAILURE; \
63+
MB_FAILURE_NO_STATUS(pos, advance); \
64+
} while (0)
65+
6266
#define CHECK_LEN(pos, chars_need) ((str_len - (pos)) >= (chars_need))
6367

6468
/* valid as single byte character or leading byte */
@@ -85,6 +89,87 @@ static char *get_default_charset(void) {
8589
}
8690
/* }}} */
8791

92+
/* Decodes the next UTF-8 multibyte codepoint (i.e. >= 2 bytes).
93+
* Uses `c` as the leading byte.
*
* str:     input buffer; only the bytes following the lead byte
*          (str[*cursor + 1] onwards) are read by this function.
* c:       lead byte of the sequence, asserted to be >= 0x80. The caller
*          visible in this change passes str[*cursor].
* str_len: length of str in bytes; used to avoid reading past the end.
* cursor:  in/out byte offset of the lead byte. On success it is advanced
*          past the decoded sequence (2, 3 or 4 bytes).
*
* Returns the decoded codepoint. On a malformed, truncated, non-shortest,
* surrogate or out-of-range sequence it returns 0 and advances *cursor past
* the bytes reported as illegal (1 to 4). 0 is never a successful result
* because every branch rejects values below its minimum (0x80, 0x800,
* 0x10000). Unlike MB_FAILURE, no status variable is written here; callers
* must treat a 0 return as the failure signal. */
94+
PHPAPI unsigned int php_next_utf8_char_mb(
95+
const unsigned char *str,
96+
unsigned char c,
97+
size_t str_len,
98+
size_t *cursor)
99+
{
100+
size_t pos = *cursor;
101+
unsigned int this_char = 0;
102+
103+
/* We'll follow strategy 2. from section 3.6.1 of UTR #36:
104+
* "In a reported illegal byte sequence, do not include any
105+
* non-initial byte that encodes a valid character or is a leading
106+
* byte for a valid sequence." */
107+
108+
ZEND_ASSERT(c >= 0x80); /* single-byte input is handled by the caller, not here */
109+
110+
if (UNEXPECTED(c < 0xc2)) { /* 0x80..0xc1: in UTF-8 these never start a valid multibyte sequence */
111+
MB_FAILURE_NO_STATUS(pos, 1);
112+
} else if (c < 0xe0) { /* 0xc2..0xdf: lead byte of a 2-byte sequence */
113+
if (UNEXPECTED(!CHECK_LEN(pos, 2))) /* lead byte is the last byte of the input */
114+
MB_FAILURE_NO_STATUS(pos, 1);
115+
116+
if (UNEXPECTED(!utf8_trail(str[pos + 1]))) {
117+
MB_FAILURE_NO_STATUS(pos, utf8_lead(str[pos + 1]) ? 1 : 2); /* skip only the lead if the next byte may begin a character (utf8_lead is defined outside this hunk -- presumably that is its meaning) */
118+
}
119+
this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
120+
if (UNEXPECTED(this_char < 0x80)) { /* non-shortest form; NOTE(review): since c >= 0xc2 here, (c & 0x1f) << 6 is already >= 0x80, so this check looks purely defensive */
121+
MB_FAILURE_NO_STATUS(pos, 2);
122+
}
123+
pos += 2;
124+
} else if (c < 0xf0) { /* 0xe0..0xef: lead byte of a 3-byte sequence */
125+
size_t avail = str_len - pos; /* bytes remaining, counting the lead byte */
126+
127+
if (UNEXPECTED(avail < 3 ||
128+
!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]))) { /* avail is tested first, so the str[] reads never run past str_len */
129+
if (avail < 2 || utf8_lead(str[pos + 1])) /* report 1 byte: nothing follows, or the next byte is left for the next call */
130+
MB_FAILURE_NO_STATUS(pos, 1);
131+
else if (avail < 3 || utf8_lead(str[pos + 2])) /* report 2 bytes: input ends, or the third byte is left for the next call */
132+
MB_FAILURE_NO_STATUS(pos, 2);
133+
else
134+
MB_FAILURE_NO_STATUS(pos, 3);
135+
}
136+
137+
this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
138+
if (UNEXPECTED(this_char < 0x800)) { /* non-shortest form */
139+
MB_FAILURE_NO_STATUS(pos, 3);
140+
} else if (UNEXPECTED(this_char >= 0xd800 && this_char <= 0xdfff)) { /* surrogate */
141+
MB_FAILURE_NO_STATUS(pos, 3);
142+
}
143+
pos += 3;
144+
} else if (c < 0xf5) { /* 0xf0..0xf4: lead byte of a 4-byte sequence */
145+
size_t avail = str_len - pos; /* bytes remaining, counting the lead byte */
146+
147+
if (UNEXPECTED(avail < 4 ||
148+
!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]) ||
149+
!utf8_trail(str[pos + 3]))) { /* avail is tested first, so the str[] reads never run past str_len */
150+
if (avail < 2 || utf8_lead(str[pos + 1])) /* same reporting scheme as the 3-byte case, extended by one byte */
151+
MB_FAILURE_NO_STATUS(pos, 1);
152+
else if (avail < 3 || utf8_lead(str[pos + 2]))
153+
MB_FAILURE_NO_STATUS(pos, 2);
154+
else if (avail < 4 || utf8_lead(str[pos + 3]))
155+
MB_FAILURE_NO_STATUS(pos, 3);
156+
else
157+
MB_FAILURE_NO_STATUS(pos, 4);
158+
}
159+
160+
this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
161+
if (UNEXPECTED(this_char < 0x10000 || this_char > 0x10FFFF)) { /* non-shortest form or outside range */
162+
MB_FAILURE_NO_STATUS(pos, 4);
163+
}
164+
pos += 4;
165+
} else { /* 0xf5..0xff: rejected as a lead byte, reported as a single illegal byte */
166+
MB_FAILURE_NO_STATUS(pos, 1);
167+
}
168+
169+
*cursor = pos; /* success: cursor now points just past the decoded sequence */
170+
return this_char;
171+
}
172+
88173
/* {{{ get_next_char */
89174
static inline unsigned int get_next_char(
90175
enum entity_charset charset,
@@ -105,72 +190,17 @@ static inline unsigned int get_next_char(
105190
switch (charset) {
106191
case cs_utf_8:
107192
{
108-
/* We'll follow strategy 2. from section 3.6.1 of UTR #36:
109-
* "In a reported illegal byte sequence, do not include any
110-
* non-initial byte that encodes a valid character or is a leading
111-
* byte for a valid sequence." */
112193
unsigned char c;
113194
c = str[pos];
114195
if (c < 0x80) {
115196
this_char = c;
116197
pos++;
117-
} else if (c < 0xc2) {
118-
MB_FAILURE(pos, 1);
119-
} else if (c < 0xe0) {
120-
if (!CHECK_LEN(pos, 2))
121-
MB_FAILURE(pos, 1);
122-
123-
if (!utf8_trail(str[pos + 1])) {
124-
MB_FAILURE(pos, utf8_lead(str[pos + 1]) ? 1 : 2);
125-
}
126-
this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f);
127-
if (this_char < 0x80) { /* non-shortest form */
128-
MB_FAILURE(pos, 2);
129-
}
130-
pos += 2;
131-
} else if (c < 0xf0) {
132-
size_t avail = str_len - pos;
133-
134-
if (avail < 3 ||
135-
!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2])) {
136-
if (avail < 2 || utf8_lead(str[pos + 1]))
137-
MB_FAILURE(pos, 1);
138-
else if (avail < 3 || utf8_lead(str[pos + 2]))
139-
MB_FAILURE(pos, 2);
140-
else
141-
MB_FAILURE(pos, 3);
142-
}
143-
144-
this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f);
145-
if (this_char < 0x800) { /* non-shortest form */
146-
MB_FAILURE(pos, 3);
147-
} else if (this_char >= 0xd800 && this_char <= 0xdfff) { /* surrogate */
148-
MB_FAILURE(pos, 3);
149-
}
150-
pos += 3;
151-
} else if (c < 0xf5) {
152-
size_t avail = str_len - pos;
153-
154-
if (avail < 4 ||
155-
!utf8_trail(str[pos + 1]) || !utf8_trail(str[pos + 2]) ||
156-
!utf8_trail(str[pos + 3])) {
157-
if (avail < 2 || utf8_lead(str[pos + 1]))
158-
MB_FAILURE(pos, 1);
159-
else if (avail < 3 || utf8_lead(str[pos + 2]))
160-
MB_FAILURE(pos, 2);
161-
else if (avail < 4 || utf8_lead(str[pos + 3]))
162-
MB_FAILURE(pos, 3);
163-
else
164-
MB_FAILURE(pos, 4);
165-
}
166-
167-
this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f);
168-
if (this_char < 0x10000 || this_char > 0x10FFFF) { /* non-shortest form or outside range */
169-
MB_FAILURE(pos, 4);
170-
}
171-
pos += 4;
172198
} else {
173-
MB_FAILURE(pos, 1);
199+
this_char = php_next_utf8_char_mb(str, c, str_len, cursor);
200+
if (UNEXPECTED(this_char == 0)) {
201+
*status = FAILURE;
202+
}
203+
return this_char;
174204
}
175205
}
176206
break;

ext/standard/html.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,5 +48,6 @@ PHPAPI zend_string *php_escape_html_entities(const unsigned char *old, size_t ol
4848
PHPAPI zend_string *php_escape_html_entities_ex(const unsigned char *old, size_t oldlen, int all, int flags, const char *hint_charset, bool double_encode, bool quiet);
4949
PHPAPI zend_string *php_unescape_html_entities(zend_string *str, int all, int flags, const char *hint_charset);
5050
PHPAPI unsigned int php_next_utf8_char(const unsigned char *str, size_t str_len, size_t *cursor, zend_result *status);
51+
PHPAPI unsigned int php_next_utf8_char_mb(const unsigned char *str, unsigned char c, size_t str_len, size_t *cursor);
5152

5253
#endif /* HTML_H */

0 commit comments

Comments
 (0)