53
53
(all) = (all) && !CHARSET_PARTIAL_SUPPORT((charset)) && ((doctype) != ENT_HTML_DOC_XML1); \
54
54
} while (0)
55
55
56
/* Reports a malformed sequence without touching any status flag: stores
 * pos + advance through the `cursor` pointer found in the expansion scope,
 * then returns 0 from the enclosing function.
 * There must be no whitespace between the macro name and its parameter
 * list, otherwise the preprocessor treats it as an object-like macro. */
#define MB_FAILURE_NO_STATUS(pos, advance) do { \
		*cursor = (pos) + (advance); \
		return 0; \
	} while (0)

/* Same as MB_FAILURE_NO_STATUS, but first writes FAILURE through the
 * `status` pointer found in the expansion scope. */
#define MB_FAILURE(pos, advance) do { \
		*status = FAILURE; \
		MB_FAILURE_NO_STATUS(pos, advance); \
	} while (0)
65
+
62
66
/* True when at least `chars_need` bytes are available starting at offset
 * `pos`. Reads the `str_len` variable from the expansion scope.
 * NOTE(review): when str_len is unsigned (it is size_t in
 * php_next_utf8_char_mb) a pos greater than str_len makes the subtraction
 * wrap, so callers are expected to pass pos <= str_len.
 * There must be no whitespace between the macro name and its parameter
 * list, otherwise the preprocessor treats it as an object-like macro. */
#define CHECK_LEN(pos, chars_need) ((str_len - (pos)) >= (chars_need))
63
67
64
68
/* valid as single byte character or leading byte */
@@ -85,6 +89,87 @@ static char *get_default_charset(void) {
85
89
}
86
90
/* }}} */
87
91
92
+ /* Decodes the next UTF-8 multibyte codepoint (i.e. >= 2 bytes).
93
+ * Uses `c` as the leading byte. */
94
+ PHPAPI unsigned int php_next_utf8_char_mb (
95
+ const unsigned char * str ,
96
+ unsigned char c ,
97
+ size_t str_len ,
98
+ size_t * cursor )
99
+ {
100
+ size_t pos = * cursor ;
101
+ unsigned int this_char = 0 ;
102
+
103
+ /* We'll follow strategy 2. from section 3.6.1 of UTR #36:
104
+ * "In a reported illegal byte sequence, do not include any
105
+ * non-initial byte that encodes a valid character or is a leading
106
+ * byte for a valid sequence." */
107
+
108
+ ZEND_ASSERT (c >= 0x80 );
109
+
110
+ if (UNEXPECTED (c < 0xc2 )) {
111
+ MB_FAILURE_NO_STATUS (pos , 1 );
112
+ } else if (c < 0xe0 ) {
113
+ if (UNEXPECTED (!CHECK_LEN (pos , 2 )))
114
+ MB_FAILURE_NO_STATUS (pos , 1 );
115
+
116
+ if (UNEXPECTED (!utf8_trail (str [pos + 1 ]))) {
117
+ MB_FAILURE_NO_STATUS (pos , utf8_lead (str [pos + 1 ]) ? 1 : 2 );
118
+ }
119
+ this_char = ((c & 0x1f ) << 6 ) | (str [pos + 1 ] & 0x3f );
120
+ if (UNEXPECTED (this_char < 0x80 )) { /* non-shortest form */
121
+ MB_FAILURE_NO_STATUS (pos , 2 );
122
+ }
123
+ pos += 2 ;
124
+ } else if (c < 0xf0 ) {
125
+ size_t avail = str_len - pos ;
126
+
127
+ if (UNEXPECTED (avail < 3 ||
128
+ !utf8_trail (str [pos + 1 ]) || !utf8_trail (str [pos + 2 ]))) {
129
+ if (avail < 2 || utf8_lead (str [pos + 1 ]))
130
+ MB_FAILURE_NO_STATUS (pos , 1 );
131
+ else if (avail < 3 || utf8_lead (str [pos + 2 ]))
132
+ MB_FAILURE_NO_STATUS (pos , 2 );
133
+ else
134
+ MB_FAILURE_NO_STATUS (pos , 3 );
135
+ }
136
+
137
+ this_char = ((c & 0x0f ) << 12 ) | ((str [pos + 1 ] & 0x3f ) << 6 ) | (str [pos + 2 ] & 0x3f );
138
+ if (UNEXPECTED (this_char < 0x800 )) { /* non-shortest form */
139
+ MB_FAILURE_NO_STATUS (pos , 3 );
140
+ } else if (UNEXPECTED (this_char >= 0xd800 && this_char <= 0xdfff )) { /* surrogate */
141
+ MB_FAILURE_NO_STATUS (pos , 3 );
142
+ }
143
+ pos += 3 ;
144
+ } else if (c < 0xf5 ) {
145
+ size_t avail = str_len - pos ;
146
+
147
+ if (UNEXPECTED (avail < 4 ||
148
+ !utf8_trail (str [pos + 1 ]) || !utf8_trail (str [pos + 2 ]) ||
149
+ !utf8_trail (str [pos + 3 ]))) {
150
+ if (avail < 2 || utf8_lead (str [pos + 1 ]))
151
+ MB_FAILURE_NO_STATUS (pos , 1 );
152
+ else if (avail < 3 || utf8_lead (str [pos + 2 ]))
153
+ MB_FAILURE_NO_STATUS (pos , 2 );
154
+ else if (avail < 4 || utf8_lead (str [pos + 3 ]))
155
+ MB_FAILURE_NO_STATUS (pos , 3 );
156
+ else
157
+ MB_FAILURE_NO_STATUS (pos , 4 );
158
+ }
159
+
160
+ this_char = ((c & 0x07 ) << 18 ) | ((str [pos + 1 ] & 0x3f ) << 12 ) | ((str [pos + 2 ] & 0x3f ) << 6 ) | (str [pos + 3 ] & 0x3f );
161
+ if (UNEXPECTED (this_char < 0x10000 || this_char > 0x10FFFF )) { /* non-shortest form or outside range */
162
+ MB_FAILURE_NO_STATUS (pos , 4 );
163
+ }
164
+ pos += 4 ;
165
+ } else {
166
+ MB_FAILURE_NO_STATUS (pos , 1 );
167
+ }
168
+
169
+ * cursor = pos ;
170
+ return this_char ;
171
+ }
172
+
88
173
/* {{{ get_next_char */
89
174
static inline unsigned int get_next_char (
90
175
enum entity_charset charset ,
@@ -105,72 +190,17 @@ static inline unsigned int get_next_char(
105
190
switch (charset ) {
106
191
case cs_utf_8 :
107
192
{
108
- /* We'll follow strategy 2. from section 3.6.1 of UTR #36:
109
- * "In a reported illegal byte sequence, do not include any
110
- * non-initial byte that encodes a valid character or is a leading
111
- * byte for a valid sequence." */
112
193
unsigned char c ;
113
194
c = str [pos ];
114
195
if (c < 0x80 ) {
115
196
this_char = c ;
116
197
pos ++ ;
117
- } else if (c < 0xc2 ) {
118
- MB_FAILURE (pos , 1 );
119
- } else if (c < 0xe0 ) {
120
- if (!CHECK_LEN (pos , 2 ))
121
- MB_FAILURE (pos , 1 );
122
-
123
- if (!utf8_trail (str [pos + 1 ])) {
124
- MB_FAILURE (pos , utf8_lead (str [pos + 1 ]) ? 1 : 2 );
125
- }
126
- this_char = ((c & 0x1f ) << 6 ) | (str [pos + 1 ] & 0x3f );
127
- if (this_char < 0x80 ) { /* non-shortest form */
128
- MB_FAILURE (pos , 2 );
129
- }
130
- pos += 2 ;
131
- } else if (c < 0xf0 ) {
132
- size_t avail = str_len - pos ;
133
-
134
- if (avail < 3 ||
135
- !utf8_trail (str [pos + 1 ]) || !utf8_trail (str [pos + 2 ])) {
136
- if (avail < 2 || utf8_lead (str [pos + 1 ]))
137
- MB_FAILURE (pos , 1 );
138
- else if (avail < 3 || utf8_lead (str [pos + 2 ]))
139
- MB_FAILURE (pos , 2 );
140
- else
141
- MB_FAILURE (pos , 3 );
142
- }
143
-
144
- this_char = ((c & 0x0f ) << 12 ) | ((str [pos + 1 ] & 0x3f ) << 6 ) | (str [pos + 2 ] & 0x3f );
145
- if (this_char < 0x800 ) { /* non-shortest form */
146
- MB_FAILURE (pos , 3 );
147
- } else if (this_char >= 0xd800 && this_char <= 0xdfff ) { /* surrogate */
148
- MB_FAILURE (pos , 3 );
149
- }
150
- pos += 3 ;
151
- } else if (c < 0xf5 ) {
152
- size_t avail = str_len - pos ;
153
-
154
- if (avail < 4 ||
155
- !utf8_trail (str [pos + 1 ]) || !utf8_trail (str [pos + 2 ]) ||
156
- !utf8_trail (str [pos + 3 ])) {
157
- if (avail < 2 || utf8_lead (str [pos + 1 ]))
158
- MB_FAILURE (pos , 1 );
159
- else if (avail < 3 || utf8_lead (str [pos + 2 ]))
160
- MB_FAILURE (pos , 2 );
161
- else if (avail < 4 || utf8_lead (str [pos + 3 ]))
162
- MB_FAILURE (pos , 3 );
163
- else
164
- MB_FAILURE (pos , 4 );
165
- }
166
-
167
- this_char = ((c & 0x07 ) << 18 ) | ((str [pos + 1 ] & 0x3f ) << 12 ) | ((str [pos + 2 ] & 0x3f ) << 6 ) | (str [pos + 3 ] & 0x3f );
168
- if (this_char < 0x10000 || this_char > 0x10FFFF ) { /* non-shortest form or outside range */
169
- MB_FAILURE (pos , 4 );
170
- }
171
- pos += 4 ;
172
198
} else {
173
- MB_FAILURE (pos , 1 );
199
+ this_char = php_next_utf8_char_mb (str , c , str_len , cursor );
200
+ if (UNEXPECTED (this_char == 0 )) {
201
+ * status = FAILURE ;
202
+ }
203
+ return this_char ;
174
204
}
175
205
}
176
206
break ;
0 commit comments