Skip to content

Commit 13136a5

Browse files
committed
Fix conversion of GB18030 text (and add test suite)
- Truncated multi-byte characters are treated as an error
- Reject GB18030 4-byte codes which translate to (non-existent) Unicode codepoints above 0x10FFFF
- Add a number of missing mappings from the GB18030 standards (These mappings are supported by iconv. I don't know why they were missing from mbstring.)
1 parent 340164b commit 13136a5

File tree

4 files changed

+24481
-94
lines changed

4 files changed

+24481
-94
lines changed

ext/mbstring/libmbfl/filters/mbfilter_gb18030.c

Lines changed: 85 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333
#include "unicode_table_cp936.h"
3434
#include "unicode_table_gb18030.h"
3535

36+
static int mbfl_filt_conv_gb18030_wchar_flush(mbfl_convert_filter *filter);
37+
3638
static const char *mbfl_encoding_gb18030_aliases[] = {"gb-18030", "gb-18030-2000", NULL};
3739

3840
const mbfl_encoding mbfl_encoding_gb18030 = {
@@ -52,7 +54,7 @@ const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
5254
mbfl_filt_conv_common_ctor,
5355
NULL,
5456
mbfl_filt_conv_gb18030_wchar,
55-
mbfl_filt_conv_common_flush,
57+
mbfl_filt_conv_gb18030_wchar_flush,
5658
NULL,
5759
};
5860

@@ -104,38 +106,29 @@ int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n)
104106
return -1;
105107
}
106108

107-
/*
108-
* GB18030 => wchar
109-
*/
110-
int
111-
mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
109+
int mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
112110
{
113111
int k;
114112
int c1, c2, c3, w = -1;
115113

116114
switch (filter->status) {
117115
case 0:
118-
if (c >= 0 && c < 0x80) { /* latin */
116+
if (c >= 0 && c < 0x80) { /* latin */
119117
CK((*filter->output_function)(c, filter->data));
120-
} else if (c == 0x80) { /* euro sign */
121-
CK((*filter->output_function)(0x20ac, filter->data));
122-
} else if (c == 0xff) {
123-
CK((*filter->output_function)(0x00ff, filter->data));
124-
} else if (c > 0x80 && c < 0xff) { /* dbcs/qbcs lead byte */
118+
} else if (c > 0x80 && c < 0xff) { /* dbcs/qbcs lead byte */
125119
filter->status = 1;
126120
filter->cache = c;
127121
} else {
128-
w = c & MBFL_WCSGROUP_MASK;
129-
w |= MBFL_WCSGROUP_THROUGH;
130-
CK((*filter->output_function)(w, filter->data));
122+
CK((*filter->output_function)(c | MBFL_WCSGROUP_THROUGH, filter->data));
131123
}
132124
break;
133125

134-
case 1: /* dbcs/qbcs second byte */
126+
case 1: /* dbcs/qbcs second byte */
135127
c1 = filter->cache;
136128
filter->status = 0;
137129

138-
if (c1 >= 0x81 && c1 <= 0x84 && c >= 0x30 && c <= 0x39) { /* 4 byte range: Unicode BMP */
130+
if (c1 >= 0x81 && c1 <= 0x84 && c >= 0x30 && c <= 0x39) {
131+
/* 4 byte range: Unicode BMP */
139132
filter->status = 2;
140133
filter->cache = (c1 << 8) | c;
141134
return c;
@@ -144,8 +137,8 @@ mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
144137
filter->status = 2;
145138
filter->cache = (c1 << 8) | c;
146139
return c;
147-
} else if (((c1 >= 0xaa && c1 <= 0xaf) || (c1 >= 0xf8 && c1 <= 0xfe)) &&
148-
(c >= 0xa1 && c <= 0xfe)) { /* UDA part1,2: U+E000-U+E4C5 */
140+
} else if (((c1 >= 0xaa && c1 <= 0xaf) || (c1 >= 0xf8 && c1 <= 0xfe)) && (c >= 0xa1 && c <= 0xfe)) {
141+
/* UDA part 1,2: U+E000-U+E4C5 */
149142
w = 94*(c1 >= 0xf8 ? c1 - 0xf2 : c1 - 0xaa) + (c - 0xa1) + 0xe000;
150143
CK((*filter->output_function)(w, filter->data));
151144
} else if (c1 >= 0xa1 && c1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) {
@@ -161,10 +154,8 @@ mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
161154
(c2 >= 0xd7fa && c2 <= 0xd7fa + (0xe814-0xe810)) ||
162155
(c2 >= 0xfe50 && c2 <= 0xfe80 + (0xe864-0xe844)))) {
163156
for (k = 0; k < mbfl_gb18030_pua_tbl_max; k++) {
164-
if (c2 >= mbfl_gb18030_pua_tbl[k][2] &&
165-
c2 <= mbfl_gb18030_pua_tbl[k][2] + mbfl_gb18030_pua_tbl[k][1]
166-
- mbfl_gb18030_pua_tbl[k][0]) {
167-
w = c2 - mbfl_gb18030_pua_tbl[k][2] + mbfl_gb18030_pua_tbl[k][0];
157+
if (c2 >= mbfl_gb18030_pua_tbl[k][2] && c2 <= mbfl_gb18030_pua_tbl[k][2] + mbfl_gb18030_pua_tbl[k][1] - mbfl_gb18030_pua_tbl[k][0]) {
158+
w = c2 - mbfl_gb18030_pua_tbl[k][2] + mbfl_gb18030_pua_tbl[k][0];
168159
CK((*filter->output_function)(w, filter->data));
169160
break;
170161
}
@@ -184,34 +175,25 @@ mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
184175
w = 0;
185176
}
186177
if (w <= 0) {
187-
w = (c1 << 8) | c;
188-
w &= MBFL_WCSPLANE_MASK;
189-
w |= MBFL_WCSPLANE_GB18030;
178+
w = (c1 << 8) | c | MBFL_WCSPLANE_GB18030;
190179
}
191180
CK((*filter->output_function)(w, filter->data));
192-
} else if ((c >= 0 && c < 0x21) || c == 0x7f) { /* CTLs */
193-
CK((*filter->output_function)(c, filter->data));
194181
} else {
195-
w = (c1 << 8) | c;
196-
w &= MBFL_WCSGROUP_MASK;
197-
w |= MBFL_WCSGROUP_THROUGH;
182+
w = (c1 << 8) | c | MBFL_WCSGROUP_THROUGH;
198183
CK((*filter->output_function)(w, filter->data));
199184
}
200185
}
201186
break;
187+
202188
case 2: /* qbcs third byte */
203189
c1 = (filter->cache >> 8) & 0xff;
204190
c2 = filter->cache & 0xff;
205-
filter->status = 0;
206-
filter->cache = 0;
207-
if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) &&
208-
c2 >= 0x30 && c2 <= 0x39 && c >= 0x81 && c <= 0xfe) {
191+
filter->status = filter->cache = 0;
192+
if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) && c2 >= 0x30 && c2 <= 0x39 && c >= 0x81 && c <= 0xfe) {
209193
filter->cache = (c1 << 16) | (c2 << 8) | c;
210194
filter->status = 3;
211195
} else {
212-
w = (c1 << 16) | (c2 << 8) | c;
213-
w &= MBFL_WCSGROUP_MASK;
214-
w |= MBFL_WCSGROUP_THROUGH;
196+
w = (c1 << 16) | (c2 << 8) | c | MBFL_WCSGROUP_THROUGH;
215197
CK((*filter->output_function)(w, filter->data));
216198
}
217199
break;
@@ -220,27 +202,30 @@ mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
220202
c1 = (filter->cache >> 16) & 0xff;
221203
c2 = (filter->cache >> 8) & 0xff;
222204
c3 = filter->cache & 0xff;
223-
filter->status = 0;
224-
filter->cache = 0;
225-
if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) &&
226-
c2 >= 0x30 && c2 <= 0x39 && c3 >= 0x81 && c3 <= 0xfe && c >= 0x30 && c <= 0x39) {
205+
filter->status = filter->cache = 0;
206+
if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) && c2 >= 0x30 && c2 <= 0x39 && c3 >= 0x81 && c3 <= 0xfe && c >= 0x30 && c <= 0x39) {
227207
if (c1 >= 0x90 && c1 <= 0xe3) {
228208
w = ((((c1 - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c - 0x30) + 0x10000;
209+
if (w > 0x10FFFF) {
210+
w = ((c1 & 0x7F) << 24) | (c2 << 16) | (c3 << 8) | c;
211+
CK((*filter->output_function)(w | MBFL_WCSGROUP_THROUGH, filter->data));
212+
return c;
213+
}
229214
} else { /* Unicode BMP */
230215
w = (((c1 - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c - 0x30);
231216
if (w >= 0 && w <= 39419) {
232217
k = mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max);
233-
if (k<0) {
218+
if (k < 0) {
234219
/* error */
235-
w = (c1 << 24) | (c2 << 16) | (c3 << 8) | c;
220+
w = ((c1 & 0x7F) << 24) | (c2 << 16) | (c3 << 8) | c;
236221
w &= MBFL_WCSGROUP_MASK;
237222
w |= MBFL_WCSGROUP_THROUGH;
238223
CK((*filter->output_function)(w, filter->data));
239224
return c;
240225
}
241226
w += mbfl_gb_uni_ofst[k];
242227
} else {
243-
w = (c1 << 24) | (c2 << 16) | (c3 << 8) | c;
228+
w = ((c1 & 0x7F) << 24) | (c2 << 16) | (c3 << 8) | c;
244229
w &= MBFL_WCSGROUP_MASK;
245230
w |= MBFL_WCSGROUP_THROUGH;
246231
CK((*filter->output_function)(w, filter->data));
@@ -249,7 +234,7 @@ mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
249234
}
250235
CK((*filter->output_function)(w, filter->data));
251236
} else {
252-
w = (c1 << 24) | (c2 << 16) | (c3 << 8) | c;
237+
w = ((c1 & 0x7F) << 24) | (c2 << 16) | (c3 << 8) | c;
253238
w &= MBFL_WCSGROUP_MASK;
254239
w |= MBFL_WCSGROUP_THROUGH;
255240
CK((*filter->output_function)(w, filter->data));
@@ -264,19 +249,37 @@ mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
264249
return c;
265250
}
266251

267-
/*
268-
* wchar => GB18030
269-
*/
270-
int
271-
mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
252+
/*
 * Flush handler for the GB18030 => wchar conversion filter.
 *
 * A non-zero filter->status means the input ended in the middle of a
 * multi-byte character. The bytes buffered so far (held in filter->cache)
 * are reported to the output function tagged with MBFL_WCSGROUP_THROUGH,
 * i.e. as an error marker rather than a decoded codepoint. Afterwards the
 * chained flush_function, if one is set, is invoked on filter->data.
 *
 * Returns 0 on the normal path.
 * NOTE(review): CK() is defined outside this view; it presumably returns
 * early with an error code when the output function fails -- confirm.
 */
static int mbfl_filt_conv_gb18030_wchar_flush(mbfl_convert_filter *filter)
{
	if (filter->status) {
		/* multi-byte character was truncated */
		int pending = filter->cache;

		/* Clear the decoder state *before* emitting: otherwise a second
		 * flush (or reuse of the filter) would report the same truncated
		 * sequence again, or treat the next input byte as a continuation
		 * of the stale lead byte(s). Resetting first also keeps the state
		 * clean if CK() bails out. */
		filter->status = filter->cache = 0;
		CK((*filter->output_function)(pending | MBFL_WCSGROUP_THROUGH, filter->data));
	}

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
265+
266+
int mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
272267
{
273268
int k, k1, k2;
274269
int c1, s = 0, s1 = 0;
275270

276271
if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) {
277-
s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
272+
if (c == 0x01f9) {
273+
s = 0xa8bf;
274+
} else {
275+
s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min];
276+
}
278277
} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) {
279-
s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
278+
if (c == 0x20ac) { /* euro-sign */
279+
s = 0xa2e3;
280+
} else {
281+
s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min];
282+
}
280283
} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) {
281284
s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min];
282285
} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) {
@@ -315,12 +318,9 @@ mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
315318
}
316319
}
317320

318-
if (c == 0x20ac) { /* euro-sign */
319-
s = 0xa2e3;
320-
}
321-
322-
if (s <= 0 && c >= mbfl_gb18030_c_tbl_key[0] &&
323-
c <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
321+
/* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
322+
* do a binary search in a table of differing codepoints to see if we have one */
323+
if (s <= 0 && c >= mbfl_gb18030_c_tbl_key[0] && c <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) {
324324
k1 = mbfl_bisec_srch2(c, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max);
325325
if (k1 >= 0) {
326326
s = mbfl_gb18030_c_tbl_val[k1];
@@ -331,16 +331,19 @@ mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
331331
if (c < 0xe766) {
332332
if (c < 0xe4c6) {
333333
c1 = c - 0xe000;
334-
s = (c1 % 94) + 0xa1; c1 /= 94;
334+
s = (c1 % 94) + 0xa1;
335+
c1 /= 94;
335336
s |= (c1 < 0x06 ? c1 + 0xaa : c1 + 0xf2) << 8;
336337
} else {
337338
c1 = c - 0xe4c6;
338-
s = ((c1 / 96) + 0xa1) << 8; c1 %= 96;
339+
s = ((c1 / 96) + 0xa1) << 8;
340+
c1 %= 96;
339341
s |= c1 + (c1 >= 0x3f ? 0x41 : 0x40);
340342
}
341343
} else {
342344
/* U+E766..U+E864 */
343-
k1 = 0; k2 = mbfl_gb18030_pua_tbl_max;
345+
k1 = 0;
346+
k2 = mbfl_gb18030_pua_tbl_max;
344347
while (k1 < k2) {
345348
k = (k1 + k2) >> 1;
346349
if (c < mbfl_gb18030_pua_tbl[k][0]) {
@@ -355,36 +358,42 @@ mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
355358
}
356359
}
357360

358-
if (s <= 0 && c >= 0x0080 && c <= 0xffff) { /* BMP */
361+
/* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
362+
if (s <= 0 && c >= 0x0080 && c <= 0xffff) {
363+
/* BMP */
359364
s = mbfl_bisec_srch(c, mbfl_uni2gb_tbl, mbfl_gb_uni_max);
360365
if (s >= 0) {
361366
c1 = c - mbfl_gb_uni_ofst[s];
362-
s = (c1 % 10) + 0x30; c1 /= 10;
363-
s |= ((c1 % 126) + 0x81) << 8; c1 /= 126;
364-
s |= ((c1 % 10) + 0x30) << 16; c1 /= 10;
367+
s = (c1 % 10) + 0x30;
368+
c1 /= 10;
369+
s |= ((c1 % 126) + 0x81) << 8;
370+
c1 /= 126;
371+
s |= ((c1 % 10) + 0x30) << 16;
372+
c1 /= 10;
365373
s1 = c1 + 0x81;
366374
}
367-
} else if (c >= 0x10000 && c <= 0x10ffff) { /* Code set 3: Unicode U+10000..U+10FFFF */
375+
} else if (c >= 0x10000 && c <= 0x10ffff) {
376+
/* Code set 3: Unicode U+10000..U+10FFFF */
368377
c1 = c - 0x10000;
369-
s = (c1 % 10) + 0x30; c1 /= 10;
370-
s |= ((c1 % 126) + 0x81) << 8; c1 /= 126;
371-
s |= ((c1 % 10) + 0x30) << 16; c1 /= 10;
378+
s = (c1 % 10) + 0x30;
379+
c1 /= 10;
380+
s |= ((c1 % 126) + 0x81) << 8;
381+
c1 /= 126;
382+
s |= ((c1 % 10) + 0x30) << 16;
383+
c1 /= 10;
372384
s1 = c1 + 0x90;
373385
}
374386

375387
if (s <= 0) {
376-
c1 = c & ~MBFL_WCSPLANE_MASK;
377-
if (c1 == MBFL_WCSPLANE_WINCP936) {
378-
s = c & MBFL_WCSPLANE_MASK;
379-
}
380388
if (c == 0) {
381389
s = 0;
382-
} else if (s <= 0) {
390+
} else {
383391
s = -1;
384392
}
385393
}
394+
386395
if (s >= 0) {
387-
if (s <= 0x80) { /* latin */
396+
if (s <= 0x80) { /* latin */
388397
CK((*filter->output_function)(s, filter->data));
389398
} else if (s1 > 0) { /* qbcs */
390399
CK((*filter->output_function)(s1 & 0xff, filter->data));

ext/mbstring/libmbfl/filters/unicode_table_gb18030.h

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -25,31 +25,35 @@
2525
#ifndef UNICODE_TABLE_GB18030_H
2626
#define UNICODE_TABLE_GB18030_H
2727

28-
static const unsigned short mbfl_gb18030_c_tbl_val[58] =
28+
static const unsigned short mbfl_gb18030_c_tbl_val[] =
2929
{
3030
0xfe50,0xfe54,0xfe57,0xfe58,0xfe5d,0xfe5e,0xfe6b,0xfe6e,
31-
0xfe71,0xfe73,0xfe74,0xfe75,0xfe79,0xfe84,0xfe56,0xfe55,
32-
0xfe5a,0xfe5c,0xfe5b,0xfe60,0xfe5f,0xfe62,0xfe65,0xfe63,
33-
0xfe64,0xfe68,0xfe69,0xfe6a,0xfe6f,0xfe70,0xfe72,0xfe78,
34-
0xfe77,0xfe7a,0xfe7b,0xfe7d,0xfe7c,0xfe80,0xfe81,0xfe82,
35-
0xfe83,0xfe85,0xfe86,0xfe87,0xfe88,0xfe89,0xfe8a,0xfe8b,
36-
0xfe8d,0xfe8c,0xfe8f,0xfe8e,0xfe96,0xfe93,0xfe94,0xfe95,
37-
0xfe97,0xfe92,
31+
0xfe71,0xfe73,0xfe74,0xfe75,0xfe79,0xfe84,0xa98a,0xa98b,
32+
0xa98c,0xa98d,0xa98e,0xa98f,0xa990,0xa991,0xa992,0xa993,
33+
0xa994,0xa995,0xa989,0xfe56,0xfe55,0xfe5a,0xfe5c,0xfe5b,
34+
0xfe60,0xfe5f,0xfe62,0xfe65,0xfe63,0xfe64,0xfe68,0xfe69,
35+
0xfe6a,0xfe6f,0xfe70,0xfe72,0xfe78,0xfe77,0xfe7a,0xfe7b,
36+
0xfe7d,0xfe7c,0xfe80,0xfe81,0xfe82,0xfe83,0xfe85,0xfe86,
37+
0xfe87,0xfe88,0xfe89,0xfe8a,0xfe8b,0xfe8d,0xfe8c,0xfe8f,
38+
0xfe8e,0xfe96,0xfe93,0xfe94,0xfe95,0xfe97,0xfe92,0xfe98,
39+
0xfe99,0xfe9a,0xfe9b,0xfe9c,0xfe9d,0xfe9e,0xfe9f
3840
};
3941

40-
static const unsigned short mbfl_gb18030_c_tbl_key[58] =
42+
static const unsigned short mbfl_gb18030_c_tbl_key[] =
4143
{
4244
0x2e81,0x2e84,0x2e88,0x2e8b,0x2e8c,0x2e97,0x2ea7,0x2eaa,
43-
0x2eae,0x2eb3,0x2eb6,0x2eb7,0x2ebb,0x2eca,0x3447,0x3473,
44-
0x359e,0x360e,0x361a,0x3918,0x396e,0x39cf,0x39d0,0x39df,
45-
0x3a73,0x3b4e,0x3c6e,0x3ce0,0x4056,0x415f,0x4337,0x43ac,
46-
0x43b1,0x43dd,0x44d6,0x464c,0x4661,0x4723,0x4729,0x477c,
47-
0x478d,0x4947,0x497a,0x497d,0x4982,0x4983,0x4985,0x4986,
48-
0x499b,0x499f,0x49b6,0x49b7,0x4c77,0x4c9f,0x4ca0,0x4ca1,
49-
0x4ca2,0x4ca3,
45+
0x2eae,0x2eb3,0x2eb6,0x2eb7,0x2ebb,0x2eca,0x2ff0,0x2ff1,
46+
0x2ff2,0x2ff3,0x2ff4,0x2ff5,0x2ff6,0x2ff7,0x2ff8,0x2ff9,
47+
0x2ffa,0x2ffb,0x303e,0x3447,0x3473,0x359e,0x360e,0x361a,
48+
0x3918,0x396e,0x39cf,0x39d0,0x39df,0x3a73,0x3b4e,0x3c6e,
49+
0x3ce0,0x4056,0x415f,0x4337,0x43ac,0x43b1,0x43dd,0x44d6,
50+
0x464c,0x4661,0x4723,0x4729,0x477c,0x478d,0x4947,0x497a,
51+
0x497d,0x4982,0x4983,0x4985,0x4986,0x499b,0x499f,0x49b6,
52+
0x49b7,0x4c77,0x4c9f,0x4ca0,0x4ca1,0x4ca2,0x4ca3,0x4d13,
53+
0x4d14,0x4d15,0x4d16,0x4d17,0x4d18,0x4d19,0x4dae
5054
};
5155

52-
static const int mbfl_gb18030_c_tbl_max = sizeof(mbfl_gb18030_c_tbl_key)/sizeof(unsigned short);
56+
static const int mbfl_gb18030_c_tbl_max = sizeof(mbfl_gb18030_c_tbl_key) / sizeof(unsigned short);
5357

5458
static const unsigned short mbfl_gb18030_pua_tbl[][3] = {
5559
{0xe766, 0xe76b, 0xa2ab},
@@ -229,5 +233,4 @@ static const unsigned short mbfl_gb_uni_ofst[] = {
229233

230234
static const int mbfl_gb_uni_max = sizeof(mbfl_gb_uni_ofst)/sizeof(unsigned short);
231235

232-
233236
#endif /* UNICODE_TABLE_GB18030_H */

0 commit comments

Comments
 (0)