33
33
#include "unicode_table_cp936.h"
34
34
#include "unicode_table_gb18030.h"
35
35
36
+ static int mbfl_filt_conv_gb18030_wchar_flush (mbfl_convert_filter * filter );
37
+
36
38
/* NULL-terminated list of alternative names accepted for the GB18030 encoding. */
static const char * mbfl_encoding_gb18030_aliases [] = {"gb-18030" , "gb-18030-2000" , NULL };
37
39
38
40
const mbfl_encoding mbfl_encoding_gb18030 = {
@@ -52,7 +54,7 @@ const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
52
54
mbfl_filt_conv_common_ctor ,
53
55
NULL ,
54
56
mbfl_filt_conv_gb18030_wchar ,
55
- mbfl_filt_conv_common_flush ,
57
+ mbfl_filt_conv_gb18030_wchar_flush ,
56
58
NULL ,
57
59
};
58
60
@@ -104,38 +106,29 @@ int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n)
104
106
return -1 ;
105
107
}
106
108
107
- /*
108
- * GB18030 => wchar
109
- */
110
- int
111
- mbfl_filt_conv_gb18030_wchar (int c , mbfl_convert_filter * filter )
109
+ int mbfl_filt_conv_gb18030_wchar (int c , mbfl_convert_filter * filter )
112
110
{
113
111
int k ;
114
112
int c1 , c2 , c3 , w = -1 ;
115
113
116
114
switch (filter -> status ) {
117
115
case 0 :
118
- if (c >= 0 && c < 0x80 ) { /* latin */
116
+ if (c >= 0 && c < 0x80 ) { /* latin */
119
117
CK ((* filter -> output_function )(c , filter -> data ));
120
- } else if (c == 0x80 ) { /* euro sign */
121
- CK ((* filter -> output_function )(0x20ac , filter -> data ));
122
- } else if (c == 0xff ) {
123
- CK ((* filter -> output_function )(0x00ff , filter -> data ));
124
- } else if (c > 0x80 && c < 0xff ) { /* dbcs/qbcs lead byte */
118
+ } else if (c > 0x80 && c < 0xff ) { /* dbcs/qbcs lead byte */
125
119
filter -> status = 1 ;
126
120
filter -> cache = c ;
127
121
} else {
128
- w = c & MBFL_WCSGROUP_MASK ;
129
- w |= MBFL_WCSGROUP_THROUGH ;
130
- CK ((* filter -> output_function )(w , filter -> data ));
122
+ CK ((* filter -> output_function )(c | MBFL_WCSGROUP_THROUGH , filter -> data ));
131
123
}
132
124
break ;
133
125
134
- case 1 : /* dbcs/qbcs second byte */
126
+ case 1 : /* dbcs/qbcs second byte */
135
127
c1 = filter -> cache ;
136
128
filter -> status = 0 ;
137
129
138
- if (c1 >= 0x81 && c1 <= 0x84 && c >= 0x30 && c <= 0x39 ) { /* 4 byte range: Unicode BMP */
130
+ if (c1 >= 0x81 && c1 <= 0x84 && c >= 0x30 && c <= 0x39 ) {
131
+ /* 4 byte range: Unicode BMP */
139
132
filter -> status = 2 ;
140
133
filter -> cache = (c1 << 8 ) | c ;
141
134
return c ;
@@ -144,8 +137,8 @@ mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
144
137
filter -> status = 2 ;
145
138
filter -> cache = (c1 << 8 ) | c ;
146
139
return c ;
147
- } else if (((c1 >= 0xaa && c1 <= 0xaf ) || (c1 >= 0xf8 && c1 <= 0xfe )) &&
148
- ( c >= 0xa1 && c <= 0xfe )) { /* UDA part1 ,2: U+E000-U+E4C5 */
140
+ } else if (((c1 >= 0xaa && c1 <= 0xaf ) || (c1 >= 0xf8 && c1 <= 0xfe )) && ( c >= 0xa1 && c <= 0xfe )) {
141
+ /* UDA part 1 ,2: U+E000-U+E4C5 */
149
142
w = 94 * (c1 >= 0xf8 ? c1 - 0xf2 : c1 - 0xaa ) + (c - 0xa1 ) + 0xe000 ;
150
143
CK ((* filter -> output_function )(w , filter -> data ));
151
144
} else if (c1 >= 0xa1 && c1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f ) {
@@ -161,10 +154,8 @@ mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
161
154
(c2 >= 0xd7fa && c2 <= 0xd7fa + (0xe814 - 0xe810 )) ||
162
155
(c2 >= 0xfe50 && c2 <= 0xfe80 + (0xe864 - 0xe844 )))) {
163
156
for (k = 0 ; k < mbfl_gb18030_pua_tbl_max ; k ++ ) {
164
- if (c2 >= mbfl_gb18030_pua_tbl [k ][2 ] &&
165
- c2 <= mbfl_gb18030_pua_tbl [k ][2 ] + mbfl_gb18030_pua_tbl [k ][1 ]
166
- - mbfl_gb18030_pua_tbl [k ][0 ]) {
167
- w = c2 - mbfl_gb18030_pua_tbl [k ][2 ] + mbfl_gb18030_pua_tbl [k ][0 ];
157
+ if (c2 >= mbfl_gb18030_pua_tbl [k ][2 ] && c2 <= mbfl_gb18030_pua_tbl [k ][2 ] + mbfl_gb18030_pua_tbl [k ][1 ] - mbfl_gb18030_pua_tbl [k ][0 ]) {
158
+ w = c2 - mbfl_gb18030_pua_tbl [k ][2 ] + mbfl_gb18030_pua_tbl [k ][0 ];
168
159
CK ((* filter -> output_function )(w , filter -> data ));
169
160
break ;
170
161
}
@@ -184,34 +175,25 @@ mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
184
175
w = 0 ;
185
176
}
186
177
if (w <= 0 ) {
187
- w = (c1 << 8 ) | c ;
188
- w &= MBFL_WCSPLANE_MASK ;
189
- w |= MBFL_WCSPLANE_GB18030 ;
178
+ w = (c1 << 8 ) | c | MBFL_WCSPLANE_GB18030 ;
190
179
}
191
180
CK ((* filter -> output_function )(w , filter -> data ));
192
- } else if ((c >= 0 && c < 0x21 ) || c == 0x7f ) { /* CTLs */
193
- CK ((* filter -> output_function )(c , filter -> data ));
194
181
} else {
195
- w = (c1 << 8 ) | c ;
196
- w &= MBFL_WCSGROUP_MASK ;
197
- w |= MBFL_WCSGROUP_THROUGH ;
182
+ w = (c1 << 8 ) | c | MBFL_WCSGROUP_THROUGH ;
198
183
CK ((* filter -> output_function )(w , filter -> data ));
199
184
}
200
185
}
201
186
break ;
187
+
202
188
case 2 : /* qbcs third byte */
203
189
c1 = (filter -> cache >> 8 ) & 0xff ;
204
190
c2 = filter -> cache & 0xff ;
205
- filter -> status = 0 ;
206
- filter -> cache = 0 ;
207
- if (((c1 >= 0x81 && c1 <= 0x84 ) || (c1 >= 0x90 && c1 <= 0xe3 )) &&
208
- c2 >= 0x30 && c2 <= 0x39 && c >= 0x81 && c <= 0xfe ) {
191
+ filter -> status = filter -> cache = 0 ;
192
+ if (((c1 >= 0x81 && c1 <= 0x84 ) || (c1 >= 0x90 && c1 <= 0xe3 )) && c2 >= 0x30 && c2 <= 0x39 && c >= 0x81 && c <= 0xfe ) {
209
193
filter -> cache = (c1 << 16 ) | (c2 << 8 ) | c ;
210
194
filter -> status = 3 ;
211
195
} else {
212
- w = (c1 << 16 ) | (c2 << 8 ) | c ;
213
- w &= MBFL_WCSGROUP_MASK ;
214
- w |= MBFL_WCSGROUP_THROUGH ;
196
+ w = (c1 << 16 ) | (c2 << 8 ) | c | MBFL_WCSGROUP_THROUGH ;
215
197
CK ((* filter -> output_function )(w , filter -> data ));
216
198
}
217
199
break ;
@@ -220,27 +202,30 @@ mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
220
202
c1 = (filter -> cache >> 16 ) & 0xff ;
221
203
c2 = (filter -> cache >> 8 ) & 0xff ;
222
204
c3 = filter -> cache & 0xff ;
223
- filter -> status = 0 ;
224
- filter -> cache = 0 ;
225
- if (((c1 >= 0x81 && c1 <= 0x84 ) || (c1 >= 0x90 && c1 <= 0xe3 )) &&
226
- c2 >= 0x30 && c2 <= 0x39 && c3 >= 0x81 && c3 <= 0xfe && c >= 0x30 && c <= 0x39 ) {
205
+ filter -> status = filter -> cache = 0 ;
206
+ if (((c1 >= 0x81 && c1 <= 0x84 ) || (c1 >= 0x90 && c1 <= 0xe3 )) && c2 >= 0x30 && c2 <= 0x39 && c3 >= 0x81 && c3 <= 0xfe && c >= 0x30 && c <= 0x39 ) {
227
207
if (c1 >= 0x90 && c1 <= 0xe3 ) {
228
208
w = ((((c1 - 0x90 )* 10 + (c2 - 0x30 ))* 126 + (c3 - 0x81 )))* 10 + (c - 0x30 ) + 0x10000 ;
209
+ if (w > 0x10FFFF ) {
210
+ w = ((c1 & 0x7F ) << 24 ) | (c2 << 16 ) | (c3 << 8 ) | c ;
211
+ CK ((* filter -> output_function )(w | MBFL_WCSGROUP_THROUGH , filter -> data ));
212
+ return c ;
213
+ }
229
214
} else { /* Unicode BMP */
230
215
w = (((c1 - 0x81 )* 10 + (c2 - 0x30 ))* 126 + (c3 - 0x81 ))* 10 + (c - 0x30 );
231
216
if (w >= 0 && w <= 39419 ) {
232
217
k = mbfl_bisec_srch (w , mbfl_gb2uni_tbl , mbfl_gb_uni_max );
233
- if (k < 0 ) {
218
+ if (k < 0 ) {
234
219
/* error */
235
- w = (c1 << 24 ) | (c2 << 16 ) | (c3 << 8 ) | c ;
220
+ w = (( c1 & 0x7F ) << 24 ) | (c2 << 16 ) | (c3 << 8 ) | c ;
236
221
w &= MBFL_WCSGROUP_MASK ;
237
222
w |= MBFL_WCSGROUP_THROUGH ;
238
223
CK ((* filter -> output_function )(w , filter -> data ));
239
224
return c ;
240
225
}
241
226
w += mbfl_gb_uni_ofst [k ];
242
227
} else {
243
- w = (c1 << 24 ) | (c2 << 16 ) | (c3 << 8 ) | c ;
228
+ w = (( c1 & 0x7F ) << 24 ) | (c2 << 16 ) | (c3 << 8 ) | c ;
244
229
w &= MBFL_WCSGROUP_MASK ;
245
230
w |= MBFL_WCSGROUP_THROUGH ;
246
231
CK ((* filter -> output_function )(w , filter -> data ));
@@ -249,7 +234,7 @@ mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
249
234
}
250
235
CK ((* filter -> output_function )(w , filter -> data ));
251
236
} else {
252
- w = (c1 << 24 ) | (c2 << 16 ) | (c3 << 8 ) | c ;
237
+ w = (( c1 & 0x7F ) << 24 ) | (c2 << 16 ) | (c3 << 8 ) | c ;
253
238
w &= MBFL_WCSGROUP_MASK ;
254
239
w |= MBFL_WCSGROUP_THROUGH ;
255
240
CK ((* filter -> output_function )(w , filter -> data ));
@@ -264,19 +249,37 @@ mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter)
264
249
return c ;
265
250
}
266
251
267
- /*
268
- * wchar => GB18030
269
- */
270
- int
271
- mbfl_filt_conv_wchar_gb18030 (int c , mbfl_convert_filter * filter )
252
+ static int mbfl_filt_conv_gb18030_wchar_flush (mbfl_convert_filter * filter )
253
+ {
254
+ if (filter -> status ) {
255
+ /* multi-byte character was truncated */
256
+ CK ((* filter -> output_function )(filter -> cache | MBFL_WCSGROUP_THROUGH , filter -> data ));
257
+ }
258
+
259
+ if (filter -> flush_function ) {
260
+ (* filter -> flush_function )(filter -> data );
261
+ }
262
+
263
+ return 0 ;
264
+ }
265
+
266
+ int mbfl_filt_conv_wchar_gb18030 (int c , mbfl_convert_filter * filter )
272
267
{
273
268
int k , k1 , k2 ;
274
269
int c1 , s = 0 , s1 = 0 ;
275
270
276
271
if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max ) {
277
- s = ucs_a1_cp936_table [c - ucs_a1_cp936_table_min ];
272
+ if (c == 0x01f9 ) {
273
+ s = 0xa8bf ;
274
+ } else {
275
+ s = ucs_a1_cp936_table [c - ucs_a1_cp936_table_min ];
276
+ }
278
277
} else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max ) {
279
- s = ucs_a2_cp936_table [c - ucs_a2_cp936_table_min ];
278
+ if (c == 0x20ac ) { /* euro-sign */
279
+ s = 0xa2e3 ;
280
+ } else {
281
+ s = ucs_a2_cp936_table [c - ucs_a2_cp936_table_min ];
282
+ }
280
283
} else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max ) {
281
284
s = ucs_a3_cp936_table [c - ucs_a3_cp936_table_min ];
282
285
} else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max ) {
@@ -315,12 +318,9 @@ mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
315
318
}
316
319
}
317
320
318
- if (c == 0x20ac ) { /* euro-sign */
319
- s = 0xa2e3 ;
320
- }
321
-
322
- if (s <= 0 && c >= mbfl_gb18030_c_tbl_key [0 ] &&
323
- c <= mbfl_gb18030_c_tbl_key [mbfl_gb18030_c_tbl_max - 1 ]) {
321
+ /* While GB18030 and CP936 are very similar, some mappings are different between these encodings;
322
+ * do a binary search in a table of differing codepoints to see if we have one */
323
+ if (s <= 0 && c >= mbfl_gb18030_c_tbl_key [0 ] && c <= mbfl_gb18030_c_tbl_key [mbfl_gb18030_c_tbl_max - 1 ]) {
324
324
k1 = mbfl_bisec_srch2 (c , mbfl_gb18030_c_tbl_key , mbfl_gb18030_c_tbl_max );
325
325
if (k1 >= 0 ) {
326
326
s = mbfl_gb18030_c_tbl_val [k1 ];
@@ -331,16 +331,19 @@ mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
331
331
if (c < 0xe766 ) {
332
332
if (c < 0xe4c6 ) {
333
333
c1 = c - 0xe000 ;
334
- s = (c1 % 94 ) + 0xa1 ; c1 /= 94 ;
334
+ s = (c1 % 94 ) + 0xa1 ;
335
+ c1 /= 94 ;
335
336
s |= (c1 < 0x06 ? c1 + 0xaa : c1 + 0xf2 ) << 8 ;
336
337
} else {
337
338
c1 = c - 0xe4c6 ;
338
- s = ((c1 / 96 ) + 0xa1 ) << 8 ; c1 %= 96 ;
339
+ s = ((c1 / 96 ) + 0xa1 ) << 8 ;
340
+ c1 %= 96 ;
339
341
s |= c1 + (c1 >= 0x3f ? 0x41 : 0x40 );
340
342
}
341
343
} else {
342
344
/* U+E766..U+E864 */
343
- k1 = 0 ; k2 = mbfl_gb18030_pua_tbl_max ;
345
+ k1 = 0 ;
346
+ k2 = mbfl_gb18030_pua_tbl_max ;
344
347
while (k1 < k2 ) {
345
348
k = (k1 + k2 ) >> 1 ;
346
349
if (c < mbfl_gb18030_pua_tbl [k ][0 ]) {
@@ -355,36 +358,42 @@ mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter)
355
358
}
356
359
}
357
360
358
- if (s <= 0 && c >= 0x0080 && c <= 0xffff ) { /* BMP */
361
+ /* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */
362
+ if (s <= 0 && c >= 0x0080 && c <= 0xffff ) {
363
+ /* BMP */
359
364
s = mbfl_bisec_srch (c , mbfl_uni2gb_tbl , mbfl_gb_uni_max );
360
365
if (s >= 0 ) {
361
366
c1 = c - mbfl_gb_uni_ofst [s ];
362
- s = (c1 % 10 ) + 0x30 ; c1 /= 10 ;
363
- s |= ((c1 % 126 ) + 0x81 ) << 8 ; c1 /= 126 ;
364
- s |= ((c1 % 10 ) + 0x30 ) << 16 ; c1 /= 10 ;
367
+ s = (c1 % 10 ) + 0x30 ;
368
+ c1 /= 10 ;
369
+ s |= ((c1 % 126 ) + 0x81 ) << 8 ;
370
+ c1 /= 126 ;
371
+ s |= ((c1 % 10 ) + 0x30 ) << 16 ;
372
+ c1 /= 10 ;
365
373
s1 = c1 + 0x81 ;
366
374
}
367
- } else if (c >= 0x10000 && c <= 0x10ffff ) { /* Code set 3: Unicode U+10000..U+10FFFF */
375
+ } else if (c >= 0x10000 && c <= 0x10ffff ) {
376
+ /* Code set 3: Unicode U+10000..U+10FFFF */
368
377
c1 = c - 0x10000 ;
369
- s = (c1 % 10 ) + 0x30 ; c1 /= 10 ;
370
- s |= ((c1 % 126 ) + 0x81 ) << 8 ; c1 /= 126 ;
371
- s |= ((c1 % 10 ) + 0x30 ) << 16 ; c1 /= 10 ;
378
+ s = (c1 % 10 ) + 0x30 ;
379
+ c1 /= 10 ;
380
+ s |= ((c1 % 126 ) + 0x81 ) << 8 ;
381
+ c1 /= 126 ;
382
+ s |= ((c1 % 10 ) + 0x30 ) << 16 ;
383
+ c1 /= 10 ;
372
384
s1 = c1 + 0x90 ;
373
385
}
374
386
375
387
if (s <= 0 ) {
376
- c1 = c & ~MBFL_WCSPLANE_MASK ;
377
- if (c1 == MBFL_WCSPLANE_WINCP936 ) {
378
- s = c & MBFL_WCSPLANE_MASK ;
379
- }
380
388
if (c == 0 ) {
381
389
s = 0 ;
382
- } else if ( s <= 0 ) {
390
+ } else {
383
391
s = -1 ;
384
392
}
385
393
}
394
+
386
395
if (s >= 0 ) {
387
- if (s <= 0x80 ) { /* latin */
396
+ if (s <= 0x80 ) { /* latin */
388
397
CK ((* filter -> output_function )(s , filter -> data ));
389
398
} else if (s1 > 0 ) { /* qbcs */
390
399
CK ((* filter -> output_function )(s1 & 0xff , filter -> data ));
0 commit comments