|
95 | 95 | #include "filters/mbfilter_utf8.h"
|
96 | 96 |
|
97 | 97 | #include "eaw_table.h"
|
| 98 | +#include "rare_cp_bitvec.h" |
98 | 99 |
|
99 | 100 | /* hex character table "0123456789ABCDEF" */
|
100 | 101 | static char mbfl_hexchar_table[] = {
|
@@ -236,26 +237,52 @@ size_t mbfl_buffer_illegalchars(mbfl_buffer_converter *convd)
|
236 | 237 | /*
|
237 | 238 | * encoding detector
|
238 | 239 | */
|
239 |
| -static int mbfl_estimate_encoding_likelihood(int c, void *void_data) |
| 240 | +static int mbfl_estimate_encoding_likelihood(int input_cp, void *void_data) |
240 | 241 | {
|
241 | 242 | mbfl_encoding_detector_data *data = void_data;
|
242 |
| - |
243 |
| - /* Receive wchars decoded from test string using candidate encoding |
244 |
| - * If the test string was invalid in the candidate encoding, we assume |
245 |
| - * it's the wrong one. */ |
| 243 | + unsigned int c = input_cp; |
| 244 | + |
| 245 | + /* Receive wchars decoded from input string using candidate encoding. |
| 246 | + * If the string was invalid in the candidate encoding, we assume |
| 247 | + * it's the wrong one. Otherwise, give the candidate many 'demerits' |
| 248 | + * for each 'rare' codepoint found, a smaller number for each ASCII |
| 249 | + * punctuation character, and 1 for all other codepoints. |
| 250 | + * |
| 251 | + * The 'common' codepoints should cover the vast majority of |
| 252 | + * codepoints we are likely to see in practice, while only covering |
| 253 | + * a small minority of the entire Unicode encoding space. Why? |
| 254 | + * Well, if the test string happens to be valid in an incorrect |
| 255 | + * candidate encoding, the bogus codepoints which it decodes to will |
| 256 | + * be more or less random. By treating the majority of codepoints as |
| 257 | + * 'rare', we ensure that in almost all such cases, the bogus |
| 258 | + * codepoints will include plenty of 'rares', thus giving the |
| 259 | + * incorrect candidate encoding lots of demerits. See |
| 260 | + * common_codepoints.txt for the actual list used. |
| 261 | + * |
| 262 | + * So, why give extra demerits for ASCII punctuation characters? It's |
| 263 | + * because there are some text encodings, like UTF-7, HZ, and ISO-2022, |
| 264 | + * which deliberately only use bytes in the ASCII range. When |
| 265 | + * misinterpreted as ASCII/UTF-8, strings in these encodings will |
| 266 | + * have an unusually high number of ASCII punctuation characters. |
| 267 | + * So giving extra demerits for such characters will improve |
| 268 | + * detection accuracy for UTF-7 and similar encodings. |
| 269 | + * |
| 270 | + * Finally, why 1 demerit for all other characters? That penalizes |
| 271 | + * long strings, meaning we will tend to choose a candidate encoding |
| 272 | + * in which the test string decodes to a smaller number of |
| 273 | + * codepoints. That prevents single-byte encodings in which almost |
| 274 | + * every possible input byte decodes to a 'common' codepoint from |
| 275 | + * being favored too much. */ |
246 | 276 | if (c == MBFL_BAD_INPUT) {
|
247 | 277 | data->num_illegalchars++;
|
248 |
| - } else if (c < 0x9 || (c >= 0xE && c <= 0x1F) || (c >= 0xE000 && c <= 0xF8FF) || c >= 0xF0000) { |
249 |
| - /* Otherwise, count how many control characters and 'private use' |
250 |
| - * codepoints we see. Those are rarely used and may indicate that |
251 |
| - * the candidate encoding is not the right one. */ |
252 |
| - data->score += 10; |
253 |
| - } else if ((c >= 0x21 && c <= 0x2F) || (c >= 0x3A && c <= 0x40) || (c >= 0x5B && c <= 0x60)) { |
254 |
| - /* Punctuation is also less common than letters/digits; further, if |
255 |
| - * text in ISO-2022 or similar encodings is mistakenly identified as |
256 |
| - * ASCII or UTF-8, the misinterpreted string will tend to have an |
257 |
| - * unusually high density of ASCII punctuation characters. */ |
258 |
| - data->score++; |
| 278 | + } else if (c > 0xFFFF) { |
| 279 | + data->score += 40; |
| 280 | + } else if (c >= 0x21 && c <= 0x2F) { |
| 281 | + data->score += 6; |
| 282 | + } else if ((rare_codepoint_bitvec[c >> 5] >> (c & 0x1F)) & 1) { |
| 283 | + data->score += 30; |
| 284 | + } else { |
| 285 | + data->score += 1; |
259 | 286 | }
|
260 | 287 | return 0;
|
261 | 288 | }
|
|
0 commit comments