Skip to content

Commit 9962aa9

Browse files
committed
Merge branch 'PHP-8.1'
* PHP-8.1:
  - mb_detect_encoding will not return non-encodings
  - Improve detection accuracy of mb_detect_encoding
2 parents 6b2b4bb + a2bc57e commit 9962aa9

File tree

8 files changed

+7585
-71
lines changed

8 files changed

+7585
-71
lines changed

ext/mbstring/common_codepoints.txt

Lines changed: 6994 additions & 0 deletions
Large diffs are not rendered by default.

ext/mbstring/gen_rare_cp_bitvec.php

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#!/usr/bin/env php
<?php

/* Generates rare_cp_bitvec.h (written next to this script) from a text file
 * listing ranges of 'common' codepoints.
 *
 * Each non-empty input line holds two hexadecimal codepoints separated by a
 * tab: the first and the last codepoint of an inclusive range. Anything which
 * follows a '#' on a line is ignored. Every codepoint from U+0000 to U+FFFF
 * which is not covered by some range keeps its bit set to 1 in the output. */

if ($argc < 2) {
	echo "Usage: php gen_rare_cp_bitvec.php ./common_codepoints.txt\n";
	return;
}

$maxCodepoint = 0xFFFF;

/* One 32-bit word for every 32 codepoints. Integer division keeps the count
 * an int; passing a float with a fractional part where an int is expected is
 * deprecated since PHP 8.1. */
$numWords = intdiv($maxCodepoint, 32) + 1;

/* Start with every codepoint marked; ranges from the input clear bits */
$bitvec = array_fill(0, $numWords, 0xFFFFFFFF);

$input = file_get_contents($argv[1]);
if ($input === false) {
	/* Without this check, an unreadable file would silently produce a header
	 * in which every single codepoint is marked */
	fwrite(STDERR, "Could not read " . $argv[1] . "\n");
	exit(1);
}

foreach (explode("\n", $input) as $lineIndex => $line) {
	/* Strip trailing comment, if any */
	$hashPos = strpos($line, '#');
	if ($hashPos !== false) {
		$line = substr($line, 0, $hashPos);
	}

	$line = trim($line);
	if ($line === '') {
		continue;
	}

	$range = explode("\t", $line);
	if (count($range) < 2) {
		fwrite(STDERR, "Malformed range on line " . ($lineIndex + 1) . " (expected START<TAB>END)\n");
		exit(1);
	}

	$start = hexdec($range[0]);
	/* The bit vector only covers U+0000..U+FFFF, so anything beyond that
	 * cannot be recorded and is left out */
	$end = min(hexdec($range[1]), $maxCodepoint);

	for ($i = $start; $i <= $end; $i++) {
		$bitvec[$i >> 5] &= ~(1 << ($i & 0x1F));
	}
}

$result = <<<'HEADER'
/* Machine-generated file; do not edit! See gen_rare_cp_bitvec.php.
*
* The below array has one bit for each Unicode codepoint from U+0000 to U+FFFF.
* The bit is 1 if the codepoint is considered 'rare' for the purpose of
* guessing the text encoding of a string.
*
* Each 'rare' codepoint which appears in a string when it is interpreted
* using a candidate encoding causes the candidate encoding to be treated
* as less likely to be the correct one.
*/

static uint32_t rare_codepoint_bitvec[] = {
HEADER;

/* Eight words per output line */
for ($i = 0; $i < $numWords; $i++) {
	if ($i % 8 === 0) {
		$result .= "\n";
	} else {
		$result .= " ";
	}

	$result .= "0x" . str_pad(dechex($bitvec[$i]), 8, '0', STR_PAD_LEFT) . ",";
}

$result .= "\n};\n";

if (file_put_contents(__DIR__ . '/rare_cp_bitvec.h', $result) === false) {
	fwrite(STDERR, "Could not write rare_cp_bitvec.h\n");
	exit(1);
}

echo "Done.\n";
?>

ext/mbstring/libmbfl/mbfl/mbfilter.c

Lines changed: 43 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@
9595
#include "filters/mbfilter_utf8.h"
9696

9797
#include "eaw_table.h"
98+
#include "rare_cp_bitvec.h"
9899

99100
/* hex character table "0123456789ABCDEF" */
100101
static char mbfl_hexchar_table[] = {
@@ -236,26 +237,52 @@ size_t mbfl_buffer_illegalchars(mbfl_buffer_converter *convd)
236237
/*
237238
* encoding detector
238239
*/
239-
static int mbfl_estimate_encoding_likelihood(int c, void *void_data)
240+
static int mbfl_estimate_encoding_likelihood(int input_cp, void *void_data)
240241
{
241242
mbfl_encoding_detector_data *data = void_data;
242-
243-
/* Receive wchars decoded from test string using candidate encoding
244-
* If the test string was invalid in the candidate encoding, we assume
245-
* it's the wrong one. */
243+
unsigned int c = input_cp;
244+
245+
/* Receive wchars decoded from input string using candidate encoding.
246+
* If the string was invalid in the candidate encoding, we assume
247+
* it's the wrong one. Otherwise, give the candidate many 'demerits'
248+
* for each 'rare' codepoint found, a smaller number for each ASCII
249+
* punctuation character, and 1 for all other codepoints.
250+
*
251+
* The 'common' codepoints should cover the vast majority of
252+
* codepoints we are likely to see in practice, while only covering
253+
* a small minority of the entire Unicode encoding space. Why?
254+
* Well, if the test string happens to be valid in an incorrect
255+
* candidate encoding, the bogus codepoints which it decodes to will
256+
* be more or less random. By treating the majority of codepoints as
257+
* 'rare', we ensure that in almost all such cases, the bogus
258+
* codepoints will include plenty of 'rares', thus giving the
259+
* incorrect candidate encoding lots of demerits. See
260+
* common_codepoints.txt for the actual list used.
261+
*
262+
* So, why give extra demerits for ASCII punctuation characters? It's
263+
* because there are some text encodings, like UTF-7, HZ, and ISO-2022,
264+
* which deliberately only use bytes in the ASCII range. When
265+
* misinterpreted as ASCII/UTF-8, strings in these encodings will
266+
* have an unusually high number of ASCII punctuation characters.
267+
* So giving extra demerits for such characters will improve
268+
* detection accuracy for UTF-7 and similar encodings.
269+
*
270+
* Finally, why 1 demerit for all other characters? That penalizes
271+
* long strings, meaning we will tend to choose a candidate encoding
272+
* in which the test string decodes to a smaller number of
273+
* codepoints. That prevents single-byte encodings in which almost
274+
* every possible input byte decodes to a 'common' codepoint from
275+
* being favored too much. */
246276
if (c == MBFL_BAD_INPUT) {
247277
data->num_illegalchars++;
248-
} else if (c < 0x9 || (c >= 0xE && c <= 0x1F) || (c >= 0xE000 && c <= 0xF8FF) || c >= 0xF0000) {
249-
/* Otherwise, count how many control characters and 'private use'
250-
* codepoints we see. Those are rarely used and may indicate that
251-
* the candidate encoding is not the right one. */
252-
data->score += 10;
253-
} else if ((c >= 0x21 && c <= 0x2F) || (c >= 0x3A && c <= 0x40) || (c >= 0x5B && c <= 0x60)) {
254-
/* Punctuation is also less common than letters/digits; further, if
255-
* text in ISO-2022 or similar encodings is mistakenly identified as
256-
* ASCII or UTF-8, the misinterpreted string will tend to have an
257-
* unusually high density of ASCII punctuation characters. */
258-
data->score++;
278+
} else if (c > 0xFFFF) {
279+
data->score += 40;
280+
} else if (c >= 0x21 && c <= 0x2F) {
281+
data->score += 6;
282+
} else if ((rare_codepoint_bitvec[c >> 5] >> (c & 0x1F)) & 1) {
283+
data->score += 30;
284+
} else {
285+
data->score += 1;
259286
}
260287
return 0;
261288
}

ext/mbstring/mbstring.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2662,6 +2662,23 @@ PHP_FUNCTION(mb_strtolower)
26622662
}
26632663
/* }}} */
26642664

2665+
static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2666+
{
2667+
/* mbstring supports some 'text encodings' which aren't really text encodings
2668+
* at all, but really 'byte encodings', like Base64, QPrint, and so on.
2669+
* These should never be returned by `mb_detect_encoding`. */
2670+
int shift = 0;
2671+
for (int i = 0; i < *size; i++) {
2672+
const mbfl_encoding *encoding = elist[i];
2673+
if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2674+
shift++; /* Remove this encoding from the list */
2675+
} else if (shift) {
2676+
elist[i - shift] = encoding;
2677+
}
2678+
}
2679+
*size -= shift;
2680+
}
2681+
26652682
/* {{{ Encodings of the given string is returned (as a string) */
26662683
PHP_FUNCTION(mb_detect_encoding)
26672684
{
@@ -2707,6 +2724,14 @@ PHP_FUNCTION(mb_detect_encoding)
27072724
RETURN_THROWS();
27082725
}
27092726

2727+
if (free_elist) {
2728+
remove_non_encodings_from_elist(elist, &size);
2729+
if (size == 0) {
2730+
efree(ZEND_VOIDP(elist));
2731+
RETURN_FALSE;
2732+
}
2733+
}
2734+
27102735
if (ZEND_NUM_ARGS() < 3) {
27112736
strict = MBSTRG(strict_detection);
27122737
}

0 commit comments

Comments
 (0)