Skip to content

Commit 3ab72a4

Browse files
committed
Merge branch 'PHP-8.2'
* PHP-8.2:
  Use different mblen_table for different SJIS variants
  Correct entry for 0x80,0xFD-FF in SJIS multi-byte character length table
2 parents 6f785b0 + 1751f34 commit 3ab72a4

File tree

6 files changed

+168
-31
lines changed

6 files changed

+168
-31
lines changed

ext/mbstring/libmbfl/filters/mbfilter_sjis.c

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ static void mb_wchar_to_sjis_kddi(uint32_t *in, size_t len, mb_convert_buf *buf,
6161
static size_t mb_sjis_sb_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
6262
static void mb_wchar_to_sjis_sb(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
6363

64-
const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
64+
const unsigned char mblen_table_sjis[] = { /* 0x81-0x9F,0xE0-0xEF */
6565
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6666
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6767
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -70,14 +70,52 @@ const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
7070
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
7171
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
7272
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
73+
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
7374
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
75+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
76+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
77+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
78+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
79+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
80+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
81+
};
82+
83+
const unsigned char mblen_table_sjismac[] = { /* 0x81-0x9F,0xE0-0xED */
84+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
85+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
86+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
87+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
88+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
89+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
90+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
91+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
92+
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
93+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
94+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
95+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
96+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
97+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
98+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
99+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
100+
};
101+
102+
const unsigned char mblen_table_sjis_mobile[] = { /* 0x81-0x9F,0xE0-0xFC */
103+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
104+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
105+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
107+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
108+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
109+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
111+
1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
74112
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
75113
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
76114
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
77115
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
78116
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
79117
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
80-
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
118+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
81119
};
82120

83121
static const char *mbfl_encoding_sjis_aliases[] = {"x-sjis", "SHIFT-JIS", NULL};
@@ -122,7 +160,7 @@ const mbfl_encoding mbfl_encoding_sjis_mac = {
122160
"SJIS-mac",
123161
"Shift_JIS",
124162
mbfl_encoding_sjis_mac_aliases,
125-
mblen_table_sjis,
163+
mblen_table_sjismac,
126164
MBFL_ENCTYPE_GL_UNSAFE,
127165
&vtbl_sjis_mac_wchar,
128166
&vtbl_wchar_sjis_mac,
@@ -159,7 +197,7 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = {
159197
"SJIS-Mobile#DOCOMO",
160198
"Shift_JIS",
161199
mbfl_encoding_sjis_docomo_aliases,
162-
mblen_table_sjis,
200+
mblen_table_sjis_mobile,
163201
MBFL_ENCTYPE_GL_UNSAFE,
164202
&vtbl_sjis_docomo_wchar,
165203
&vtbl_wchar_sjis_docomo,
@@ -172,7 +210,7 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = {
172210
"SJIS-Mobile#KDDI",
173211
"Shift_JIS",
174212
mbfl_encoding_sjis_kddi_aliases,
175-
mblen_table_sjis,
213+
mblen_table_sjis_mobile,
176214
MBFL_ENCTYPE_GL_UNSAFE,
177215
&vtbl_sjis_kddi_wchar,
178216
&vtbl_wchar_sjis_kddi,
@@ -185,7 +223,7 @@ const mbfl_encoding mbfl_encoding_sjis_sb = {
185223
"SJIS-Mobile#SOFTBANK",
186224
"Shift_JIS",
187225
mbfl_encoding_sjis_sb_aliases,
188-
mblen_table_sjis,
226+
mblen_table_sjis_mobile,
189227
MBFL_ENCTYPE_GL_UNSAFE,
190228
&vtbl_sjis_sb_wchar,
191229
&vtbl_wchar_sjis_sb,

ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
#include "unicode_table_jis2004.h"
4242
#include "unicode_table_jis.h"
4343

44-
extern const unsigned char mblen_table_sjis[];
44+
extern const unsigned char mblen_table_sjis_mobile[];
4545
extern const unsigned char mblen_table_eucjp[];
4646

4747
static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
@@ -62,7 +62,7 @@ const mbfl_encoding mbfl_encoding_sjis2004 = {
6262
"SJIS-2004",
6363
"Shift_JIS",
6464
mbfl_encoding_sjis2004_aliases,
65-
mblen_table_sjis,
65+
mblen_table_sjis_mobile, /* Leading byte values used for SJIS-2004 are the same as mobile SJIS variants */
6666
MBFL_ENCTYPE_GL_UNSAFE,
6767
&vtbl_sjis2004_wchar,
6868
&vtbl_wchar_sjis2004,

ext/mbstring/tests/mb_str_split_jp.phpt

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,17 @@ if(end($array) !== $enc){
6969
last array element: %s expected: %s\n", unpack("H*", end($array))[1],unpack("H*", $enc)[1]);
7070
}
7171

72+
/* SJIS byte 0x80 was previously wrongly treated as the starting byte for a 2-byte character */
73+
echo "== Regression test for SJIS byte 0x80 ==\n";
74+
foreach (['SJIS', 'SJIS-2004', 'MacJapanese', 'SJIS-Mobile#DOCOMO', 'SJIS-Mobile#KDDI', 'SJIS-Mobile#SoftBank'] as $encoding) {
75+
$array = mb_str_split("\x80\xA1abc\x80\xA1", 2, $encoding);
76+
echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";
77+
78+
// Also try bytes 0xFD, 0xFE, and 0xFF
79+
$array = mb_str_split("abc\xFD\xFE\xFFab\xFD\xFE\xFF", 2, $encoding);
80+
echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";
81+
}
82+
7283
?>
7384
--EXPECT--
7485
BIG-5: a4e9 a5bb
@@ -80,3 +91,16 @@ UTF-16LE: e565 2c67
8091
UTF-32BE: 000065e5 0000672c
8192
UTF-32LE: e5650000 2c670000
8293
UTF-8: e697a5 e69cac
94+
== Regression test for SJIS byte 0x80 ==
95+
SJIS: [80a1, 6162, 6380, a1]
96+
SJIS: [6162, 63fd, feff, 6162, fdfe, ff]
97+
SJIS-2004: [80a1, 6162, 6380, a1]
98+
SJIS-2004: [6162, 63fd, feff, 6162, fdfe, ff]
99+
MacJapanese: [80a1, 6162, 6380, a1]
100+
MacJapanese: [6162, 63fd, feff, 6162, fdfe, ff]
101+
SJIS-Mobile#DOCOMO: [80a1, 6162, 6380, a1]
102+
SJIS-Mobile#DOCOMO: [6162, 63fd, feff, 6162, fdfe, ff]
103+
SJIS-Mobile#KDDI: [80a1, 6162, 6380, a1]
104+
SJIS-Mobile#KDDI: [6162, 63fd, feff, 6162, fdfe, ff]
105+
SJIS-Mobile#SoftBank: [80a1, 6162, 6380, a1]
106+
SJIS-Mobile#SoftBank: [6162, 63fd, feff, 6162, fdfe, ff]

ext/mbstring/tests/mb_strlen.phpt

Lines changed: 53 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,43 +13,59 @@ include_once('common.inc');
1313
mb_detect_order('auto');
1414

1515
// Test string
16-
$euc_jp = '0123この文字列は日本語です。EUC-JPを使っています。0123日本語は面倒臭い。';
16+
$euc_jp = mb_convert_encoding("0123この文字列は日本語です。EUC-JPを使っています。0123日本語は面倒臭い。", 'EUC-JP', 'UTF-8');
1717
$ascii = 'abcdefghijklmnopqrstuvwxyz;]=#0123456789';
1818

19-
// ASCII
2019
echo "== ASCII ==\n";
21-
print mb_strlen($ascii,'ASCII') . "\n";
22-
print strlen($ascii) . "\n";
20+
print mb_strlen($ascii,'ASCII') . "\n";
21+
print strlen($ascii) . "\n";
2322

24-
// EUC-JP
2523
echo "== EUC-JP ==\n";
26-
print mb_strlen($euc_jp,'EUC-JP') . "\n";
24+
print mb_strlen($euc_jp,'EUC-JP') . "\n";
2725
mb_internal_encoding('EUC-JP') or print("mb_internal_encoding() failed\n");
28-
print strlen($euc_jp) . "\n";
26+
print strlen($euc_jp) . "\n";
2927

30-
// SJIS
3128
echo "== SJIS ==\n";
3229
$sjis = mb_convert_encoding($euc_jp, 'SJIS','EUC-JP');
33-
print mb_strlen($sjis,'SJIS') . "\n";
30+
print mb_strlen($sjis,'SJIS') . "\n";
3431
mb_internal_encoding('SJIS') or print("mb_internal_encoding() failed\n");
35-
print strlen($sjis) . "\n";
32+
print strlen($sjis) . "\n";
33+
print "-- Testing illegal bytes 0x80,0xFD-FF --\n";
34+
// mb_strlen used to wrongly treat 0x80 as the starting byte of a 2-byte SJIS character
35+
print mb_strlen("\x80\xA1", 'SJIS') . "\n";
36+
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS') . "\n";
37+
38+
echo "== MacJapanese ==\n";
39+
print mb_strlen("\x80\xA1", 'MacJapanese') . "\n";
40+
print mb_strlen("abc\xFD\xFE\xFF", 'MacJapanese') . "\n";
41+
42+
echo "== SJIS-2004 ==\n";
43+
print mb_strlen("\x80\xA1", 'SJIS-2004') . "\n";
44+
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-2004') . "\n";
45+
46+
echo "== SJIS-Mobile#DOCOMO ==\n";
47+
print mb_strlen("\x80\xA1", 'SJIS-Mobile#DOCOMO') . "\n";
48+
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#DOCOMO') . "\n";
49+
50+
echo "== SJIS-Mobile#KDDI ==\n";
51+
print mb_strlen("\x80\xA1", 'SJIS-Mobile#KDDI') . "\n";
52+
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#KDDI') . "\n";
53+
54+
echo "== SJIS-Mobile#SoftBank ==\n";
55+
print mb_strlen("\x80\xA1", 'SJIS-Mobile#SoftBank') . "\n";
56+
print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#SoftBank') . "\n";
3657

37-
// JIS
38-
// Note: either convert_encoding or strlen has problem
3958
echo "== JIS ==\n";
4059
$jis = mb_convert_encoding($euc_jp, 'JIS','EUC-JP');
41-
print mb_strlen($jis,'JIS') . "\n";
60+
print mb_strlen($jis,'JIS') . "\n";
4261
mb_internal_encoding('JIS') or print("mb_internal_encoding() failed\n");
43-
print strlen($jis) . "\n";
62+
print strlen($jis) . "\n";
4463

45-
// UTF-8
46-
// Note: either convert_encoding or strlen has problem
4764
echo "== UTF-8 ==\n";
4865
$utf8 = mb_convert_encoding($euc_jp, 'UTF-8','EUC-JP');
49-
print mb_strlen($utf8,'UTF-8') . "\n";
66+
print mb_strlen($utf8,'UTF-8') . "\n";
5067
mb_internal_encoding('UTF-8') or print("mb_internal_encoding() failed\n");
51-
print strlen($utf8) . "\n";
52-
68+
print strlen($utf8) . "\n";
5369

5470
// Wrong Parameters
5571
echo "== WRONG PARAMETERS ==\n";
@@ -72,6 +88,24 @@ try {
7288
== SJIS ==
7389
43
7490
72
91+
-- Testing illegal bytes 0x80,0xFD-FF --
92+
2
93+
6
94+
== MacJapanese ==
95+
2
96+
6
97+
== SJIS-2004 ==
98+
2
99+
6
100+
== SJIS-Mobile#DOCOMO ==
101+
2
102+
6
103+
== SJIS-Mobile#KDDI ==
104+
2
105+
6
106+
== SJIS-Mobile#SoftBank ==
107+
2
108+
6
75109
== JIS ==
76110
43
77111
90

ext/mbstring/tests/mb_strstr.phpt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,4 @@ string(18) "おかきくけこ"
3737
string(18) "おかきくけこ"
3838
string(12) "あいうえ"
3939
string(4) "dd00"
40-
string(0) ""
40+
string(2) "00"

ext/mbstring/tests/mb_substr.phpt

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,13 @@ ini_set('include_path','.');
88
include_once('common.inc');
99

1010
// EUC-JP
11-
$euc_jp = "0123\xA4\xB3\xA4\xCE\xCA\xB8\xBB\xFA\xCE\xF3\xA4\xCF\xC6\xFC\xCB\xDC\xB8\xEC\xA4\xC7\xA4\xB9\xA1\xA3EUC-JP\xA4\xF2\xBB\xC8\xA4\xC3\xA4\xC6\xA4\xA4\xA4\xDE\xA4\xB9\xA1\xA3\xC6\xFC\xCB\xDC\xB8\xEC\xA4\xCF\xCC\xCC\xC5\xDD\xBD\xAD\xA4\xA4\xA1\xA3";
11+
$euc_jp = mb_convert_encoding('0123この文字列は日本語です。EUC-JPを使っています。日本語は面倒臭い。', 'EUC-JP', 'UTF-8');
1212
// SJIS
13-
$sjis = "\x93\xFA\x96{\x8C\xEA\x83e\x83L\x83X\x83g\x82\xC5\x82\xB7\x81B01234\x82T\x82U\x82V\x82W\x82X\x81B";
13+
$sjis = mb_convert_encoding('日本語テキストです。01234５６７８９。', 'SJIS', 'UTF-8');
1414
// ISO-2022-JP
1515
$iso2022jp = "\x1B\$B\x21\x21!r\x1B(BABC";
1616
// GB-18030
17-
$gb18030 = "\xC3\xDC\xC2\xEB\xD3\xC3\xBB\xA7\xC3\xFB\xC3\xDC\xC2\xEB\xC3\xFB\xB3\xC6\xC3\xFB\xB3\xC6";
17+
$gb18030 = mb_convert_encoding('密码用户名密码名称名称', 'GB18030', 'UTF-8');
1818
// HZ
1919
$hz = "The next sentence is in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye.";
2020
// UTF-8
@@ -40,6 +40,29 @@ print "2: " . bin2hex(mb_substr($sjis, -1, null, 'SJIS')) . "\n";
4040
print "3: " . bin2hex(mb_substr($sjis, -5, 3, 'SJIS')) . "\n";
4141
print "4: " . bin2hex(mb_substr($sjis, 1, null, 'SJIS')) . "\n";
4242
print "5:" . bin2hex(mb_substr($sjis, 10, 0, 'SJIS')) . "\n";
43+
echo "-- Testing illegal SJIS byte 0x80 --\n";
44+
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS')) . "\n";
45+
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS')) . "\n";
46+
47+
echo "SJIS-2004:\n";
48+
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-2004')) . "\n";
49+
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-2004')) . "\n";
50+
51+
echo "MacJapanese:\n";
52+
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'MacJapanese')) . "\n";
53+
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'MacJapanese')) . "\n";
54+
55+
echo "SJIS-Mobile#DOCOMO:\n";
56+
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#DOCOMO')) . "\n";
57+
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#DOCOMO')) . "\n";
58+
59+
echo "SJIS-Mobile#KDDI:\n";
60+
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#KDDI')) . "\n";
61+
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#KDDI')) . "\n";
62+
63+
echo "SJIS-Mobile#SoftBank:\n";
64+
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#SoftBank')) . "\n";
65+
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#SoftBank')) . "\n";
4366

4467
echo "ISO-2022-JP:\n";
4568
print "1: " . bin2hex(mb_substr($iso2022jp, 0, 3, 'ISO-2022-JP')) . "\n";
@@ -104,6 +127,24 @@ SJIS:
104127
3: 825582568257
105128
4: 967b8cea8365834c8358836782c582b781423031323334825482558256825782588142
106129
5:
130+
-- Testing illegal SJIS byte 0x80 --
131+
6380
132+
806162
133+
SJIS-2004:
134+
6380
135+
806162
136+
MacJapanese:
137+
6380
138+
806162
139+
SJIS-Mobile#DOCOMO:
140+
6380
141+
806162
142+
SJIS-Mobile#KDDI:
143+
6380
144+
806162
145+
SJIS-Mobile#SoftBank:
146+
6380
147+
806162
107148
ISO-2022-JP:
108149
1: 1b2442212121721b284241
109150
2: 43

0 commit comments

Comments
 (0)