Skip to content

Commit 4dfbefb

Browse files
committed
Add ASCII check for mb_check_encoding when UTF-8
1 parent ec0028e commit 4dfbefb

File tree

1 file changed

+28
-5
lines changed

1 file changed

+28
-5
lines changed

ext/mbstring/mbstring.c

Lines changed: 28 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4788,6 +4788,7 @@ bool utf8_range(const unsigned char *data, size_t len)
47884788
const uint8x16_t const_1 = vdupq_n_u8(1);
47894789
const uint8x16_t const_2 = vdupq_n_u8(2);
47904790
const uint8x16_t const_e0 = vdupq_n_u8(0xE0);
4791+
const uint8x16_t const_7f = vdupq_n_u8(0x7F);
47914792

47924793
/* We use two error registers to remove a dependency. */
47934794
uint8x16_t error1 = vdupq_n_u8(0);
@@ -4799,6 +4800,29 @@ bool utf8_range(const unsigned char *data, size_t len)
47994800
const uint8x16_t input_3 = vld1q_u8(data + 32);
48004801
const uint8x16_t input_4 = vld1q_u8(data + 48);
48014802

4803+
uint64_t ascii_paired = vgetq_lane_u64(vreinterpretq_u64_u8(prev_first_len), 0);
4804+
if (ascii_paired == 0) {
4805+
uint8x16_t is_ascii_0 = vorrq_u8(input_1, input_2);
4806+
is_ascii_0 = vorrq_u8(is_ascii_0, input_3);
4807+
is_ascii_0 = vorrq_u8(is_ascii_0, input_4);
4808+
4809+
uint8x16_t is_ascii = vqsubq_u8(is_ascii_0, const_7f);
4810+
uint64_t is_ascii_paired = vgetq_lane_u64(vreinterpretq_u64_u8(is_ascii), 0);
4811+
4812+
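/* ASCII fast path: the low 64 bits of the OR of all four inputs hold no byte above 0x7F, so skip the full range checks and only carry prev_input / prev_first_len forward */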
4813+
if (is_ascii_paired == 0) {
4814+
const uint8x16_t high_nibbles_4 = vshrq_n_u8(input_4, 4);
4815+
const uint8x16_t first_len_4 = vqtbl1q_u8(first_len_tbl, high_nibbles_4);
4816+
4817+
prev_input = input_4;
4818+
prev_first_len = first_len_4;
4819+
4820+
data += 64;
4821+
len -= 64;
4822+
continue;
4823+
}
4824+
}
4825+
48024826
/* high_nibbles = input >> 4 */
48034827
const uint8x16_t high_nibbles_1 = vshrq_n_u8(input_1, 4);
48044828
const uint8x16_t high_nibbles_2 = vshrq_n_u8(input_2, 4);
@@ -4947,8 +4971,7 @@ bool utf8_range(const unsigned char *data, size_t len)
49474971
/* first_len = legal character length minus 1 */
49484972
/* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
49494973
/* first_len = first_len_tbl[high_nibbles] */
4950-
const uint8x16_t first_len =
4951-
vqtbl1q_u8(first_len_tbl, high_nibbles);
4974+
const uint8x16_t first_len = vqtbl1q_u8(first_len_tbl, high_nibbles);
49524975

49534976
/* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */
49544977
/* range = first_range_tbl[high_nibbles] */
@@ -4957,8 +4980,7 @@ bool utf8_range(const unsigned char *data, size_t len)
49574980
/* Second Byte: set range index to first_len */
49584981
/* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
49594982
/* range |= (first_len, prev_first_len) << 1 byte */
4960-
range =
4961-
vorrq_u8(range, vextq_u8(prev_first_len, first_len, 15));
4983+
range = vorrq_u8(range, vextq_u8(prev_first_len, first_len, 15));
49624984

49634985
/* Third Byte: set range index to saturate_sub(first_len, 1) */
49644986
/* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */
@@ -5033,8 +5055,9 @@ bool utf8_range(const unsigned char *data, size_t len)
50335055
/* Merge our error counters together */
50345056
error1 = vorrq_u8(error1, error2);
50355057

5058+
uint64_t error_raw_last = vgetq_lane_u64(vreinterpretq_u64_u8(error1), 0);
50365059
/* Delay error check till loop ends */
5037-
if (vmaxvq_u8(error1)) {
5060+
if (error_raw_last != 0) {
50385061
return false;
50395062
}
50405063

0 commit comments

Comments
 (0)