Skip to content

std: Speed up str::is_utf8 #8237

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 67 additions & 35 deletions src/libstd/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -564,51 +564,63 @@ fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool {
Section: Misc
*/

// Return the initial codepoint accumulator for the first byte.
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
// for width 3, and 3 bits for width 4
macro_rules! utf8_first_byte(
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint)
)

// return the value of $ch updated with continuation byte $byte
macro_rules! utf8_acc_cont_byte(
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint)
)

/// Determines if a vector of bytes contains valid UTF-8
pub fn is_utf8(v: &[u8]) -> bool {
let mut i = 0u;
let total = v.len();
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
unsafe { *xs.unsafe_ref(i) }
}
while i < total {
if v[i] < 128u8 {
let v_i = unsafe_get(v, i);
if v_i < 128u8 {
i += 1u;
} else {
let w = utf8_char_width(v[i]);
let w = utf8_char_width(v_i);
if w == 0u { return false; }

let nexti = i + w;
if nexti > total { return false; }
// 1. Make sure the correct number of continuation bytes are present
// 2. Check codepoint ranges (deny overlong encodings)
// 2-byte encoding is for codepoints \u0080 to \u07ff
// 3-byte encoding is for codepoints \u0800 to \uffff
// 4-byte encoding is for codepoints \u10000 to \u10ffff

// 2-byte encodings are correct if the width and continuation match up
if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; }
if w > 2 {
let mut ch;
ch = utf8_first_byte!(v[i], w);
ch = utf8_acc_cont_byte!(ch, v[i + 1]);
if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; }
ch = utf8_acc_cont_byte!(ch, v[i + 2]);
if w == 3 && ch < MAX_TWO_B { return false; }
if w > 3 {
if v[i + 3] & 192u8 != TAG_CONT_U8 { return false; }
ch = utf8_acc_cont_byte!(ch, v[i + 3]);
if ch < MAX_THREE_B || ch >= MAX_UNICODE { return false; }
}

// 2-byte encoding is for codepoints \u0080 to \u07ff
// first C2 80 last DF BF
// 3-byte encoding is for codepoints \u0800 to \uffff
// first E0 A0 80 last EF BF BF
// 4-byte encoding is for codepoints \u10000 to \u10ffff
// first F0 90 80 80 last F4 8F BF BF
//
// Use the UTF-8 syntax from the RFC
//
// https://tools.ietf.org/html/rfc3629
// UTF8-1 = %x00-7F
// UTF8-2 = %xC2-DF UTF8-tail
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
// %xF4 %x80-8F 2( UTF8-tail )
// UTF8-tail = %x80-BF
// --
// This code allows surrogate pairs: \uD800 to \uDFFF -> ED A0 80 to ED BF BF
match w {
2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
return false
},
3 => match (v_i,
unsafe_get(v, i + 1),
unsafe_get(v, i + 2) & 192u8) {
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) => (),
(0xE1 .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
_ => return false,
},
_ => match (v_i,
unsafe_get(v, i + 1),
unsafe_get(v, i + 2) & 192u8,
unsafe_get(v, i + 3) & 192u8) {
(0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
(0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
(0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (),
_ => return false,
},
}

i = nexti;
Expand Down Expand Up @@ -756,6 +768,18 @@ pub struct CharRange {
next: uint
}

// Return the initial codepoint accumulator for the first byte.
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
// for width 3, and 3 bits for width 4
macro_rules! utf8_first_byte(
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint)
)

// return the value of $ch updated with continuation byte $byte
macro_rules! utf8_acc_cont_byte(
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint)
)

// UTF-8 tags and ranges
priv static TAG_CONT_U8: u8 = 128u8;
priv static TAG_CONT: uint = 128u;
Expand Down Expand Up @@ -2833,13 +2857,21 @@ mod tests {
}

#[test]
fn test_is_utf8_deny_overlong() {
fn test_is_utf8() {
assert!(!is_utf8([0xc0, 0x80]));
assert!(!is_utf8([0xc0, 0xae]));
assert!(!is_utf8([0xe0, 0x80, 0x80]));
assert!(!is_utf8([0xe0, 0x80, 0xaf]));
assert!(!is_utf8([0xe0, 0x81, 0x81]));
assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));

assert!(is_utf8([0xC2, 0x80]));
assert!(is_utf8([0xDF, 0xBF]));
assert!(is_utf8([0xE0, 0xA0, 0x80]));
assert!(is_utf8([0xEF, 0xBF, 0xBF]));
assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
}


Expand Down