Skip to content

Commit a39056e

Browse files
committed
std: convert first_non_utf8_index to use the iterator.
This makes it very slightly faster, especially when the string is valid UTF-8, and completely removes the use of `unsafe` from the first half.

Before:

    from_utf8_lossy_100_ascii     ... bench: 151 ns/iter (+/- 17)
    from_utf8_lossy_100_invalid   ... bench: 447 ns/iter (+/- 33)
    from_utf8_lossy_100_multibyte ... bench: 135 ns/iter (+/- 4)
    from_utf8_lossy_invalid       ... bench: 124 ns/iter (+/- 10)

After:

    from_utf8_lossy_100_ascii     ... bench: 119 ns/iter (+/- 8)
    from_utf8_lossy_100_invalid   ... bench: 454 ns/iter (+/- 16)
    from_utf8_lossy_100_multibyte ... bench: 116 ns/iter (+/- 9)
    from_utf8_lossy_invalid       ... bench: 119 ns/iter (+/- 9)
1 parent a68d10e commit a39056e

File tree

1 file changed

+11
-61
lines changed

1 file changed

+11
-61
lines changed

src/libstd/str.rs

Lines changed: 11 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -813,69 +813,19 @@ pub fn is_utf8(v: &[u8]) -> bool {
813813

814814
#[inline(always)]
815815
fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
816-
let mut i = 0u;
817-
let total = v.len();
818-
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
819-
unsafe { *xs.unsafe_ref(i) }
820-
}
821-
while i < total {
822-
let v_i = unsafe_get(v, i);
823-
if v_i < 128u8 {
824-
i += 1u;
825-
} else {
826-
let w = utf8_char_width(v_i);
827-
if w == 0u { return Some(i); }
828-
829-
let nexti = i + w;
830-
if nexti > total { return Some(i); }
816+
let mut it = v.iter();
831817

832-
// 2-byte encoding is for codepoints \u0080 to \u07ff
833-
// first C2 80 last DF BF
834-
// 3-byte encoding is for codepoints \u0800 to \uffff
835-
// first E0 A0 80 last EF BF BF
836-
// excluding surrogates codepoints \ud800 to \udfff
837-
// ED A0 80 to ED BF BF
838-
// 4-byte encoding is for codepoints \u10000 to \u10ffff
839-
// first F0 90 80 80 last F4 8F BF BF
840-
//
841-
// Use the UTF-8 syntax from the RFC
842-
//
843-
// https://tools.ietf.org/html/rfc3629
844-
// UTF8-1 = %x00-7F
845-
// UTF8-2 = %xC2-DF UTF8-tail
846-
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
847-
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
848-
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
849-
// %xF4 %x80-8F 2( UTF8-tail )
850-
// UTF8-tail = %x80-BF
851-
match w {
852-
2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
853-
return Some(i)
854-
},
855-
3 => match (v_i,
856-
unsafe_get(v, i + 1),
857-
unsafe_get(v, i + 2) & 192u8) {
858-
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) => (),
859-
(0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (),
860-
(0xED , 0x80 .. 0x9F, TAG_CONT_U8) => (),
861-
(0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
862-
_ => return Some(i),
863-
},
864-
_ => match (v_i,
865-
unsafe_get(v, i + 1),
866-
unsafe_get(v, i + 2) & 192u8,
867-
unsafe_get(v, i + 3) & 192u8) {
868-
(0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
869-
(0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
870-
(0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (),
871-
_ => return Some(i)
872-
},
873-
}
874-
875-
i = nexti;
876-
}
818+
let ok = run_utf8_validation_iterator(&mut it);
819+
if ok {
820+
None
821+
} else {
822+
// work out how many valid bytes we've consumed
823+
// (run_utf8_validation_iterator resets the iterator to just
824+
// after the last good byte), which we can do because the
825+
// vector iterator size_hint is exact.
826+
let (remaining, _) = it.size_hint();
827+
Some(v.len() - remaining)
877828
}
878-
None
879829
}
880830

881831
/// Determines if a vector of `u16` contains valid UTF-16

0 commit comments

Comments
 (0)