Skip to content

Commit e4ce8a9

Browse files
committed
auto merge of #12314 : huonw/rust/is_utf8_iter, r=kballard
See the commit messages for more details, but this makes `std::str::is_utf8` slightly faster and 100% non-`unsafe`, and uses a similar iterator-based check to make the first scan of `from_utf8_lossy` 100% safe and faster.
2 parents b3ed38f + a39056e commit e4ce8a9

File tree

1 file changed

+70
-45
lines changed

1 file changed

+70
-45
lines changed

src/libstd/str.rs

+70-45
Original file line number | Diff line number | Diff line change
@@ -731,29 +731,38 @@ pub fn eq(a: &~str, b: &~str) -> bool {
731731
Section: Misc
732732
*/
733733

734-
/// Determines if a vector of bytes contains valid UTF-8
735-
pub fn is_utf8(v: &[u8]) -> bool {
736-
first_non_utf8_index(v).is_none()
737-
}
738-
734+
/// Walk through `iter` checking that it's a valid UTF-8 sequence,
735+
/// returning `true` in that case, or, if it is invalid, `false` with
736+
/// `iter` reset such that it is pointing at the first byte in the
737+
/// invalid sequence.
739738
#[inline(always)]
740-
fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
741-
let mut i = 0u;
742-
let total = v.len();
743-
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
744-
unsafe { *xs.unsafe_ref(i) }
745-
}
746-
while i < total {
747-
let v_i = unsafe_get(v, i);
748-
if v_i < 128u8 {
749-
i += 1u;
750-
} else {
751-
let w = utf8_char_width(v_i);
752-
if w == 0u { return Some(i); }
739+
fn run_utf8_validation_iterator(iter: &mut vec::Items<u8>) -> bool {
740+
loop {
741+
// save the iterator's current position, i.e. the start of this codepoint.
742+
let old = *iter;
743+
744+
// restore the iterator we had at the start of this codepoint.
745+
macro_rules! err ( () => { {*iter = old; return false} });
746+
macro_rules! next ( () => {
747+
match iter.next() {
748+
Some(a) => *a,
749+
// we needed data, but there was none: error!
750+
None => err!()
751+
}
752+
});
753753

754-
let nexti = i + w;
755-
if nexti > total { return Some(i); }
754+
let first = match iter.next() {
755+
Some(&b) => b,
756+
// we're at the end of the iterator and a codepoint
757+
// boundary at the same time, so this string is valid.
758+
None => return true
759+
};
756760

761+
// ASCII characters are always valid, so only large
762+
// bytes need more examination.
763+
if first >= 128 {
764+
let w = utf8_char_width(first);
765+
let second = next!();
757766
// 2-byte encoding is for codepoints \u0080 to \u07ff
758767
// first C2 80 last DF BF
759768
// 3-byte encoding is for codepoints \u0800 to \uffff
@@ -772,35 +781,51 @@ fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
772781
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
773782
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
774783
// %xF4 %x80-8F 2( UTF8-tail )
775-
// UTF8-tail = %x80-BF
776784
match w {
777-
2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
778-
return Some(i)
779-
},
780-
3 => match (v_i,
781-
unsafe_get(v, i + 1),
782-
unsafe_get(v, i + 2) & 192u8) {
783-
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) => (),
784-
(0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (),
785-
(0xED , 0x80 .. 0x9F, TAG_CONT_U8) => (),
786-
(0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
787-
_ => return Some(i),
788-
},
789-
_ => match (v_i,
790-
unsafe_get(v, i + 1),
791-
unsafe_get(v, i + 2) & 192u8,
792-
unsafe_get(v, i + 3) & 192u8) {
793-
(0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
794-
(0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
795-
(0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (),
796-
_ => return Some(i)
797-
},
785+
2 => if second & 192 != TAG_CONT_U8 {err!()},
786+
3 => {
787+
match (first, second, next!() & 192) {
788+
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) |
789+
(0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) |
790+
(0xED , 0x80 .. 0x9F, TAG_CONT_U8) |
791+
(0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => {}
792+
_ => err!()
793+
}
794+
}
795+
4 => {
796+
match (first, second, next!() & 192, next!() & 192) {
797+
(0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
798+
(0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
799+
(0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
800+
_ => err!()
801+
}
802+
}
803+
_ => err!()
798804
}
799-
800-
i = nexti;
801805
}
802806
}
803-
None
807+
}
808+
809+
/// Determines if a vector of bytes contains valid UTF-8.
810+
pub fn is_utf8(v: &[u8]) -> bool {
811+
run_utf8_validation_iterator(&mut v.iter())
812+
}
813+
814+
#[inline(always)]
815+
fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
816+
let mut it = v.iter();
817+
818+
let ok = run_utf8_validation_iterator(&mut it);
819+
if ok {
820+
None
821+
} else {
822+
// work out how many valid bytes we've consumed
823+
// (run_utf8_validation_iterator resets the iterator to just
824+
// after the last good byte), which we can do because the
825+
// vector iterator size_hint is exact.
826+
let (remaining, _) = it.size_hint();
827+
Some(v.len() - remaining)
828+
}
804829
}
805830

806831
/// Determines if a vector of `u16` contains valid UTF-16

0 commit comments

Comments
 (0)