@@ -731,29 +731,38 @@ pub fn eq(a: &~str, b: &~str) -> bool {
731
731
Section: Misc
732
732
*/
733
733
734
- /// Determines if a vector of bytes contains valid UTF-8
735
- pub fn is_utf8 ( v : & [ u8 ] ) -> bool {
736
- first_non_utf8_index ( v) . is_none ( )
737
- }
738
-
734
+ /// Walk through `iter` checking that it's a valid UTF-8 sequence,
735
+ /// returning `true` in that case, or, if it is invalid, `false` with
736
+ /// `iter` reset such that it is pointing at the first byte in the
737
+ /// invalid sequence.
739
738
#[ inline( always) ]
740
- fn first_non_utf8_index ( v : & [ u8 ] ) -> Option < uint > {
741
- let mut i = 0 u;
742
- let total = v. len ( ) ;
743
- fn unsafe_get ( xs : & [ u8 ] , i : uint ) -> u8 {
744
- unsafe { * xs. unsafe_ref ( i) }
745
- }
746
- while i < total {
747
- let v_i = unsafe_get ( v, i) ;
748
- if v_i < 128u8 {
749
- i += 1 u;
750
- } else {
751
- let w = utf8_char_width ( v_i) ;
752
- if w == 0 u { return Some ( i) ; }
739
+ fn run_utf8_validation_iterator ( iter : & mut vec:: Items < u8 > ) -> bool {
740
+ loop {
741
+ // save the current thing we're pointing at.
742
+ let old = * iter;
743
+
744
+ // restore the iterator we had at the start of this codepoint.
745
+ macro_rules! err ( ( ) => { { * iter = old; return false } } ) ;
746
+ macro_rules! next ( ( ) => {
747
+ match iter. next( ) {
748
+ Some ( a) => * a,
749
+ // we needed data, but there was none: error!
750
+ None => err!( )
751
+ }
752
+ } ) ;
753
753
754
- let nexti = i + w;
755
- if nexti > total { return Some ( i) ; }
754
+ let first = match iter. next ( ) {
755
+ Some ( & b) => b,
756
+ // we're at the end of the iterator and a codepoint
757
+ // boundary at the same time, so this string is valid.
758
+ None => return true
759
+ } ;
756
760
761
+ // ASCII characters are always valid, so only large
762
+ // bytes need more examination.
763
+ if first >= 128 {
764
+ let w = utf8_char_width ( first) ;
765
+ let second = next ! ( ) ;
757
766
// 2-byte encoding is for codepoints \u0080 to \u07ff
758
767
// first C2 80 last DF BF
759
768
// 3-byte encoding is for codepoints \u0800 to \uffff
@@ -772,35 +781,51 @@ fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
772
781
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
773
782
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
774
783
// %xF4 %x80-8F 2( UTF8-tail )
775
- // UTF8-tail = %x80-BF
776
784
match w {
777
- 2 => if unsafe_get ( v, i + 1 ) & 192u8 != TAG_CONT_U8 {
778
- return Some ( i)
779
- } ,
780
- 3 => match ( v_i,
781
- unsafe_get ( v, i + 1 ) ,
782
- unsafe_get ( v, i + 2 ) & 192u8 ) {
783
- ( 0xE0 , 0xA0 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
784
- ( 0xE1 .. 0xEC , 0x80 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
785
- ( 0xED , 0x80 .. 0x9F , TAG_CONT_U8 ) => ( ) ,
786
- ( 0xEE .. 0xEF , 0x80 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
787
- _ => return Some ( i) ,
788
- } ,
789
- _ => match ( v_i,
790
- unsafe_get ( v, i + 1 ) ,
791
- unsafe_get ( v, i + 2 ) & 192u8 ,
792
- unsafe_get ( v, i + 3 ) & 192u8 ) {
793
- ( 0xF0 , 0x90 .. 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) => ( ) ,
794
- ( 0xF1 .. 0xF3 , 0x80 .. 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) => ( ) ,
795
- ( 0xF4 , 0x80 .. 0x8F , TAG_CONT_U8 , TAG_CONT_U8 ) => ( ) ,
796
- _ => return Some ( i)
797
- } ,
785
+ 2 => if second & 192 != TAG_CONT_U8 { err ! ( ) } ,
786
+ 3 => {
787
+ match ( first, second, next ! ( ) & 192 ) {
788
+ ( 0xE0 , 0xA0 .. 0xBF , TAG_CONT_U8 ) |
789
+ ( 0xE1 .. 0xEC , 0x80 .. 0xBF , TAG_CONT_U8 ) |
790
+ ( 0xED , 0x80 .. 0x9F , TAG_CONT_U8 ) |
791
+ ( 0xEE .. 0xEF , 0x80 .. 0xBF , TAG_CONT_U8 ) => { }
792
+ _ => err ! ( )
793
+ }
794
+ }
795
+ 4 => {
796
+ match ( first, second, next ! ( ) & 192 , next ! ( ) & 192 ) {
797
+ ( 0xF0 , 0x90 .. 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) |
798
+ ( 0xF1 .. 0xF3 , 0x80 .. 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) |
799
+ ( 0xF4 , 0x80 .. 0x8F , TAG_CONT_U8 , TAG_CONT_U8 ) => { }
800
+ _ => err ! ( )
801
+ }
802
+ }
803
+ _ => err ! ( )
798
804
}
799
-
800
- i = nexti;
801
805
}
802
806
}
803
- None
807
+ }
808
+
809
+ /// Determines if a vector of bytes contains valid UTF-8; an empty vector counts as valid.
810
+ pub fn is_utf8 ( v : & [ u8 ] ) -> bool {
811
+ run_utf8_validation_iterator ( & mut v. iter ( ) ) // only the verdict is needed; the temporary iterator's final position is discarded
812
+ }
813
+
814
+ #[ inline( always) ]
815
+ fn first_non_utf8_index ( v : & [ u8 ] ) -> Option < uint > { // `None` when `v` is valid UTF-8, otherwise the offset of the first byte of the invalid sequence
816
+ let mut it = v. iter ( ) ;
817
+
818
+ let ok = run_utf8_validation_iterator ( & mut it) ;
819
+ if ok {
820
+ None
821
+ } else {
822
+ // validation failed, so `it` has been rewound to the first byte
823
+ // of the invalid sequence and every byte before it was accepted.
824
+ // the vector iterator size_hint is exact, so its lower bound is
825
+ // the number of bytes from that position to the end of `v`.
826
+ let ( remaining, _) = it. size_hint ( ) ;
827
+ Some ( v. len ( ) - remaining) // bytes accepted before the failure == index of the offending byte
828
+ }
804
829
}
805
830
806
831
/// Determines if a vector of `u16` contains valid UTF-16
0 commit comments