@@ -900,16 +900,122 @@ pub struct CharRange {
900
900
// The first byte of a multibyte sequence is special: only its bottom 5 bits carry payload for width 2, 4 bits
901
901
// for width 3, and 3 bits for width 4; the mask `0x7F >> $width` keeps exactly those payload bits.
902
902
macro_rules! utf8_first_byte(
903
- ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as uint )
903
+ ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as u32 ) // masked lead byte, widened to u32
904
904
)
905
905
906
906
// Returns the value of $ch shifted left by 6 bits and OR-ed with the low 6 bits (`& 63`) of continuation byte $byte.
907
907
macro_rules! utf8_acc_cont_byte(
908
- ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & 63u8 ) as uint )
908
+ ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & 63u8 ) as u32 ) // the cast applies to the masked byte only
909
909
)
910
910
911
911
static TAG_CONT_U8 : u8 = 128u8 ; // 0b1000_0000: a byte is a continuation byte when `byte & 192 == TAG_CONT_U8`
912
912
913
+ /// Converts a vector of bytes to a new utf-8 string, copying well-formed sequences through unchanged.
914
+ /// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER: an ill-formed sequence is cut at the first offending byte and yields a single replacement, and scanning resumes at that byte.
915
+ ///
916
+ /// # Example
917
+ ///
918
+ /// ```rust
919
+ /// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
920
+ /// let output = std::str::from_utf8_lossy(input);
921
+ /// assert_eq!(output, ~"Hello \uFFFDWorld");
922
+ /// ```
923
+ pub fn from_utf8_lossy ( v : & [ u8 ] ) -> ~str {
924
+ static REPLACEMENT : & ' static [ u8 ] = bytes ! ( 0xEF , 0xBF , 0xBD ) ; // U+FFFD encoded as UTF-8
925
+ let mut i = 0 u; // index of the next byte to examine
926
+ let mut lastgood = 0 u; // start of the pending run of valid bytes not yet copied into res
927
+ let total = v. len ( ) ;
928
+ fn unsafe_get ( xs : & [ u8 ] , i : uint ) -> u8 { // unchecked read: callers must guarantee i is in bounds
929
+ unsafe { * xs. unsafe_ref ( i) }
930
+ }
931
+ fn safe_get ( xs : & [ u8 ] , i : uint , total : uint ) -> u8 { // yields 0 past the end; 0 fails every continuation check below
932
+ if i >= total {
933
+ 0
934
+ } else {
935
+ unsafe_get ( xs, i)
936
+ }
937
+ }
938
+ let mut res = with_capacity ( total) ; // sized from the input length up front
939
+
940
+ while i < total {
941
+ let i_ = i; // position of the lead byte of the sequence being examined
942
+ let byte = unsafe_get ( v, i) ; // in bounds: the loop condition guarantees i < total
943
+ i += 1 ;
944
+
945
+ macro_rules! error( ( ) => { // flush the valid run before i_, emit one replacement, restart the run at i
946
+ unsafe {
947
+ if lastgood != i_ {
948
+ raw:: push_bytes( & mut res, v. slice( lastgood, i_) ) ;
949
+ }
950
+ lastgood = i; // bytes in i_..i are dropped; the byte at i is examined afresh on the next iteration
951
+ raw:: push_bytes( & mut res, REPLACEMENT ) ;
952
+ }
953
+ } )
954
+
955
+ if byte < 128u8 {
956
+ // ASCII: nothing to do, the byte simply stays in the pending run that lastgood tracks
957
+ } else {
958
+ let w = utf8_char_width ( byte) ;
959
+
960
+ match w {
961
+ 2 => {
962
+ if safe_get ( v, i, total) & 192u8 != TAG_CONT_U8 {
963
+ error ! ( ) ;
964
+ continue ;
965
+ }
966
+ i += 1 ;
967
+ }
968
+ 3 => {
969
+ match ( byte, safe_get ( v, i, total) ) { // the allowed range of the second byte depends on the lead byte
970
+ ( 0xE0 , 0xA0 .. 0xBF ) => ( ) , // a lower second byte would be an overlong encoding
971
+ ( 0xE1 .. 0xEC , 0x80 .. 0xBF ) => ( ) ,
972
+ ( 0xED , 0x80 .. 0x9F ) => ( ) , // a higher second byte would encode a surrogate
973
+ ( 0xEE .. 0xEF , 0x80 .. 0xBF ) => ( ) ,
974
+ _ => {
975
+ error ! ( ) ;
976
+ continue ;
977
+ }
978
+ }
979
+ i += 1 ;
980
+ if safe_get ( v, i, total) & 192u8 != TAG_CONT_U8 {
981
+ error ! ( ) ;
982
+ continue ;
983
+ }
984
+ i += 1 ;
985
+ }
986
+ 4 => {
987
+ match ( byte, safe_get ( v, i, total) ) { // the allowed range of the second byte depends on the lead byte
988
+ ( 0xF0 , 0x90 .. 0xBF ) => ( ) , // a lower second byte would be an overlong encoding
989
+ ( 0xF1 .. 0xF3 , 0x80 .. 0xBF ) => ( ) ,
990
+ ( 0xF4 , 0x80 .. 0x8F ) => ( ) , // a higher second byte would exceed U+10FFFF
991
+ _ => {
992
+ error ! ( ) ;
993
+ continue ;
994
+ }
995
+ }
996
+ i += 1 ;
997
+ if safe_get ( v, i, total) & 192u8 != TAG_CONT_U8 {
998
+ error ! ( ) ;
999
+ continue ;
1000
+ }
1001
+ i += 1 ;
1002
+ if safe_get ( v, i, total) & 192u8 != TAG_CONT_U8 {
1003
+ error ! ( ) ;
1004
+ continue ;
1005
+ }
1006
+ i += 1 ;
1007
+ }
1008
+ _ => { // any other width: this byte cannot start a sequence handled above
1009
+ error ! ( ) ;
1010
+ continue ;
1011
+ }
1012
+ }
1013
+ }
1014
+ }
1015
+ unsafe { raw:: push_bytes ( & mut res, v. slice ( lastgood, total) ) } ; // copy whatever valid run is still pending
1016
+ res
1017
+ }
1018
+
913
1019
/// Unsafe operations
914
1020
pub mod raw {
915
1021
use cast;
@@ -2211,7 +2317,7 @@ impl<'a> StrSlice<'a> for &'a str {
2211
2317
2212
2318
// Multibyte case is a fn to allow char_range_at to inline cleanly
2213
2319
fn multibyte_char_range_at( s: & str , i: uint) -> CharRange {
2214
- let mut val = s[ i] as uint ;
2320
+ let mut val = s[ i] as u32 ;
2215
2321
let w = UTF8_CHAR_WIDTH [ val] as uint;
2216
2322
assert!( ( w != 0 ) ) ;
2217
2323
@@ -2220,7 +2326,7 @@ impl<'a> StrSlice<'a> for &'a str {
2220
2326
if w > 2 { val = utf8_acc_cont_byte!( val, s[ i + 2 ] ) ; }
2221
2327
if w > 3 { val = utf8_acc_cont_byte!( val, s[ i + 3 ] ) ; }
2222
2328
2223
- return CharRange { ch: unsafe { transmute( val as u32 ) } , next: i + w} ;
2329
+ return CharRange { ch: unsafe { transmute( val) } , next: i + w} ;
2224
2330
}
2225
2331
2226
2332
return multibyte_char_range_at( * self , i) ;
@@ -2243,7 +2349,7 @@ impl<'a> StrSlice<'a> for &'a str {
2243
2349
i -= 1 u;
2244
2350
}
2245
2351
2246
- let mut val = s[ i] as uint ;
2352
+ let mut val = s[ i] as u32 ;
2247
2353
let w = UTF8_CHAR_WIDTH [ val] as uint;
2248
2354
assert!( ( w != 0 ) ) ;
2249
2355
@@ -2252,7 +2358,7 @@ impl<'a> StrSlice<'a> for &'a str {
2252
2358
if w > 2 { val = utf8_acc_cont_byte!( val, s[ i + 2 ] ) ; }
2253
2359
if w > 3 { val = utf8_acc_cont_byte!( val, s[ i + 3 ] ) ; }
2254
2360
2255
- return CharRange { ch: unsafe { transmute( val as u32 ) } , next: i} ;
2361
+ return CharRange { ch: unsafe { transmute( val) } , next: i} ;
2256
2362
}
2257
2363
2258
2364
return multibyte_char_range_at_reverse( * self , prev) ;
@@ -3834,6 +3940,37 @@ mod tests {
3834
3940
assert_eq!(from_utf8_owned(xs), None);
3835
3941
}
3836
3942
3943
+ #[test]
3944
+ fn test_str_from_utf8_lossy() { // each case pairs a byte input with the exact lossy output expected
3945
+ let xs = bytes!(" hello"); // pure ASCII passes through unchanged
3946
+ assert_eq!(from_utf8_lossy(xs), ~" hello");
3947
+
3948
+ let xs = bytes!(" ศไทย中华Việt Nam "); // well-formed multibyte text passes through unchanged
3949
+ assert_eq!(from_utf8_lossy(xs), ~" ศไทย中华Việt Nam ");
3950
+
3951
+ let xs = bytes!(" Hello ", 0xC2, " There ", 0xFF, " Goodbye "); // a lead byte with no continuation, and 0xFF
3952
+ assert_eq!(from_utf8_lossy(xs), ~" Hello \uFFFD There \uFFFD Goodbye ");
3953
+
3954
+ let xs = bytes!(" Hello ", 0xC0, 0x80, " There ", 0xE6, 0x83, " Goodbye "); // 0xC0 0x80 gives two replacements, truncated 0xE6 0x83 gives one
3955
+ assert_eq!(from_utf8_lossy(xs), ~" Hello \uFFFD \uFFFD There \uFFFD Goodbye ");
3956
+
3957
+ let xs = bytes!(0xF5, " foo", 0xF5, 0x80, " bar"); // 0xF5 and the stray continuation byte are each replaced
3958
+ assert_eq!(from_utf8_lossy(xs), ~"\uFFFD foo\uFFFD \uFFFD bar" ) ;
3959
+
3960
+ let xs = bytes!( 0xF1 , "foo" , 0xF1 , 0x80 , "bar" , 0xF1 , 0x80 , 0x80 , "baz" ) ; // 0xF1 truncated after 0, 1 and 2 continuation bytes
3961
+ assert_eq!( from_utf8_lossy( xs) , ~"\uFFFD foo\uFFFD bar\uFFFD baz");
3962
+
3963
+ let xs = bytes!(0xF4, " foo", 0xF4, 0x80, " bar", 0xF4, 0xBF, " baz"); // 0xF4 truncated, then 0xF4 with an out-of-range second byte
3964
+ assert_eq!(from_utf8_lossy(xs), ~"\uFFFD foo\uFFFD bar\uFFFD \uFFFD baz" ) ;
3965
+
3966
+ let xs = bytes!( 0xF0 , 0x80 , 0x80 , 0x80 , "foo" , 0xF0 , 0x90 , 0x80 , 0x80 , "bar" ) ; // overlong four-byte form, then a valid U+10000
3967
+ assert_eq!( from_utf8_lossy( xs) , ~"\uFFFD \uFFFD \uFFFD \uFFFD foo\U 00010000 bar");
3968
+
3969
+ // surrogates: 0xED followed by a second byte above 0x9F must be rejected byte by byte
3970
+ let xs = bytes!(0xED, 0xA0, 0x80, " foo", 0xED, 0xBF, 0xBF, " bar");
3971
+ assert_eq!(from_utf8_lossy(xs), ~"\uFFFD \uFFFD \uFFFD foo\uFFFD \uFFFD \uFFFD bar" ) ;
3972
+ }
3973
+
3837
3974
#[ test]
3838
3975
fn test_to_send_str( ) {
3839
3976
assert_eq!( "abcde" . to_send_str( ) , SendStrStatic ( "abcde" ) ) ;
@@ -3992,6 +4129,42 @@ mod bench {
3992
4129
});
3993
4130
}
3994
4131
4132
+ #[bench]
4133
+ fn from_utf8_lossy_100_ascii(bh: &mut BenchHarness) { // times from_utf8_lossy on an all-ASCII input
4134
+ let s = bytes!(" Hello there, the quick brown fox jumped over the lazy dog! \
4135
+ Lorem ipsum dolor sit amet, consectetur. ");
4136
+
4137
+ assert_eq!(100, s.len()); // guards the "100" in the benchmark name
4138
+ bh.iter(|| {
4139
+ let _ = from_utf8_lossy(s);
4140
+ });
4141
+ }
4142
+
4143
+ #[bench]
4144
+ fn from_utf8_lossy_100_multibyte(bh: &mut BenchHarness) { // times from_utf8_lossy on multibyte text
4145
+ let s = bytes!(" 𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
4146
+ assert_eq!(100, s.len()); // guards the "100" in the benchmark name
4147
+ bh.iter(|| {
4148
+ let _ = from_utf8_lossy(s);
4149
+ });
4150
+ }
4151
+
4152
+ #[bench]
4153
+ fn from_utf8_lossy_invalid(bh: &mut BenchHarness) { // times from_utf8_lossy on mostly-ASCII input containing a few invalid bytes
4154
+ let s = bytes!(" Hello ", 0xC0, 0x80, " There ", 0xE6, 0x83, " Goodbye "); // same bytes as one of the unit-test cases
4155
+ bh.iter(|| {
4156
+ let _ = from_utf8_lossy(s);
4157
+ });
4158
+ }
4159
+
4160
+ #[bench]
4161
+ fn from_utf8_lossy_100_invalid(bh: &mut BenchHarness) { // times from_utf8_lossy when every input byte is invalid
4162
+ let s = ::vec::from_elem(100, 0xF5u8); // 100 copies of 0xF5, which the unit test shows is replaced on its own
4163
+ bh.iter(|| {
4164
+ let _ = from_utf8_lossy(s);
4165
+ });
4166
+ }
4167
+
3995
4168
#[bench]
3996
4169
fn bench_with_capacity(bh: &mut BenchHarness) {
3997
4170
bh.iter(|| {
0 commit comments