Skip to content

Commit b0b89a5

Browse files
committed
Add new function str::from_utf8_lossy()
from_utf8_lossy() takes a byte vector and produces a ~str, converting any invalid UTF-8 sequence into the U+FFFD REPLACEMENT CHARACTER. The replacement follows the guidelines in §5.22 "Best Practice for U+FFFD Substitution" of the Unicode Standard (Version 6.2)[1]; this behavior also matches the WHATWG rules for UTF-8 decoding[2].
[1]: http://www.unicode.org/versions/Unicode6.2.0/ch05.pdf
[2]: http://encoding.spec.whatwg.org/#utf-8
1 parent 6aad3bf commit b0b89a5

File tree

1 file changed

+179
-6
lines changed

1 file changed

+179
-6
lines changed

src/libstd/str.rs

Lines changed: 179 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -900,16 +900,122 @@ pub struct CharRange {
900900
// The first byte is special: only its bottom 5 bits are payload for width 2,
901901
// its bottom 4 bits for width 3, and its bottom 3 bits for width 4
902902
macro_rules! utf8_first_byte(
903-
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint)
903+
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
904904
)
905905

906906
// Return the value of $ch shifted left by 6 bits with the low 6 bits of continuation byte $byte appended
907907
macro_rules! utf8_acc_cont_byte(
908-
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint)
908+
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
909909
)
910910

911911
static TAG_CONT_U8: u8 = 128u8;
912912

913+
/// Converts a vector of bytes to a new UTF-8 string.
914+
/// Any invalid UTF-8 sequences are replaced with the U+FFFD REPLACEMENT CHARACTER.
915+
///
916+
/// # Example
917+
///
918+
/// ```rust
919+
/// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
920+
/// let output = std::str::from_utf8_lossy(input);
921+
/// assert_eq!(output, ~"Hello \uFFFDWorld");
922+
/// ```
923+
pub fn from_utf8_lossy(v: &[u8]) -> ~str {
924+
static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8
925+
let mut i = 0u;
926+
let mut lastgood = 0u;
927+
let total = v.len();
928+
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
929+
unsafe { *xs.unsafe_ref(i) }
930+
}
931+
fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
932+
if i >= total {
933+
0
934+
} else {
935+
unsafe_get(xs, i)
936+
}
937+
}
938+
let mut res = with_capacity(total);
939+
940+
while i < total {
941+
let i_ = i;
942+
let byte = unsafe_get(v, i);
943+
i += 1;
944+
945+
macro_rules! error(() => {
946+
unsafe {
947+
if lastgood != i_ {
948+
raw::push_bytes(&mut res, v.slice(lastgood, i_));
949+
}
950+
lastgood = i;
951+
raw::push_bytes(&mut res, REPLACEMENT);
952+
}
953+
})
954+
955+
if byte < 128u8 {
956+
// ASCII byte: nothing to do here; it is copied later as part of the pending v[lastgood..] run
957+
} else {
958+
let w = utf8_char_width(byte);
959+
960+
match w {
961+
2 => {
962+
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
963+
error!();
964+
continue;
965+
}
966+
i += 1;
967+
}
968+
3 => {
969+
match (byte, safe_get(v, i, total)) {
970+
(0xE0 , 0xA0 .. 0xBF) => (),
971+
(0xE1 .. 0xEC, 0x80 .. 0xBF) => (),
972+
(0xED , 0x80 .. 0x9F) => (),
973+
(0xEE .. 0xEF, 0x80 .. 0xBF) => (),
974+
_ => {
975+
error!();
976+
continue;
977+
}
978+
}
979+
i += 1;
980+
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
981+
error!();
982+
continue;
983+
}
984+
i += 1;
985+
}
986+
4 => {
987+
match (byte, safe_get(v, i, total)) {
988+
(0xF0 , 0x90 .. 0xBF) => (),
989+
(0xF1 .. 0xF3, 0x80 .. 0xBF) => (),
990+
(0xF4 , 0x80 .. 0x8F) => (),
991+
_ => {
992+
error!();
993+
continue;
994+
}
995+
}
996+
i += 1;
997+
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
998+
error!();
999+
continue;
1000+
}
1001+
i += 1;
1002+
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1003+
error!();
1004+
continue;
1005+
}
1006+
i += 1;
1007+
}
1008+
_ => {
1009+
error!();
1010+
continue;
1011+
}
1012+
}
1013+
}
1014+
}
1015+
unsafe { raw::push_bytes(&mut res, v.slice(lastgood, total)) };
1016+
res
1017+
}
1018+
9131019
/// Unsafe operations
9141020
pub mod raw {
9151021
use cast;
@@ -2211,7 +2317,7 @@ impl<'a> StrSlice<'a> for &'a str {
22112317

22122318
// Multibyte case is a fn to allow char_range_at to inline cleanly
22132319
fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
2214-
let mut val = s[i] as uint;
2320+
let mut val = s[i] as u32;
22152321
let w = UTF8_CHAR_WIDTH[val] as uint;
22162322
assert!((w != 0));
22172323

@@ -2220,7 +2326,7 @@ impl<'a> StrSlice<'a> for &'a str {
22202326
if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
22212327
if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
22222328

2223-
return CharRange {ch: unsafe { transmute(val as u32) }, next: i + w};
2329+
return CharRange {ch: unsafe { transmute(val) }, next: i + w};
22242330
}
22252331

22262332
return multibyte_char_range_at(*self, i);
@@ -2243,7 +2349,7 @@ impl<'a> StrSlice<'a> for &'a str {
22432349
i -= 1u;
22442350
}
22452351

2246-
let mut val = s[i] as uint;
2352+
let mut val = s[i] as u32;
22472353
let w = UTF8_CHAR_WIDTH[val] as uint;
22482354
assert!((w != 0));
22492355

@@ -2252,7 +2358,7 @@ impl<'a> StrSlice<'a> for &'a str {
22522358
if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
22532359
if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
22542360

2255-
return CharRange {ch: unsafe { transmute(val as u32) }, next: i};
2361+
return CharRange {ch: unsafe { transmute(val) }, next: i};
22562362
}
22572363

22582364
return multibyte_char_range_at_reverse(*self, prev);
@@ -3834,6 +3940,37 @@ mod tests {
38343940
assert_eq!(from_utf8_owned(xs), None);
38353941
}
38363942
3943+
#[test]
3944+
fn test_str_from_utf8_lossy() {
3945+
let xs = bytes!("hello");
3946+
assert_eq!(from_utf8_lossy(xs), ~"hello");
3947+
3948+
let xs = bytes!("ศไทย中华Việt Nam");
3949+
assert_eq!(from_utf8_lossy(xs), ~"ศไทย中华Việt Nam");
3950+
3951+
let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye");
3952+
assert_eq!(from_utf8_lossy(xs), ~"Hello\uFFFD There\uFFFD Goodbye");
3953+
3954+
let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
3955+
assert_eq!(from_utf8_lossy(xs), ~"Hello\uFFFD\uFFFD There\uFFFD Goodbye");
3956+
3957+
let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar");
3958+
assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFD\uFFFDbar");
3959+
3960+
let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz");
3961+
assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFDbar\uFFFDbaz");
3962+
3963+
let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz");
3964+
assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz");
3965+
3966+
let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar");
3967+
assert_eq!(from_utf8_lossy(xs), ~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar");
3968+
3969+
// surrogates
3970+
let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar");
3971+
assert_eq!(from_utf8_lossy(xs), ~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar");
3972+
}
3973+
38373974
#[test]
38383975
fn test_to_send_str() {
38393976
assert_eq!("abcde".to_send_str(), SendStrStatic("abcde"));
@@ -3992,6 +4129,42 @@ mod bench {
39924129
});
39934130
}
39944131
4132+
#[bench]
4133+
fn from_utf8_lossy_100_ascii(bh: &mut BenchHarness) {
4134+
let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4135+
Lorem ipsum dolor sit amet, consectetur. ");
4136+
4137+
assert_eq!(100, s.len());
4138+
bh.iter(|| {
4139+
let _ = from_utf8_lossy(s);
4140+
});
4141+
}
4142+
4143+
#[bench]
4144+
fn from_utf8_lossy_100_multibyte(bh: &mut BenchHarness) {
4145+
let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
4146+
assert_eq!(100, s.len());
4147+
bh.iter(|| {
4148+
let _ = from_utf8_lossy(s);
4149+
});
4150+
}
4151+
4152+
#[bench]
4153+
fn from_utf8_lossy_invalid(bh: &mut BenchHarness) {
4154+
let s = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
4155+
bh.iter(|| {
4156+
let _ = from_utf8_lossy(s);
4157+
});
4158+
}
4159+
4160+
#[bench]
4161+
fn from_utf8_lossy_100_invalid(bh: &mut BenchHarness) {
4162+
let s = ::vec::from_elem(100, 0xF5u8);
4163+
bh.iter(|| {
4164+
let _ = from_utf8_lossy(s);
4165+
});
4166+
}
4167+
39954168
#[bench]
39964169
fn bench_with_capacity(bh: &mut BenchHarness) {
39974170
bh.iter(|| {

0 commit comments

Comments
 (0)