Add new function str::from_utf8_lossy()

lilyball · lilyball · commit b0b89a57d5d5 · 2014-02-06T23:44:26.000-08:00
from_utf8_lossy() takes a byte vector and produces a ~str, converting any invalid UTF-8 sequence into the U+FFFD REPLACEMENT CHARACTER.

The replacement follows the guidelines in §5.22 Best Practice for U+FFFD Substitution from the Unicode Standard (Version 6.2)[1], which also matches the WHATWG rules for UTF-8 decoding[2].

[1]: http://www.unicode.org/versions/Unicode6.2.0/ch05.pdf
[2]: http://encoding.spec.whatwg.org/#utf-8
diff --git a/src/libstd/str.rs b/src/libstd/str.rs
@@ -900,16 +900,122 @@ pub struct CharRange {
 // The first byte is special, only want bottom 5 bits for width 2, 4 bits
 // for width 3, and 3 bits for width 4
 macro_rules! utf8_first_byte(
-    ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint)
+    ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
 )
 
 // return the value of $ch updated with continuation byte $byte
 macro_rules! utf8_acc_cont_byte(
-    ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint)
+    ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
 )
 
 static TAG_CONT_U8: u8 = 128u8;
 
+/// Converts a vector of bytes to a new UTF-8 string; valid runs of input are copied unchanged.
+/// Any invalid UTF-8 sequences (including one cut short by the end of `v`) are replaced with U+FFFD REPLACEMENT CHARACTER.
+///
+/// # Example
+///
+/// ```rust
+/// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
+/// let output = std::str::from_utf8_lossy(input);
+/// assert_eq!(output, ~"Hello \uFFFDWorld");
+/// ```
+pub fn from_utf8_lossy(v: &[u8]) -> ~str {
+    static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8
+    let mut i = 0u; // index of the next byte to examine
+    let mut lastgood = 0u; // start of the valid run that has not been copied into res yet
+    let total = v.len();
+    fn unsafe_get(xs: &[u8], i: uint) -> u8 { // callers below only pass i < xs.len()
+        unsafe { *xs.unsafe_ref(i) }
+    }
+    fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 { // like unsafe_get, but 0 at or past total
+        if i >= total {
+            0 // fails every second-byte check below, so truncation counts as an error
+        } else {
+            unsafe_get(xs, i)
+        }
+    }
+    let mut res = with_capacity(total); // output buffer, pre-sized to the input length
+
+    while i < total {
+        let i_ = i; // index of the lead byte of the sequence being examined
+        let byte = unsafe_get(v, i); // in bounds: the loop condition guarantees i < total
+        i += 1;
+
+        macro_rules! error(() => { // flush the pending valid run, then emit one U+FFFD
+            unsafe {
+                if lastgood != i_ {
+                    raw::push_bytes(&mut res, v.slice(lastgood, i_)); // valid bytes before the bad sequence
+                }
+                lastgood = i; // next run starts at i; a byte that failed its check is scanned again
+                raw::push_bytes(&mut res, REPLACEMENT);
+            }
+        })
+
+        if byte < 128u8 {
+            // ASCII: nothing to do here, the byte is copied later as part of the lastgood run
+        } else {
+            let w = utf8_char_width(byte); // presumably the sequence length for this lead byte (defined elsewhere)
+
+            match w {
+                2 => { // one continuation byte expected
+                    if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 { // top two bits must be 10
+                        error!();
+                        continue;
+                    }
+                    i += 1;
+                }
+                3 => { // second byte is range-checked per lead byte, then one continuation byte
+                    match (byte, safe_get(v, i, total)) {
+                        (0xE0        , 0xA0 .. 0xBF) => (), // excludes overlong forms (below U+0800)
+                        (0xE1 .. 0xEC, 0x80 .. 0xBF) => (),
+                        (0xED        , 0x80 .. 0x9F) => (), // excludes surrogates (U+D800..U+DFFF)
+                        (0xEE .. 0xEF, 0x80 .. 0xBF) => (),
+                        _ => {
+                            error!();
+                            continue;
+                        }
+                    }
+                    i += 1;
+                    if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 { // third byte: plain continuation
+                        error!();
+                        continue;
+                    }
+                    i += 1;
+                }
+                4 => { // second byte is range-checked per lead byte, then two continuation bytes
+                    match (byte, safe_get(v, i, total)) {
+                        (0xF0        , 0x90 .. 0xBF) => (), // excludes overlong forms (below U+10000)
+                        (0xF1 .. 0xF3, 0x80 .. 0xBF) => (),
+                        (0xF4        , 0x80 .. 0x8F) => (), // excludes code points above U+10FFFF
+                        _ => {
+                            error!();
+                            continue;
+                        }
+                    }
+                    i += 1;
+                    if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 { // third byte: plain continuation
+                        error!();
+                        continue;
+                    }
+                    i += 1;
+                    if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 { // fourth byte: plain continuation
+                        error!();
+                        continue;
+                    }
+                    i += 1;
+                }
+                _ => { // w is not 2, 3 or 4: the byte itself is treated as invalid
+                    error!();
+                    continue;
+                }
+            }
+        }
+    }
+    unsafe { raw::push_bytes(&mut res, v.slice(lastgood, total)) }; // flush the trailing valid run
+    res
+}
+
 /// Unsafe operations
 pub mod raw {
     use cast;
@@ -2211,7 +2317,7 @@ impl<'a> StrSlice<'a> for &'a str {
 
         // Multibyte case is a fn to allow char_range_at to inline cleanly
         fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
-            let mut val = s[i] as uint;
+            let mut val = s[i] as u32;
             let w = UTF8_CHAR_WIDTH[val] as uint;
             assert!((w != 0));
 
@@ -2220,7 +2326,7 @@ impl<'a> StrSlice<'a> for &'a str {
             if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
             if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
 
-            return CharRange {ch: unsafe { transmute(val as u32) }, next: i + w};
+            return CharRange {ch: unsafe { transmute(val) }, next: i + w};
         }
 
         return multibyte_char_range_at(*self, i);
@@ -2243,7 +2349,7 @@ impl<'a> StrSlice<'a> for &'a str {
                 i -= 1u;
             }
 
-            let mut val = s[i] as uint;
+            let mut val = s[i] as u32;
             let w = UTF8_CHAR_WIDTH[val] as uint;
             assert!((w != 0));
 
@@ -2252,7 +2358,7 @@ impl<'a> StrSlice<'a> for &'a str {
             if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
             if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
 
-            return CharRange {ch: unsafe { transmute(val as u32) }, next: i};
+            return CharRange {ch: unsafe { transmute(val) }, next: i};
         }
 
         return multibyte_char_range_at_reverse(*self, prev);
@@ -3834,6 +3940,37 @@ mod tests {
         assert_eq!(from_utf8_owned(xs), None);
     }
 
+    #[test]
+    fn test_str_from_utf8_lossy() {
+        let xs = bytes!("hello"); // valid ASCII comes back unchanged
+        assert_eq!(from_utf8_lossy(xs), ~"hello");
+        // valid multibyte text comes back unchanged
+        let xs = bytes!("ศไทย中华Việt Nam");
+        assert_eq!(from_utf8_lossy(xs), ~"ศไทย中华Việt Nam");
+        // a lead byte (0xC2) with no continuation and the byte 0xFF each become one U+FFFD
+        let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye");
+        assert_eq!(from_utf8_lossy(xs), ~"Hello\uFFFD There\uFFFD Goodbye");
+        // 0xC0 0x80 is replaced byte by byte (two U+FFFD); the truncated 0xE6 0x83 becomes one
+        let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
+        assert_eq!(from_utf8_lossy(xs), ~"Hello\uFFFD\uFFFD There\uFFFD Goodbye");
+        // 0xF5 is replaced on its own; a following 0x80 gets a separate U+FFFD
+        let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar");
+        assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFD\uFFFDbar");
+        // a 4-byte sequence cut short after 1, 2 or 3 bytes becomes a single U+FFFD
+        let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz");
+        assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz");
+        // lone or truncated 0xF4 gives one U+FFFD; 0xF4 0xBF gives two (0xBF is not in 0x80..0x8F)
+        let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz");
+        assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz");
+        // 0xF0 0x80 0x80 0x80 gives four U+FFFD; 0xF0 0x90 0x80 0x80 decodes to U+10000
+        let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar");
+        assert_eq!(from_utf8_lossy(xs), ~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar");
+
+        // surrogates encoded as UTF-8 (U+D800 and U+DFFF): every byte is replaced separately
+        let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar");
+        assert_eq!(from_utf8_lossy(xs), ~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar");
+    }
+
     #[test]
     fn test_to_send_str() {
         assert_eq!("abcde".to_send_str(), SendStrStatic("abcde"));
@@ -3992,6 +4129,42 @@ mod bench {
         });
     }
 
+    #[bench] // from_utf8_lossy over 100 bytes of ASCII text
+    fn from_utf8_lossy_100_ascii(bh: &mut BenchHarness) {
+        let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
+                        Lorem ipsum dolor sit amet, consectetur. ");
+
+        assert_eq!(100, s.len()); // guard: the input must stay exactly 100 bytes long
+        bh.iter(|| {
+            let _ = from_utf8_lossy(s);
+        });
+    }
+
+    #[bench] // from_utf8_lossy over 100 bytes of non-ASCII (multibyte) text
+    fn from_utf8_lossy_100_multibyte(bh: &mut BenchHarness) {
+        let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
+        assert_eq!(100, s.len()); // guard: the input must stay exactly 100 bytes long
+        bh.iter(|| {
+            let _ = from_utf8_lossy(s);
+        });
+    }
+
+    #[bench] // from_utf8_lossy over mostly-ASCII input containing two invalid sequences
+    fn from_utf8_lossy_invalid(bh: &mut BenchHarness) {
+        let s = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye"); // 0xC0 0x80 and 0xE6 0x83 are invalid
+        bh.iter(|| {
+            let _ = from_utf8_lossy(s);
+        });
+    }
+
+    #[bench] // from_utf8_lossy over input made up entirely of an invalid byte
+    fn from_utf8_lossy_100_invalid(bh: &mut BenchHarness) {
+        let s = ::vec::from_elem(100, 0xF5u8); // 100 copies of 0xF5
+        bh.iter(|| {
+            let _ = from_utf8_lossy(s);
+        });
+    }
+
     #[bench]
     fn bench_with_capacity(bh: &mut BenchHarness) {
         bh.iter(|| {