Skip to content

Commit f81aee4

Browse files
committed
Add Read::utf8_chars_lossy
1 parent fae703e commit f81aee4

File tree

1 file changed

+71
-10
lines changed

1 file changed

+71
-10
lines changed

src/libstd/io/mod.rs

Lines changed: 71 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -747,6 +747,46 @@ pub trait Read {
747747
self.utf8_chars()
748748
}
749749

750+
/// Transforms this `Read` instance to an `Iterator` over `char`s.
751+
///
752+
/// This adaptor will attempt to interpret this reader as a UTF-8 encoded
753+
/// sequence of characters. The returned iterator will return `None` once
754+
/// EOF is reached for this reader. Otherwise each element yielded will be a
755+
/// `Result<char, E>` where `E` may contain information about what I/O error
756+
/// occurred.
757+
///
758+
/// Unlike `utf8_chars`, byte sequences that are not valid UTF-8 are replaced
758+
/// with U+FFFD replacement characters instead of being reported as errors.
760+
///
761+
/// # Examples
762+
///
763+
/// [`File`][file]s implement `Read`:
764+
///
765+
/// [file]: ../fs/struct.File.html
766+
///
767+
/// ```
768+
/// #![feature(io)]
769+
/// use std::io;
770+
/// use std::io::prelude::*;
771+
/// use std::fs::File;
772+
///
773+
/// # fn foo() -> io::Result<()> {
774+
/// let mut f = try!(File::open("foo.txt"));
775+
///
776+
/// for c in f.utf8_chars_lossy() {
777+
/// println!("{}", c.unwrap());
778+
/// }
779+
/// # Ok(())
780+
/// # }
781+
/// ```
782+
#[unstable(feature = "io", reason = "the semantics of a partial read/write \
783+
of where errors happen is currently \
784+
unclear and may change",
785+
issue = "27802")]
786+
fn utf8_chars_lossy(self) -> Utf8CharsLossy<Self> where Self: Sized {
787+
Utf8CharsLossy { inner: self.utf8_chars() }
788+
}
789+
750790
/// Creates an adaptor which will chain this stream with another.
751791
///
752792
/// The returned `Read` instance will first read all bytes from this object
@@ -1698,6 +1738,35 @@ impl fmt::Display for Utf8CharsError {
16981738
}
16991739
}
17001740

1741+
/// An iterator over the `char`s of a reader, with invalid UTF-8 replaced by U+FFFD.
1742+
///
1743+
/// This struct is generally created by calling [`utf8_chars_lossy()`][utf8_chars_lossy] on a reader.
1744+
/// Please see the documentation of `utf8_chars_lossy()` for more details.
1745+
///
1746+
/// [utf8_chars_lossy]: trait.Read.html#method.utf8_chars_lossy
1747+
#[unstable(feature = "io", reason = "awaiting stability of Read::utf8_chars_lossy",
1748+
issue = "27802")]
1749+
pub struct Utf8CharsLossy<R> {
1750+
inner: Utf8Chars<R>,
1751+
}
1752+
1753+
#[unstable(feature = "io", reason = "awaiting stability of Read::utf8_chars_lossy",
1754+
issue = "27802")]
1755+
impl<R: Read> Iterator for Utf8CharsLossy<R> {
1756+
type Item = result::Result<char, Error>;
1757+
1758+
fn next(&mut self) -> Option<result::Result<char, Error>> {
1759+
// Follow Unicode Standard §5.22 "Best Practice for U+FFFD Substitution"
1760+
// http://www.unicode.org/versions/Unicode8.0.0/ch05.pdf#G40630
1761+
self.inner.next().map(|result| match result {
1762+
Ok(c) => Ok(c),
1763+
Err(Utf8CharsError::InvalidUtf8) |
1764+
Err(Utf8CharsError::IncompleteUtf8) => Ok('\u{FFFD}'),
1765+
Err(Utf8CharsError::Io(e)) => Err(e),
1766+
})
1767+
}
1768+
}
1769+
17011770
/// An iterator over the contents of an instance of `BufRead` split on a
17021771
/// particular byte.
17031772
///
@@ -1768,24 +1837,16 @@ mod tests {
17681837
use prelude::v1::*;
17691838
use io::prelude::*;
17701839
use io;
1771-
use super::Utf8CharsError;
17721840
use super::Cursor;
17731841
use test;
17741842
use super::repeat;
17751843

17761844
fn chars_lossy(bytes: &[u8]) -> String {
1777-
// Follow Unicode Standard §5.22 "Best Practice for U+FFFD Substitution"
1778-
// http://www.unicode.org/versions/Unicode8.0.0/ch05.pdf#G40630
1779-
Cursor::new(bytes).utf8_chars().map(|result| match result {
1780-
Ok(c) => c,
1781-
Err(Utf8CharsError::InvalidUtf8) |
1782-
Err(Utf8CharsError::IncompleteUtf8) => '\u{FFFD}',
1783-
Err(Utf8CharsError::Io(e)) => panic!("{}", e),
1784-
}).collect()
1845+
Cursor::new(bytes).utf8_chars_lossy().collect::<Result<_, _>>().unwrap()
17851846
}
17861847

17871848
#[test]
1788-
fn utf8_chars() {
1849+
fn utf8_chars_lossy() {
17891850
assert_eq!(chars_lossy(b"\xf0\x9fabc"), "�abc");
17901851
assert_eq!(chars_lossy(b"\xed\xa0\x80a"), "���a");
17911852
assert_eq!(chars_lossy(b"\xed\xa0a"), "��a");

0 commit comments

Comments
 (0)