Skip to content

Change encode_utf{8,16}() to write to a buffer and panic if it's too small #36377

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 29, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/libcollections/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -975,7 +975,7 @@ impl String {
pub fn push(&mut self, ch: char) {
match ch.len_utf8() {
1 => self.vec.push(ch as u8),
_ => self.vec.extend_from_slice(ch.encode_utf8().as_slice()),
_ => self.vec.extend_from_slice(ch.encode_utf8(&mut [0;4]).as_bytes()),
}
}

Expand Down Expand Up @@ -1131,10 +1131,11 @@ impl String {
let len = self.len();
assert!(idx <= len);
assert!(self.is_char_boundary(idx));
let bits = ch.encode_utf8();
let mut bits = [0; 4];
let bits = ch.encode_utf8(&mut bits).as_bytes();

unsafe {
self.insert_bytes(idx, bits.as_slice());
self.insert_bytes(idx, bits);
}
}

Expand Down
8 changes: 4 additions & 4 deletions src/libcollectionstest/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -786,9 +786,9 @@ fn test_rev_iterator() {

#[test]
fn test_chars_decoding() {
let mut bytes = [0; 4];
for c in (0..0x110000).filter_map(::std::char::from_u32) {
let bytes = c.encode_utf8();
let s = ::std::str::from_utf8(bytes.as_slice()).unwrap();
let s = c.encode_utf8(&mut bytes);
if Some(c) != s.chars().next() {
panic!("character {:x}={} does not decode correctly", c as u32, c);
}
Expand All @@ -797,9 +797,9 @@ fn test_chars_decoding() {

#[test]
fn test_chars_rev_decoding() {
let mut bytes = [0; 4];
for c in (0..0x110000).filter_map(::std::char::from_u32) {
let bytes = c.encode_utf8();
let s = ::std::str::from_utf8(bytes.as_slice()).unwrap();
let s = c.encode_utf8(&mut bytes);
if Some(c) != s.chars().rev().next() {
panic!("character {:x}={} does not decode correctly", c as u32, c);
}
Expand Down
170 changes: 51 additions & 119 deletions src/libcore/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
use char_private::is_printable;
use convert::TryFrom;
use fmt;
use slice;
use iter::FusedIterator;
use mem::transmute;

Expand Down Expand Up @@ -327,9 +328,9 @@ pub trait CharExt {
#[stable(feature = "core", since = "1.6.0")]
fn len_utf16(self) -> usize;
#[unstable(feature = "unicode", issue = "27784")]
fn encode_utf8(self) -> EncodeUtf8;
fn encode_utf8(self, dst: &mut [u8]) -> &mut str;
#[unstable(feature = "unicode", issue = "27784")]
fn encode_utf16(self) -> EncodeUtf16;
fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16];
}

#[stable(feature = "core", since = "1.6.0")]
Expand Down Expand Up @@ -419,47 +420,59 @@ impl CharExt for char {
}

#[inline]
fn encode_utf8(self) -> EncodeUtf8 {
fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
let code = self as u32;
let mut buf = [0; 4];
let pos = if code < MAX_ONE_B {
buf[3] = code as u8;
3
} else if code < MAX_TWO_B {
buf[2] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
buf[3] = (code & 0x3F) as u8 | TAG_CONT;
2
} else if code < MAX_THREE_B {
buf[1] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
buf[3] = (code & 0x3F) as u8 | TAG_CONT;
1
} else {
buf[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
buf[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
buf[3] = (code & 0x3F) as u8 | TAG_CONT;
0
};
EncodeUtf8 { buf: buf, pos: pos }
unsafe {
let len =
if code < MAX_ONE_B && !dst.is_empty() {
*dst.get_unchecked_mut(0) = code as u8;
1
} else if code < MAX_TWO_B && dst.len() >= 2 {
*dst.get_unchecked_mut(0) = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
*dst.get_unchecked_mut(1) = (code & 0x3F) as u8 | TAG_CONT;
2
} else if code < MAX_THREE_B && dst.len() >= 3 {
*dst.get_unchecked_mut(0) = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
*dst.get_unchecked_mut(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
*dst.get_unchecked_mut(2) = (code & 0x3F) as u8 | TAG_CONT;
3
} else if dst.len() >= 4 {
*dst.get_unchecked_mut(0) = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
*dst.get_unchecked_mut(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT;
*dst.get_unchecked_mut(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
*dst.get_unchecked_mut(3) = (code & 0x3F) as u8 | TAG_CONT;
4
} else {
panic!("encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
from_u32_unchecked(code).len_utf8(),
code,
dst.len())
};
transmute(slice::from_raw_parts_mut(dst.as_mut_ptr(), len))
}
}

#[inline]
fn encode_utf16(self) -> EncodeUtf16 {
let mut buf = [0; 2];
fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
let mut code = self as u32;
let pos = if (code & 0xFFFF) == code {
// The BMP falls through (assuming non-surrogate, as it should)
buf[1] = code as u16;
1
} else {
// Supplementary planes break into surrogates.
code -= 0x1_0000;
buf[0] = 0xD800 | ((code >> 10) as u16);
buf[1] = 0xDC00 | ((code as u16) & 0x3FF);
0
};
EncodeUtf16 { buf: buf, pos: pos }
unsafe {
if (code & 0xFFFF) == code && !dst.is_empty() {
// The BMP falls through (assuming non-surrogate, as it should)
*dst.get_unchecked_mut(0) = code as u16;
slice::from_raw_parts_mut(dst.as_mut_ptr(), 1)
} else if dst.len() >= 2 {
// Supplementary planes break into surrogates.
code -= 0x1_0000;
*dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16);
*dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF);
slice::from_raw_parts_mut(dst.as_mut_ptr(), 2)
} else {
panic!("encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
from_u32_unchecked(code).len_utf16(),
code,
dst.len())
}
}
}
}

Expand Down Expand Up @@ -702,88 +715,7 @@ impl ExactSizeIterator for EscapeDebug { }
#[unstable(feature = "fused", issue = "35602")]
impl FusedIterator for EscapeDebug {}

/// An iterator over `u8` entries representing the UTF-8 encoding of a `char`
/// value.
///
/// Constructed via the `.encode_utf8()` method on `char`.
#[unstable(feature = "unicode", issue = "27784")]
#[derive(Debug)]
pub struct EncodeUtf8 {
// Storage for the encoded bytes; the encoding is right-aligned, so any
// unused slots are at the front.
buf: [u8; 4],
// Index into `buf` of the next byte to yield; `buf[pos..]` is what remains.
pos: usize,
}

impl EncodeUtf8 {
    /// Borrows the bytes that have not yet been yielded by this iterator.
    ///
    /// The returned slice is empty once the iterator is exhausted.
    #[unstable(feature = "unicode", issue = "27784")]
    pub fn as_slice(&self) -> &[u8] {
        // Everything before `pos` has already been consumed (or was never
        // part of the encoding); only the tail is still pending.
        let (_consumed, pending) = self.buf.split_at(self.pos);
        pending
    }
}

#[unstable(feature = "unicode", issue = "27784")]
impl Iterator for EncodeUtf8 {
    type Item = u8;

    /// Yields the next encoded byte, or `None` once all bytes were produced.
    fn next(&mut self) -> Option<u8> {
        // Copy the head byte out first so the shared borrow of `self` has
        // ended before the cursor is advanced.
        let head = self.as_slice().first().cloned();
        if head.is_some() {
            self.pos += 1;
        }
        head
    }

    /// Reports the exact number of bytes still to be yielded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.as_slice().len();
        (remaining, Some(remaining))
    }
}

// Once `next` has returned `None`, `pos` no longer changes, so it keeps
// returning `None`.
#[unstable(feature = "fused", issue = "35602")]
impl FusedIterator for EncodeUtf8 {}

/// An iterator over `u16` entries representing the UTF-16 encoding of a `char`
/// value.
///
/// Constructed via the `.encode_utf16()` method on `char`.
#[unstable(feature = "unicode", issue = "27784")]
#[derive(Debug)]
pub struct EncodeUtf16 {
// Storage for the encoded units: either one unit in `buf[1]` or a
// surrogate pair filling both slots.
buf: [u16; 2],
// Index into `buf` of the next unit to yield; `buf[pos..]` is what remains.
pos: usize,
}

impl EncodeUtf16 {
    /// Borrows the `u16` code units that have not yet been yielded by this
    /// iterator.
    ///
    /// The returned slice is empty once the iterator is exhausted.
    #[unstable(feature = "unicode", issue = "27784")]
    pub fn as_slice(&self) -> &[u16] {
        // Everything before `pos` has already been consumed (or was never
        // part of the encoding); only the tail is still pending.
        let (_consumed, pending) = self.buf.split_at(self.pos);
        pending
    }
}


#[unstable(feature = "unicode", issue = "27784")]
impl Iterator for EncodeUtf16 {
    type Item = u16;

    /// Yields the next encoded code unit, or `None` once all units were
    /// produced.
    fn next(&mut self) -> Option<u16> {
        // Copy the head unit out first so the shared borrow of `self` has
        // ended before the cursor is advanced.
        let head = self.as_slice().first().cloned();
        if head.is_some() {
            self.pos += 1;
        }
        head
    }

    /// Reports the exact number of code units still to be yielded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.as_slice().len();
        (remaining, Some(remaining))
    }
}

// Once `next` has returned `None`, `pos` no longer changes, so it keeps
// returning `None`.
#[unstable(feature = "fused", issue = "35602")]
impl FusedIterator for EncodeUtf16 {}

/// An iterator over an iterator of bytes of the characters the bytes represent
/// as UTF-8
Expand Down
18 changes: 5 additions & 13 deletions src/libcore/fmt/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,7 @@ pub trait Write {
/// This function will return an instance of `Error` on error.
#[stable(feature = "fmt_write_char", since = "1.1.0")]
fn write_char(&mut self, c: char) -> Result {
self.write_str(unsafe {
str::from_utf8_unchecked(c.encode_utf8().as_slice())
})
self.write_str(c.encode_utf8(&mut [0; 4]))
}

/// Glue for usage of the `write!` macro with implementors of this trait.
Expand Down Expand Up @@ -924,9 +922,7 @@ impl<'a> Formatter<'a> {
// Writes the sign if it exists, and then the prefix if it was requested
let write_prefix = |f: &mut Formatter| {
if let Some(c) = sign {
f.buf.write_str(unsafe {
str::from_utf8_unchecked(c.encode_utf8().as_slice())
})?;
f.buf.write_str(c.encode_utf8(&mut [0; 4]))?;
}
if prefixed { f.buf.write_str(prefix) }
else { Ok(()) }
Expand Down Expand Up @@ -1032,10 +1028,8 @@ impl<'a> Formatter<'a> {
rt::v1::Alignment::Center => (padding / 2, (padding + 1) / 2),
};

let fill = self.fill.encode_utf8();
let fill = unsafe {
str::from_utf8_unchecked(fill.as_slice())
};
let mut fill = [0; 4];
let fill = self.fill.encode_utf8(&mut fill);

for _ in 0..pre_pad {
self.buf.write_str(fill)?;
Expand Down Expand Up @@ -1435,9 +1429,7 @@ impl Display for char {
if f.width.is_none() && f.precision.is_none() {
f.write_char(*self)
} else {
f.pad(unsafe {
str::from_utf8_unchecked(self.encode_utf8().as_slice())
})
f.pad(self.encode_utf8(&mut [0; 4]))
}
}
}
Expand Down
21 changes: 12 additions & 9 deletions src/libcoretest/char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::char;
use std::{char,str};
use std::convert::TryFrom;

#[test]
Expand Down Expand Up @@ -248,10 +248,12 @@ fn test_escape_unicode() {
#[test]
fn test_encode_utf8() {
fn check(input: char, expect: &[u8]) {
assert_eq!(input.encode_utf8().as_slice(), expect);
for (a, b) in input.encode_utf8().zip(expect) {
assert_eq!(a, *b);
}
let mut buf = [0; 4];
let ptr = buf.as_ptr();
let s = input.encode_utf8(&mut buf);
assert_eq!(s.as_ptr() as usize, ptr as usize);
assert!(str::from_utf8(s.as_bytes()).is_ok());
assert_eq!(s.as_bytes(), expect);
}

check('x', &[0x78]);
Expand All @@ -263,10 +265,11 @@ fn test_encode_utf8() {
#[test]
fn test_encode_utf16() {
fn check(input: char, expect: &[u16]) {
assert_eq!(input.encode_utf16().as_slice(), expect);
for (a, b) in input.encode_utf16().zip(expect) {
assert_eq!(a, *b);
}
let mut buf = [0; 2];
let ptr = buf.as_mut_ptr();
let b = input.encode_utf16(&mut buf);
assert_eq!(b.as_mut_ptr() as usize, ptr as usize);
assert_eq!(b, expect);
}

check('x', &[0x0078]);
Expand Down
Loading