Skip to content

Commit d90e0f3

Browse files
committed
speed up String::push and String::insert
1 parent 60d1465 commit d90e0f3

File tree

4 files changed

+96
-40
lines changed

4 files changed

+96
-40
lines changed

library/alloc/src/lib.rs

+1
Original file line number | Diff line number | Diff line change
@@ -104,6 +104,7 @@
104104
#![feature(async_closure)]
105105
#![feature(async_fn_traits)]
106106
#![feature(async_iterator)]
107+
#![feature(char_internals)]
107108
#![feature(clone_to_uninit)]
108109
#![feature(coerce_unsized)]
109110
#![feature(const_align_of_val)]

library/alloc/src/string.rs

+37-16
Original file line number | Diff line number | Diff line change
@@ -1353,9 +1353,14 @@ impl String {
13531353
#[inline]
13541354
#[stable(feature = "rust1", since = "1.0.0")]
13551355
pub fn push(&mut self, ch: char) {
1356-
match ch.len_utf8() {
1357-
1 => self.vec.push(ch as u8),
1358-
_ => self.vec.extend_from_slice(ch.encode_utf8(&mut [0; 4]).as_bytes()),
1356+
let len = self.len();
1357+
let ch_len = ch.len_utf8();
1358+
self.reserve(ch_len);
1359+
1360+
// SAFETY: just reserved capacity for at least the length needed to encode `ch`
1361+
unsafe {
1362+
core::char::encode_utf8_raw_unchecked(ch as u32, self.vec.spare_capacity_mut());
1363+
self.vec.set_len(len + ch_len);
13591364
}
13601365
}
13611366

@@ -1651,24 +1656,34 @@ impl String {
16511656
#[rustc_confusables("set")]
16521657
pub fn insert(&mut self, idx: usize, ch: char) {
16531658
assert!(self.is_char_boundary(idx));
1654-
let mut bits = [0; 4];
1655-
let bits = ch.encode_utf8(&mut bits).as_bytes();
16561659

1660+
let len = self.len();
1661+
let ch_len = ch.len_utf8();
1662+
self.reserve(ch_len);
1663+
1664+
// SAFETY: shift data `ch_len` bytes to the right,
1665+
// capacity was just reserved for at least that many bytes, and `idx <= len` because `idx` was asserted to be a char boundary
16571666
unsafe {
1658-
self.insert_bytes(idx, bits);
1667+
ptr::copy(
1668+
self.vec.as_ptr().add(idx),
1669+
self.vec.as_mut_ptr().add(idx + ch_len),
1670+
len - idx,
1671+
);
16591672
}
1660-
}
16611673

1662-
#[cfg(not(no_global_oom_handling))]
1663-
unsafe fn insert_bytes(&mut self, idx: usize, bytes: &[u8]) {
1664-
let len = self.len();
1665-
let amt = bytes.len();
1666-
self.vec.reserve(amt);
1674+
// SAFETY: encode the character into the space left after the shift if `idx != len`,
1675+
// or into the uninitialized spare capacity otherwise
1676+
unsafe {
1677+
let dst = slice::from_raw_parts_mut(
1678+
self.vec.as_mut_ptr().add(idx) as *mut core::mem::MaybeUninit<u8>,
1679+
ch_len,
1680+
);
1681+
core::char::encode_utf8_raw_unchecked(ch as u32, dst);
1682+
}
16671683

1684+
// SAFETY: `ch_len` initialized bytes have been added
16681685
unsafe {
1669-
ptr::copy(self.vec.as_ptr().add(idx), self.vec.as_mut_ptr().add(idx + amt), len - idx);
1670-
ptr::copy_nonoverlapping(bytes.as_ptr(), self.vec.as_mut_ptr().add(idx), amt);
1671-
self.vec.set_len(len + amt);
1686+
self.vec.set_len(len + ch_len);
16721687
}
16731688
}
16741689

@@ -1697,8 +1712,14 @@ impl String {
16971712
pub fn insert_str(&mut self, idx: usize, string: &str) {
16981713
assert!(self.is_char_boundary(idx));
16991714

1715+
let len = self.len();
1716+
let amt = string.len();
1717+
self.reserve(amt);
1718+
17001719
unsafe {
1701-
self.insert_bytes(idx, string.as_bytes());
1720+
ptr::copy(self.vec.as_ptr().add(idx), self.vec.as_mut_ptr().add(idx + amt), len - idx);
1721+
ptr::copy_nonoverlapping(string.as_ptr(), self.vec.as_mut_ptr().add(idx), amt);
1722+
self.vec.set_len(len + amt);
17021723
}
17031724
}
17041725

library/core/src/char/methods.rs

+57-23
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
//! impl char {}
22
33
use super::*;
4+
use crate::mem::MaybeUninit;
45
use crate::slice;
56
use crate::str::from_utf8_unchecked_mut;
67
use crate::unicode::printable::is_printable;
@@ -1767,33 +1768,66 @@ const fn len_utf8(code: u32) -> usize {
17671768
#[inline]
17681769
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
17691770
let len = len_utf8(code);
1770-
match (len, &mut dst[..]) {
1771-
(1, [a, ..]) => {
1772-
*a = code as u8;
1773-
}
1774-
(2, [a, b, ..]) => {
1775-
*a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
1776-
*b = (code & 0x3F) as u8 | TAG_CONT;
1777-
}
1778-
(3, [a, b, c, ..]) => {
1779-
*a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
1780-
*b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
1781-
*c = (code & 0x3F) as u8 | TAG_CONT;
1782-
}
1783-
(4, [a, b, c, d, ..]) => {
1784-
*a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
1785-
*b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
1786-
*c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
1787-
*d = (code & 0x3F) as u8 | TAG_CONT;
1788-
}
1789-
_ => panic!(
1771+
if dst.len() < len {
1772+
panic!(
17901773
"encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
17911774
len,
17921775
code,
17931776
dst.len(),
1794-
),
1795-
};
1796-
&mut dst[..len]
1777+
);
1778+
}
1779+
// SAFETY: `MaybeUninit<u8>` has the same layout as `u8`, and the callee only ever writes initialized bytes through this slice
1780+
let dst = unsafe { &mut *(dst as *mut [u8] as *mut [MaybeUninit<u8>]) };
1781+
// SAFETY: `dst` has been checked to be long enough to hold the encoded codepoint
1782+
unsafe { encode_utf8_raw_unchecked(code, dst) }
1783+
}
1784+
1785+
/// Encodes a raw u32 value as UTF-8 into the provided possibly uninitialized byte buffer,
1786+
/// and then returns the subslice of the buffer that contains the encoded character.
1787+
///
1788+
/// Unlike `char::encode_utf8`, this function also handles codepoints in the surrogate range.
1789+
/// (Creating a `char` in the surrogate range is UB.)
1790+
/// For a codepoint in the surrogate range, the result is valid [generalized UTF-8] but not valid UTF-8.
1791+
///
1792+
/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
1793+
///
1794+
/// # Safety
1795+
///
1796+
/// The behavior is undefined if the buffer is not large enough to hold the encoded codepoint.
1797+
/// A buffer of length four is large enough to encode any `char`.
1798+
///
1799+
/// For a safe version of this function, see the [`encode_utf8_raw`] function.
1800+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
1801+
#[doc(hidden)]
1802+
#[inline]
1803+
pub unsafe fn encode_utf8_raw_unchecked(code: u32, dst: &mut [MaybeUninit<u8>]) -> &mut [u8] {
1804+
let len = len_utf8(code);
1805+
// SAFETY: the caller must guarantee that `dst` is at least `len` bytes long
1806+
unsafe {
1807+
match len {
1808+
1 => {
1809+
dst.get_unchecked_mut(0).write(code as u8);
1810+
}
1811+
2 => {
1812+
dst.get_unchecked_mut(0).write((code >> 6 & 0x1F) as u8 | TAG_TWO_B);
1813+
dst.get_unchecked_mut(1).write((code & 0x3F) as u8 | TAG_CONT);
1814+
}
1815+
3 => {
1816+
dst.get_unchecked_mut(0).write((code >> 12 & 0x0F) as u8 | TAG_THREE_B);
1817+
dst.get_unchecked_mut(1).write((code >> 6 & 0x3F) as u8 | TAG_CONT);
1818+
dst.get_unchecked_mut(2).write((code & 0x3F) as u8 | TAG_CONT);
1819+
}
1820+
4 => {
1821+
dst.get_unchecked_mut(0).write((code >> 18 & 0x07) as u8 | TAG_FOUR_B);
1822+
dst.get_unchecked_mut(1).write((code >> 12 & 0x3F) as u8 | TAG_CONT);
1823+
dst.get_unchecked_mut(2).write((code >> 6 & 0x3F) as u8 | TAG_CONT);
1824+
dst.get_unchecked_mut(3).write((code & 0x3F) as u8 | TAG_CONT);
1825+
}
1826+
_ => unreachable!(),
1827+
}
1828+
}
1829+
// SAFETY: data has been written to the first `len` bytes
1830+
unsafe { &mut *(dst.get_unchecked_mut(..len) as *mut [MaybeUninit<u8>] as *mut [u8]) }
17971831
}
17981832

17991833
/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,

library/core/src/char/mod.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ pub use self::decode::{DecodeUtf16, DecodeUtf16Error};
3838
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
3939
pub use self::methods::encode_utf16_raw; // perma-unstable
4040
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
41-
pub use self::methods::encode_utf8_raw; // perma-unstable
41+
pub use self::methods::{encode_utf8_raw, encode_utf8_raw_unchecked}; // perma-unstable
4242

4343
#[rustfmt::skip]
4444
use crate::ascii;

0 commit comments

Comments
 (0)