|
1 | 1 | //! impl char {}
|
2 | 2 |
|
3 | 3 | use super::*;
|
| 4 | +use crate::mem::MaybeUninit; |
4 | 5 | use crate::slice;
|
5 | 6 | use crate::str::from_utf8_unchecked_mut;
|
6 | 7 | use crate::unicode::printable::is_printable;
|
@@ -1767,33 +1768,66 @@ const fn len_utf8(code: u32) -> usize {
|
1767 | 1768 | #[inline]
|
1768 | 1769 | pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
|
1769 | 1770 | let len = len_utf8(code);
|
1770 |
| - match (len, &mut dst[..]) { |
1771 |
| - (1, [a, ..]) => { |
1772 |
| - *a = code as u8; |
1773 |
| - } |
1774 |
| - (2, [a, b, ..]) => { |
1775 |
| - *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; |
1776 |
| - *b = (code & 0x3F) as u8 | TAG_CONT; |
1777 |
| - } |
1778 |
| - (3, [a, b, c, ..]) => { |
1779 |
| - *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; |
1780 |
| - *b = (code >> 6 & 0x3F) as u8 | TAG_CONT; |
1781 |
| - *c = (code & 0x3F) as u8 | TAG_CONT; |
1782 |
| - } |
1783 |
| - (4, [a, b, c, d, ..]) => { |
1784 |
| - *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; |
1785 |
| - *b = (code >> 12 & 0x3F) as u8 | TAG_CONT; |
1786 |
| - *c = (code >> 6 & 0x3F) as u8 | TAG_CONT; |
1787 |
| - *d = (code & 0x3F) as u8 | TAG_CONT; |
1788 |
| - } |
1789 |
| - _ => panic!( |
| 1771 | + if dst.len() < len { |
| 1772 | + panic!( |
1790 | 1773 | "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}",
|
1791 | 1774 | len,
|
1792 | 1775 | code,
|
1793 | 1776 | dst.len(),
|
1794 |
| - ), |
1795 |
| - }; |
1796 |
| - &mut dst[..len] |
| 1777 | + ); |
| 1778 | + } |
| 1779 | + // SAFETY: it's safe to pretend that the bytes in the slice may be uninitialized |
| 1780 | + let dst = unsafe { &mut *(dst as *mut [u8] as *mut [MaybeUninit<u8>]) }; |
| 1781 | + // SAFETY: `dst` has been checked to be long enough to hold the encoded codepoint |
| 1782 | + unsafe { encode_utf8_raw_unchecked(code, dst) } |
| 1783 | +} |
| 1784 | + |
| 1785 | +/// Encodes a raw u32 value as UTF-8 into the provided possibly uninitialized byte buffer, |
| 1786 | +/// and then returns the subslice of the buffer that contains the encoded character. |
| 1787 | +/// |
| 1788 | +/// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range. |
| 1789 | +/// (Creating a `char` in the surrogate range is UB.) |
| 1790 | +/// The result is valid [generalized UTF-8] but not valid UTF-8. |
| 1791 | +/// |
| 1792 | +/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8 |
| 1793 | +/// |
| 1794 | +/// # Safety |
| 1795 | +/// |
| 1796 | +/// The behavior is undefined if the buffer is not large enough to hold the encoded codepoint. |
| 1797 | +/// A buffer of length four is large enough to encode any `char`. |
| 1798 | +/// |
| 1799 | +/// For a safe version of this function, see the [`encode_utf8_raw`] function. |
| 1800 | +#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] |
| 1801 | +#[doc(hidden)] |
| 1802 | +#[inline] |
| 1803 | +pub unsafe fn encode_utf8_raw_unchecked(code: u32, dst: &mut [MaybeUninit<u8>]) -> &mut [u8] { |
| 1804 | + let len = len_utf8(code); |
| 1805 | + // SAFETY: the caller must guarantee that `dst` is at least `len` bytes long |
| 1806 | + unsafe { |
| 1807 | + match len { |
| 1808 | + 1 => { |
| 1809 | + dst.get_unchecked_mut(0).write(code as u8); |
| 1810 | + } |
| 1811 | + 2 => { |
| 1812 | + dst.get_unchecked_mut(0).write((code >> 6 & 0x1F) as u8 | TAG_TWO_B); |
| 1813 | + dst.get_unchecked_mut(1).write((code & 0x3F) as u8 | TAG_CONT); |
| 1814 | + } |
| 1815 | + 3 => { |
| 1816 | + dst.get_unchecked_mut(0).write((code >> 12 & 0x0F) as u8 | TAG_THREE_B); |
| 1817 | + dst.get_unchecked_mut(1).write((code >> 6 & 0x3F) as u8 | TAG_CONT); |
| 1818 | + dst.get_unchecked_mut(2).write((code & 0x3F) as u8 | TAG_CONT); |
| 1819 | + } |
| 1820 | + 4 => { |
| 1821 | + dst.get_unchecked_mut(0).write((code >> 18 & 0x07) as u8 | TAG_FOUR_B); |
| 1822 | + dst.get_unchecked_mut(1).write((code >> 12 & 0x3F) as u8 | TAG_CONT); |
| 1823 | + dst.get_unchecked_mut(2).write((code >> 6 & 0x3F) as u8 | TAG_CONT); |
| 1824 | + dst.get_unchecked_mut(3).write((code & 0x3F) as u8 | TAG_CONT); |
| 1825 | + } |
| 1826 | + _ => unreachable!(), |
| 1827 | + } |
| 1828 | + } |
| 1829 | + // SAFETY: data has been written to the first `len` bytes |
| 1830 | + unsafe { &mut *(dst.get_unchecked_mut(..len) as *mut [MaybeUninit<u8>] as *mut [u8]) } |
1797 | 1831 | }
|
1798 | 1832 |
|
1799 | 1833 | /// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
|
|
0 commit comments