Skip to content

Commit fe32355

Browse files
committed
Add const UTF-8 to UTF-16 conversion macros
`wide_str!` creates a null-terminated UTF-16 string, whereas `utf16!` creates a UTF-16 string without appending a terminating null.
1 parent 30840c5 commit fe32355

File tree

2 files changed

+94
-2
lines changed

2 files changed

+94
-2
lines changed

library/std/src/sys/pal/windows/api.rs

+91
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,97 @@ use core::ptr::addr_of;
3434

3535
use super::c;
3636

37+
/// Creates a null-terminated UTF-16 string from a str.
///
/// The argument is passed to `concat!` together with `'\0'`, so it must be
/// something `concat!` accepts (in practice a string literal). The
/// concatenated string is then handed to `utf16!`, and this macro evaluates to
/// whatever `utf16!` produces for it.
///
/// Note that the input is not checked for interior nulls; only the trailing
/// null is added.
macro_rules! wide_str {
    ($str:expr) => {
        utf16!(concat!($str, '\0'))
    };
}
43+
44+
/// Creates a UTF-16 string from a str without null termination.
///
/// `$str` must be usable as the initializer of a `const` of type `&str`.
/// The conversion is done entirely in `const` items: the UTF-16 length is
/// computed first with `utf16_len`, then used as the array length for
/// `to_utf16`. The macro evaluates to a reference to the resulting
/// `[u16; N]` const.
macro_rules! utf16 {
    // Note: this macro uses triple underscores to avoid const cycles
    ($str:expr) => {{
        const ___UTF8: &str = $str;
        const ___UTF16_LEN: usize = crate::sys::pal::windows::api::utf16_len(___UTF8);
        const ___UTF16: [u16; ___UTF16_LEN] = crate::sys::pal::windows::api::to_utf16(___UTF8);
        &___UTF16
    }};
}
54+
55+
/// Gets the UTF-16 length of a UTF-8 string, for use in the wide_str macro.
///
/// Returns the number of `u16` code units needed to hold `s` re-encoded as
/// UTF-16. No terminating null is counted beyond what `s` itself contains.
pub const fn utf16_len(s: &str) -> usize {
    let s = s.as_bytes();
    let mut i = 0;
    let mut len = 0;
    while i < s.len() {
        // The length of a UTF-8 encoded code point is given by the number of
        // leading ones in its first byte, except in the case of ASCII (no
        // leading ones, one byte).
        let utf8_len = match s[i].leading_ones() {
            0 => 1,
            n => n as usize,
        };
        i += utf8_len;
        // Sequences of up to three bytes encode code points that fit in a
        // single UTF-16 unit; four-byte sequences need a surrogate pair.
        len += if utf8_len < 4 { 1 } else { 2 };
    }
    len
}
72+
73+
/// Const convert UTF-8 to UTF-16, for use in the wide_str macro.
///
/// Note that this is designed for use in const contexts so is not optimized.
///
/// `UTF16_LEN` is expected to be the UTF-16 length of `s` (see `utf16_len`).
/// The output array starts out zeroed, so if `UTF16_LEN` is larger than
/// needed the remaining elements stay `0`. If it is too small, indexing the
/// output goes out of bounds and panics (a compile error when evaluated in a
/// const context).
pub const fn to_utf16<const UTF16_LEN: usize>(s: &str) -> [u16; UTF16_LEN] {
    let mut output = [0_u16; UTF16_LEN];
    let mut pos = 0;
    let s = s.as_bytes();
    let mut i = 0;
    while i < s.len() {
        match s[i].leading_ones() {
            // Decode UTF-8 based on its length.
            // See https://en.wikipedia.org/wiki/UTF-8
            0 => {
                // ASCII is the same in both encodings
                output[pos] = s[i] as u16;
                i += 1;
                pos += 1;
            }
            2 => {
                // Bits: 110xxxxx 10xxxxxx
                output[pos] = ((s[i] as u16 & 0b11111) << 6) | (s[i + 1] as u16 & 0b111111);
                i += 2;
                pos += 1;
            }
            3 => {
                // Bits: 1110xxxx 10xxxxxx 10xxxxxx
                output[pos] = ((s[i] as u16 & 0b1111) << 12)
                    | ((s[i + 1] as u16 & 0b111111) << 6)
                    | (s[i + 2] as u16 & 0b111111);
                i += 3;
                pos += 1;
            }
            4 => {
                // Bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                let mut c = ((s[i] as u32 & 0b111) << 18)
                    | ((s[i + 1] as u32 & 0b111111) << 12)
                    | ((s[i + 2] as u32 & 0b111111) << 6)
                    | (s[i + 3] as u32 & 0b111111);
                // re-encode as UTF-16 (see https://en.wikipedia.org/wiki/UTF-16)
                // - Subtract 0x10000 from the code point
                // - For the high surrogate, shift right by 10 then add 0xD800
                // - For the low surrogate, take the low 10 bits then add 0xDC00
                c -= 0x10000;
                output[pos] = ((c >> 10) + 0xD800) as u16;
                output[pos + 1] = ((c & 0b1111111111) + 0xDC00) as u16;
                i += 4;
                pos += 2;
            }
            // valid UTF-8 cannot have any other values
            _ => unreachable!(),
        }
    }
    output
}
127+
37128
/// Helper method for getting the size of `T` as a u32.
38129
/// Errors at compile time if the size would overflow.
39130
///

library/std/src/sys/pal/windows/mod.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ pub use self::rand::hashmap_random_keys;
1212
#[macro_use]
1313
pub mod compat;
1414

15+
#[macro_use]
16+
mod api;
17+
1518
pub mod alloc;
1619
pub mod args;
1720
pub mod c;
@@ -41,8 +44,6 @@ cfg_if::cfg_if! {
4144
}
4245
}
4346

44-
mod api;
45-
4647
/// Map a Result<T, WinError> to io::Result<T>.
4748
trait IoResult<T> {
4849
fn io_result(self) -> crate::io::Result<T>;

0 commit comments

Comments
 (0)