@@ -34,6 +34,97 @@ use core::ptr::addr_of;
34
34
35
35
use super :: c;
36
36
37
/// Creates a null-terminated UTF-16 string from a str.
macro_rules! wide_str {
    ($s:expr) => {
        // Append the NUL terminator before encoding so it becomes the
        // final UTF-16 code unit.
        utf16!(concat!($s, '\0'))
    };
}
43
+
44
/// Creates a UTF-16 string from a str without null termination.
macro_rules! utf16 {
    // Note: the triple-underscore names keep these hidden consts from
    // colliding with a caller's consts, which could create const cycles.
    ($s:expr) => {{
        const ___INPUT: &str = $s;
        const ___LEN: usize = crate::sys::pal::windows::api::utf16_len(___INPUT);
        const ___ENCODED: [u16; ___LEN] = crate::sys::pal::windows::api::to_utf16(___INPUT);
        &___ENCODED
    }};
}
54
+
55
/// Gets the UTF-16 length of a UTF-8 string, for use in the wide_str macro.
///
/// Must stay a `const fn`: the macros evaluate it at compile time to size
/// the output array passed to [`to_utf16`].
pub const fn utf16_len(s: &str) -> usize {
    let bytes = s.as_bytes();
    let mut idx = 0;
    let mut units = 0;
    // Walk the string one code point at a time (const fns cannot use
    // iterators, hence the manual while loop).
    while idx < bytes.len() {
        // The byte length of a UTF-8 sequence equals the number of leading
        // one bits in its first byte, except for ASCII (zero leading ones).
        let ones = bytes[idx].leading_ones() as usize;
        let seq_len = if ones == 0 { 1 } else { ones };
        idx += seq_len;
        // Code points above U+FFFF (4-byte UTF-8) need a surrogate pair,
        // i.e. two UTF-16 units; everything else fits in one.
        units += if seq_len < 4 { 1 } else { 2 };
    }
    units
}
72
+
73
/// Const convert UTF-8 to UTF-16, for use in the wide_str macro.
///
/// `UTF16_LEN` is expected to be the value [`utf16_len`] returns for the
/// same string; any excess capacity is left as trailing zeros.
///
/// Note that this is designed for use in const contexts so is not optimized.
pub const fn to_utf16<const UTF16_LEN: usize>(s: &str) -> [u16; UTF16_LEN] {
    let bytes = s.as_bytes();
    let mut out = [0_u16; UTF16_LEN];
    let mut read = 0;
    let mut write = 0;
    while read < bytes.len() {
        // Decode each UTF-8 sequence by the number of leading ones in its
        // first byte. See https://en.wikipedia.org/wiki/UTF-8
        match bytes[read].leading_ones() {
            0 => {
                // ASCII maps directly to a single UTF-16 unit.
                out[write] = bytes[read] as u16;
                read += 1;
                write += 1;
            }
            2 => {
                // Bits: 110xxxxx 10xxxxxx
                let hi = (bytes[read] as u16 & 0b11111) << 6;
                let lo = bytes[read + 1] as u16 & 0b111111;
                out[write] = hi | lo;
                read += 2;
                write += 1;
            }
            3 => {
                // Bits: 1110xxxx 10xxxxxx 10xxxxxx
                out[write] = ((bytes[read] as u16 & 0b1111) << 12)
                    | ((bytes[read + 1] as u16 & 0b111111) << 6)
                    | (bytes[read + 2] as u16 & 0b111111);
                read += 3;
                write += 1;
            }
            4 => {
                // Bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                let code_point = ((bytes[read] as u32 & 0b111) << 18)
                    | ((bytes[read + 1] as u32 & 0b111111) << 12)
                    | ((bytes[read + 2] as u32 & 0b111111) << 6)
                    | (bytes[read + 3] as u32 & 0b111111);
                // Re-encode as a UTF-16 surrogate pair
                // (see https://en.wikipedia.org/wiki/UTF-16):
                // - Subtract 0x10000 from the code point
                // - High surrogate: top 10 bits + 0xD800
                // - Low surrogate: low 10 bits + 0xDC00
                let offset = code_point - 0x10000;
                out[write] = ((offset >> 10) + 0xD800) as u16;
                out[write + 1] = ((offset & 0b1111111111) + 0xDC00) as u16;
                read += 4;
                write += 2;
            }
            // A single leading one marks a continuation byte and five or
            // more is malformed; neither can start a sequence in a valid
            // `&str`, so this arm is impossible.
            _ => unreachable!(),
        }
    }
    out
}
127
+
37
128
/// Helper method for getting the size of `T` as a u32.
38
129
/// Errors at compile time if the size would overflow.
39
130
///
0 commit comments