Skip to content

Commit b626f8d

Browse files
committed
Auto merge of #120286 - nnethercote:3349-mixed-utf8-literals, r=<try>
Implement RFC 3349, mixed utf8 literals. RFC: rust-lang/rfcs#3349. Tracking issue: #116907. r? `@ghost`
2 parents 68411c9 + 585f313 commit b626f8d

32 files changed

+474
-498
lines changed

compiler/rustc_ast/src/util/literal.rs

+32-70
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
44
use crate::token::{self, Token};
55
use rustc_lexer::unescape::{
6-
byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit,
7-
Mode,
6+
unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode,
87
};
98
use rustc_span::symbol::{kw, sym, Symbol};
109
use rustc_span::Span;
@@ -48,6 +47,10 @@ impl LitKind {
4847
return Err(LitError::InvalidSuffix);
4948
}
5049

50+
// For byte/char/string literals, chars and escapes have already been
51+
// checked in the lexer (in `cook_lexer_literal`). So we can assume all
52+
// chars and escapes are valid here, and ignore `Rfc3349` return
53+
// values.
5154
Ok(match kind {
5255
token::Bool => {
5356
assert!(symbol.is_bool_lit());
@@ -56,12 +59,12 @@ impl LitKind {
5659
token::Byte => {
5760
return unescape_byte(symbol.as_str())
5861
.map(LitKind::Byte)
59-
.map_err(|_| LitError::LexerError);
62+
.map_err(|_| panic!("failed to unescape byte literal"));
6063
}
6164
token::Char => {
6265
return unescape_char(symbol.as_str())
6366
.map(LitKind::Char)
64-
.map_err(|_| LitError::LexerError);
67+
.map_err(|_| panic!("failed to unescape char literal"));
6568
}
6669

6770
// There are some valid suffixes for integer and float literals,
@@ -77,113 +80,72 @@ impl LitKind {
7780
let s = symbol.as_str();
7881
// Vanilla strings are so common we optimize for the common case where no chars
7982
// requiring special behaviour are present.
80-
let symbol = if s.contains(['\\', '\r']) {
83+
let symbol = if s.contains('\\') {
8184
let mut buf = String::with_capacity(s.len());
82-
let mut error = Ok(());
8385
// Force-inlining here is aggressive but the closure is
84-
// called on every char in the string, so it can be
85-
// hot in programs with many long strings.
86-
unescape_literal(
86+
// called on every char in the string, so it can be hot in
87+
// programs with many long strings containing escapes.
88+
_ = unescape_unicode(
8789
s,
8890
Mode::Str,
8991
&mut #[inline(always)]
90-
|_, unescaped_char| match unescaped_char {
92+
|_, c| match c {
9193
Ok(c) => buf.push(c),
9294
Err(err) => {
93-
if err.is_fatal() {
94-
error = Err(LitError::LexerError);
95-
}
95+
assert!(!err.is_fatal(), "failed to unescape string literal")
9696
}
9797
},
9898
);
99-
error?;
10099
Symbol::intern(&buf)
101100
} else {
102101
symbol
103102
};
104103
LitKind::Str(symbol, ast::StrStyle::Cooked)
105104
}
106105
token::StrRaw(n) => {
107-
// Raw strings have no escapes, so we only need to check for invalid chars, and we
108-
// can reuse the symbol on success.
109-
let mut error = Ok(());
110-
unescape_literal(symbol.as_str(), Mode::RawStr, &mut |_, unescaped_char| {
111-
match unescaped_char {
112-
Ok(_) => {}
113-
Err(err) => {
114-
if err.is_fatal() {
115-
error = Err(LitError::LexerError);
116-
}
117-
}
118-
}
119-
});
120-
error?;
106+
// Raw strings have no escapes so no work is needed here.
121107
LitKind::Str(symbol, ast::StrStyle::Raw(n))
122108
}
123109
token::ByteStr => {
124110
let s = symbol.as_str();
125111
let mut buf = Vec::with_capacity(s.len());
126-
let mut error = Ok(());
127-
unescape_literal(s, Mode::ByteStr, &mut |_, c| match c {
128-
Ok(c) => buf.push(byte_from_char(c)),
112+
_ = unescape_mixed(s, Mode::ByteStr, &mut |_, c| match c {
113+
Ok(MixedUnit::Char(c)) => {
114+
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
115+
}
116+
Ok(MixedUnit::HighByte(b)) => buf.push(b),
129117
Err(err) => {
130-
if err.is_fatal() {
131-
error = Err(LitError::LexerError);
132-
}
118+
assert!(!err.is_fatal(), "failed to unescape string literal")
133119
}
134120
});
135-
error?;
136121
LitKind::ByteStr(buf.into(), StrStyle::Cooked)
137122
}
138123
token::ByteStrRaw(n) => {
139-
// Raw strings have no escapes, so we only need to check for invalid chars, and we
140-
// can convert the symbol directly to a `Lrc<u8>` on success.
141-
let s = symbol.as_str();
142-
let mut error = Ok(());
143-
unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c {
144-
Ok(_) => {}
145-
Err(err) => {
146-
if err.is_fatal() {
147-
error = Err(LitError::LexerError);
148-
}
149-
}
150-
});
151-
LitKind::ByteStr(s.to_owned().into_bytes().into(), StrStyle::Raw(n))
124+
// Raw strings have no escapes so we can convert the symbol
125+
// directly to a `Lrc<[u8]>`.
126+
let buf = symbol.as_str().to_owned().into_bytes();
127+
LitKind::ByteStr(buf.into(), StrStyle::Raw(n))
152128
}
153129
token::CStr => {
154130
let s = symbol.as_str();
155131
let mut buf = Vec::with_capacity(s.len());
156-
let mut error = Ok(());
157-
unescape_c_string(s, Mode::CStr, &mut |_span, c| match c {
158-
Ok(CStrUnit::Byte(b)) => buf.push(b),
159-
Ok(CStrUnit::Char(c)) => {
132+
_ = unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
133+
Ok(MixedUnit::Char(c)) => {
160134
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
161135
}
136+
Ok(MixedUnit::HighByte(b)) => buf.push(b),
162137
Err(err) => {
163-
if err.is_fatal() {
164-
error = Err(LitError::LexerError);
165-
}
138+
assert!(!err.is_fatal(), "failed to unescape C string literal")
166139
}
167140
});
168-
error?;
169141
buf.push(0);
170142
LitKind::CStr(buf.into(), StrStyle::Cooked)
171143
}
172144
token::CStrRaw(n) => {
173-
// Raw strings have no escapes, so we only need to check for invalid chars, and we
174-
// can convert the symbol directly to a `Lrc<u8>` on success.
175-
let s = symbol.as_str();
176-
let mut error = Ok(());
177-
unescape_c_string(s, Mode::RawCStr, &mut |_, c| match c {
178-
Ok(_) => {}
179-
Err(err) => {
180-
if err.is_fatal() {
181-
error = Err(LitError::LexerError);
182-
}
183-
}
184-
});
185-
error?;
186-
let mut buf = s.to_owned().into_bytes();
145+
// Raw strings have no escapes so we can convert the symbol
146+
// directly to a `Lrc<[u8]>` after appending the terminating NUL
147+
// char.
148+
let mut buf = symbol.as_str().to_owned().into_bytes();
187149
buf.push(0);
188150
LitKind::CStr(buf.into(), StrStyle::Raw(n))
189151
}

compiler/rustc_ast_passes/src/feature_gate.rs

+1
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,7 @@ pub fn check_crate(krate: &ast::Crate, sess: &Session, features: &Features) {
508508
}
509509
};
510510
}
511+
gate_all!(mixed_utf8_literals, r#"mixed utf8 b"" and br"" literals are experimental"#);
511512
gate_all!(
512513
if_let_guard,
513514
"`if let` guards are experimental",

compiler/rustc_feature/src/unstable.rs

+2
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,8 @@ declare_features! (
520520
/// standard library until the soundness issues with specialization
521521
/// are fixed.
522522
(unstable, min_specialization, "1.7.0", Some(31844)),
523+
/// Allows mixed utf8 b"" and br"" literals.
524+
(unstable, mixed_utf8_literals, "CURRENT_RUSTC_VERSION", Some(116907)),
523525
/// Allows qualified paths in struct expressions, struct patterns and tuple struct patterns.
524526
(unstable, more_qualified_paths, "1.54.0", Some(86935)),
525527
/// Allows the `#[must_not_suspend]` attribute.

0 commit comments

Comments
 (0)