Skip to content

Commit bc4ce4a

Browse files
committed
XXX: gate
1 parent 73a7193 commit bc4ce4a

File tree

4 files changed

+82
-66
lines changed

4 files changed

+82
-66
lines changed

compiler/rustc_ast/src/util/literal.rs

+5-5
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ impl LitKind {
4949

5050
// For byte/char/string literals, chars and escapes have already been
5151
// checked in the lexer (in `cook_lexer_literal`). So we can assume all
52-
// chars and escapes are valid here.
52+
// chars and escapes are valid here; we can also ignore the `Rfc3349` return
53+
// values.
5354
Ok(match kind {
5455
token::Bool => {
5556
assert!(symbol.is_bool_lit());
@@ -84,7 +85,7 @@ impl LitKind {
8485
// Force-inlining here is aggressive but the closure is
8586
// called on every char in the string, so it can be hot in
8687
// programs with many long strings containing escapes.
87-
unescape_non_mixed(
88+
_ = unescape_non_mixed(
8889
s,
8990
Mode::Str,
9091
&mut #[inline(always)]
@@ -111,8 +112,7 @@ impl LitKind {
111112
// We can just use `rfc3349 = true` here, which is more
112113
// permissive than `rfc3349 = false`, because escapes and
113114
// chars were checked by the lexer.
114-
let rfc3349 = true;
115-
unescape_mixed(s, Mode::ByteStr { rfc3349 }, &mut |_, c| match c {
115+
_ = unescape_mixed(s, Mode::ByteStr { rfc3349: true }, &mut |_, c| match c {
116116
Ok(MixedUnit::Char(c)) => {
117117
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
118118
}
@@ -132,7 +132,7 @@ impl LitKind {
132132
token::CStr => {
133133
let s = symbol.as_str();
134134
let mut buf = Vec::with_capacity(s.len());
135-
unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
135+
_ = unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
136136
Ok(MixedUnit::Char(c)) => {
137137
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
138138
}

compiler/rustc_lexer/src/unescape.rs

+53-40
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ impl EscapeError {
8888
///
8989
/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
9090
/// the callback will be called exactly once.
91-
pub fn unescape_non_mixed<F>(src: &str, mode: Mode, callback: &mut F)
91+
pub fn unescape_non_mixed<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
9292
where
9393
F: FnMut(Range<usize>, Result<char, EscapeError>),
9494
{
@@ -97,6 +97,7 @@ where
9797
let mut chars = src.chars();
9898
let res = unescape_char_or_byte(&mut chars, mode);
9999
callback(0..(src.len() - chars.as_str().len()), res);
100+
Rfc3349::Unused // rfc3349 never triggered by char or byte literals
100101
}
101102
Str => unescape_non_raw_common(src, mode, callback),
102103
RawStr => check_raw_common(src, mode, callback),
@@ -107,7 +108,7 @@ where
107108
result = Err(EscapeError::NulInCStr);
108109
}
109110
callback(r, result)
110-
});
111+
})
111112
}
112113
ByteStr { .. } | CStr => unreachable!(),
113114
}
@@ -148,7 +149,7 @@ impl From<u8> for MixedUnit {
148149
/// a sequence of escaped characters or errors.
149150
///
150151
/// Values are returned by invoking `callback`.
151-
pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
152+
pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
152153
where
153154
F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
154155
{
@@ -160,7 +161,7 @@ where
160161
result = Err(EscapeError::NulInCStr);
161162
}
162163
callback(r, result)
163-
});
164+
})
164165
}
165166
Char | Byte | Str | RawStr | RawByteStr { .. } | RawCStr => unreachable!(),
166167
}
@@ -178,6 +179,15 @@ pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
178179
unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char)
179180
}
180181

182+
/// Used to indicate if rfc3349 (mixed-utf8-literals) was required for the
183+
/// literal to be valid.
184+
#[derive(Debug, PartialEq)]
185+
#[must_use]
186+
pub enum Rfc3349 {
187+
Used,
188+
Unused,
189+
}
190+
181191
/// What kind of literal do we parse.
182192
#[derive(Debug, Clone, Copy, PartialEq)]
183193
pub enum Mode {
@@ -214,24 +224,24 @@ impl Mode {
214224

215225
/// Are unicode (non-ASCII) chars allowed?
216226
#[inline]
217-
fn allow_unicode_chars(self) -> bool {
227+
fn allow_unicode_chars(self, rfc3349: &mut Rfc3349) -> bool {
218228
match self {
219-
Byte | ByteStr { rfc3349: false } | RawByteStr { rfc3349: false } => false,
229+
Byte => false,
230+
ByteStr { .. } | RawByteStr { .. } => { *rfc3349 = Rfc3349::Used; true }
220231
Char
221232
| Str
222233
| RawStr
223-
| ByteStr { rfc3349: true }
224-
| RawByteStr { rfc3349: true }
225234
| CStr
226235
| RawCStr => true,
227236
}
228237
}
229238

230239
/// Are unicode escapes (`\u`) allowed?
231-
fn allow_unicode_escapes(self) -> bool {
240+
fn allow_unicode_escapes(self, rfc3349: &mut Rfc3349) -> bool {
232241
match self {
233-
Byte | ByteStr { rfc3349: false } => false,
234-
Char | Str | ByteStr { rfc3349: true } | CStr => true,
242+
Byte => false,
243+
ByteStr { .. } => { *rfc3349 = Rfc3349::Used; true }
244+
Char | Str | CStr => true,
235245
RawByteStr { .. } | RawStr | RawCStr => unreachable!(),
236246
}
237247
}
@@ -245,9 +255,12 @@ impl Mode {
245255
}
246256
}
247257

258+
// The `rfc3349` out-parameter is set to `Rfc3349::Used` if rfc3349 must be
259+
// enabled for the escape to be accepted.
248260
fn scan_escape<T: From<char> + From<u8>>(
249261
chars: &mut Chars<'_>,
250262
mode: Mode,
263+
rfc3349: &mut Rfc3349,
251264
) -> Result<T, EscapeError> {
252265
// Previous character was '\\', unescape what follows.
253266
let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? {
@@ -277,15 +290,17 @@ fn scan_escape<T: From<char> + From<u8>>(
277290
Ok(T::from(value as u8))
278291
};
279292
}
280-
// njn: gate: is it a ByteStr?
281-
'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from),
293+
'u' => {
294+
// njn: convert all mode matches back to equality checks
295+
return scan_unicode(chars, mode, rfc3349).map(T::from);
296+
}
282297
_ => return Err(EscapeError::InvalidEscape),
283298
};
284299
Ok(T::from(res))
285300
}
286301

287302
// njn: change arg to mode in precursor?
288-
fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
303+
fn scan_unicode(chars: &mut Chars<'_>, mode: Mode, rfc3349: &mut Rfc3349) -> Result<char, EscapeError> {
289304
// We've parsed '\u', now we have to parse '{..}'.
290305

291306
if chars.next() != Some('{') {
@@ -313,7 +328,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
313328

314329
// Incorrect syntax has higher priority for error reporting
315330
// than unallowed value for a literal.
316-
if !allow_unicode_escapes {
331+
if !mode.allow_unicode_escapes(rfc3349) {
317332
return Err(EscapeError::UnicodeEscapeInByte);
318333
}
319334

@@ -339,19 +354,28 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
339354
}
340355

341356
#[inline]
342-
fn ascii_check(c: char, allow_unicode_chars: bool) -> Result<char, EscapeError> {
343-
if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) }
357+
fn ascii_check(c: char, mode: Mode, rfc3349: &mut Rfc3349) -> Result<char, EscapeError> {
358+
// Note: we must check `is_ascii` first, to avoid setting `rfc3349` unnecessarily.
359+
if c.is_ascii() || mode.allow_unicode_chars(rfc3349) {
360+
Ok(c)
361+
} else {
362+
Err(EscapeError::NonAsciiCharInByte)
363+
}
344364
}
345365

346366
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
347367
let c = chars.next().ok_or(EscapeError::ZeroChars)?;
368+
let mut rfc3349 = Rfc3349::Unused;
348369
let res = match c {
349-
'\\' => scan_escape(chars, mode),
370+
'\\' => scan_escape(chars, mode, &mut rfc3349),
350371
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
351372
'\r' => Err(EscapeError::BareCarriageReturn),
352-
// njn: this is the only ascii_check that will remain
353-
_ => ascii_check(c, mode.allow_unicode_chars()),
373+
_ => ascii_check(c, mode, &mut rfc3349),
354374
}?;
375+
376+
// rfc3349 cannot be triggered for char or byte literals.
377+
assert_eq!(rfc3349, Rfc3349::Unused);
378+
355379
if chars.next().is_some() {
356380
return Err(EscapeError::MoreThanOneChar);
357381
}
@@ -360,12 +384,12 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
360384

361385
/// Takes the contents of a string literal (without quotes) and produces a
362386
/// sequence of escaped characters or errors.
363-
fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
387+
fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
364388
where
365389
F: FnMut(Range<usize>, Result<T, EscapeError>),
366390
{
367391
let mut chars = src.chars();
368-
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
392+
let mut rfc3349 = Rfc3349::Unused;
369393

370394
// The `start` and `end` computation here is complicated because
371395
// `skip_ascii_whitespace` makes us skip over chars without counting
@@ -385,20 +409,17 @@ where
385409
});
386410
continue;
387411
}
388-
_ => scan_escape::<T>(&mut chars, mode),
412+
_ => scan_escape::<T>(&mut chars, mode, &mut rfc3349),
389413
}
390414
}
391415
'"' => Err(EscapeError::EscapeOnlyChar),
392416
'\r' => Err(EscapeError::BareCarriageReturn),
393-
394-
// njn: gate, similar to check_raw_common, check:
395-
// - is it a ByteStr AND does it contain a unicode char
396-
397-
_ => ascii_check(c, allow_unicode_chars).map(T::from),
417+
_ => ascii_check(c, mode, &mut rfc3349).map(T::from),
398418
};
399419
let end = src.len() - chars.as_str().len();
400420
callback(start..end, res);
401421
}
422+
rfc3349
402423
}
403424

404425
fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
@@ -431,12 +452,12 @@ where
431452
/// sequence of characters or errors.
432453
/// NOTE: Raw strings do not perform any explicit character escaping, here we
433454
/// only produce errors on bare CR.
434-
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
455+
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
435456
where
436457
F: FnMut(Range<usize>, Result<char, EscapeError>),
437458
{
438459
let mut chars = src.chars();
439-
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
460+
let mut rfc3349 = Rfc3349::Unused;
440461

441462
// The `start` and `end` computation here matches the one in
442463
// `unescape_non_raw_common` for consistency, even though this function
@@ -445,20 +466,12 @@ where
445466
let start = src.len() - chars.as_str().len() - c.len_utf8();
446467
let res = match c {
447468
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
448-
449-
// njn: gate: need to somehow return an indication of whether
450-
// rfc3349 unicode char allowance was required for this literal,
451-
// i.e. check
452-
// - is it a RawByteStr AND does it contain a unicode char
453-
//
454-
// njn: but the ascii_check itself isn't necessary
455-
// - or make it return three values? ok, ok-with-3349, bad?
456-
457-
_ => ascii_check(c, allow_unicode_chars),
469+
_ => ascii_check(c, mode, &mut rfc3349),
458470
};
459471
let end = src.len() - chars.as_str().len();
460472
callback(start..end, res);
461473
}
474+
rfc3349
462475
}
463476

464477
#[inline]

0 commit comments

Comments
 (0)