Skip to content

Commit bfa5f27

Browse files
committed
introduce unescape module
Currently, we deal with escape sequences twice: once when we lex a string, and a second time when we unescape literals. This PR aims to remove this duplication by introducing a new `unescape` module as a single source of truth for character escaping rules.
1 parent 9b67bd4 commit bfa5f27

24 files changed

+1047
-769
lines changed

src/librustc_errors/diagnostic_builder.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ impl<'a> DiagnosticBuilder<'a> {
184184
) -> &mut Self);
185185
forward!(pub fn warn(&mut self, msg: &str) -> &mut Self);
186186
forward!(pub fn span_warn<S: Into<MultiSpan>>(&mut self, sp: S, msg: &str) -> &mut Self);
187-
forward!(pub fn help(&mut self , msg: &str) -> &mut Self);
187+
forward!(pub fn help(&mut self, msg: &str) -> &mut Self);
188188
forward!(pub fn span_help<S: Into<MultiSpan>>(&mut self,
189189
sp: S,
190190
msg: &str,

src/libsyntax/parse/lexer/mod.rs

+142-443
Large diffs are not rendered by default.

src/libsyntax/parse/mod.rs

+40-222
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ use log::debug;
1818

1919
use rustc_data_structures::fx::FxHashSet;
2020
use std::borrow::Cow;
21-
use std::iter;
2221
use std::path::{Path, PathBuf};
2322
use std::str;
2423

@@ -33,6 +32,11 @@ pub mod attr;
3332

3433
pub mod classify;
3534

35+
pub(crate) mod unescape;
36+
use unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte, EscapeError};
37+
38+
pub(crate) mod unescape_error_reporting;
39+
3640
/// Info about a parsing session.
3741
pub struct ParseSess {
3842
pub span_diagnostic: Handler,
@@ -306,133 +310,6 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser<'_> {
306310
Parser::new(sess, stream, None, true, false)
307311
}
308312

309-
/// Parses a string representing a character literal into its final form.
310-
/// Rather than just accepting/rejecting a given literal, unescapes it as
311-
/// well. Can take any slice prefixed by a character escape. Returns the
312-
/// character and the number of characters consumed.
313-
fn char_lit(lit: &str, diag: Option<(Span, &Handler)>) -> (char, isize) {
314-
use std::char;
315-
316-
// Handle non-escaped chars first.
317-
if lit.as_bytes()[0] != b'\\' {
318-
// If the first byte isn't '\\' it might part of a multi-byte char, so
319-
// get the char with chars().
320-
let c = lit.chars().next().unwrap();
321-
return (c, 1);
322-
}
323-
324-
// Handle escaped chars.
325-
match lit.as_bytes()[1] as char {
326-
'"' => ('"', 2),
327-
'n' => ('\n', 2),
328-
'r' => ('\r', 2),
329-
't' => ('\t', 2),
330-
'\\' => ('\\', 2),
331-
'\'' => ('\'', 2),
332-
'0' => ('\0', 2),
333-
'x' => {
334-
let v = u32::from_str_radix(&lit[2..4], 16).unwrap();
335-
let c = char::from_u32(v).unwrap();
336-
(c, 4)
337-
}
338-
'u' => {
339-
assert_eq!(lit.as_bytes()[2], b'{');
340-
let idx = lit.find('}').unwrap();
341-
342-
// All digits and '_' are ascii, so treat each byte as a char.
343-
let mut v: u32 = 0;
344-
for c in lit[3..idx].bytes() {
345-
let c = char::from(c);
346-
if c != '_' {
347-
let x = c.to_digit(16).unwrap();
348-
v = v.checked_mul(16).unwrap().checked_add(x).unwrap();
349-
}
350-
}
351-
let c = char::from_u32(v).unwrap_or_else(|| {
352-
if let Some((span, diag)) = diag {
353-
let mut diag = diag.struct_span_err(span, "invalid unicode character escape");
354-
if v > 0x10FFFF {
355-
diag.help("unicode escape must be at most 10FFFF").emit();
356-
} else {
357-
diag.help("unicode escape must not be a surrogate").emit();
358-
}
359-
}
360-
'\u{FFFD}'
361-
});
362-
(c, (idx + 1) as isize)
363-
}
364-
_ => panic!("lexer should have rejected a bad character escape {}", lit)
365-
}
366-
}
367-
368-
/// Parses a string representing a string literal into its final form. Does unescaping.
369-
fn str_lit(lit: &str, diag: Option<(Span, &Handler)>) -> String {
370-
debug!("str_lit: given {}", lit.escape_default());
371-
let mut res = String::with_capacity(lit.len());
372-
373-
let error = |i| format!("lexer should have rejected {} at {}", lit, i);
374-
375-
/// Eat everything up to a non-whitespace.
376-
fn eat<'a>(it: &mut iter::Peekable<str::CharIndices<'a>>) {
377-
loop {
378-
match it.peek().map(|x| x.1) {
379-
Some(' ') | Some('\n') | Some('\r') | Some('\t') => {
380-
it.next();
381-
},
382-
_ => { break; }
383-
}
384-
}
385-
}
386-
387-
let mut chars = lit.char_indices().peekable();
388-
while let Some((i, c)) = chars.next() {
389-
match c {
390-
'\\' => {
391-
let ch = chars.peek().unwrap_or_else(|| {
392-
panic!("{}", error(i))
393-
}).1;
394-
395-
if ch == '\n' {
396-
eat(&mut chars);
397-
} else if ch == '\r' {
398-
chars.next();
399-
let ch = chars.peek().unwrap_or_else(|| {
400-
panic!("{}", error(i))
401-
}).1;
402-
403-
if ch != '\n' {
404-
panic!("lexer accepted bare CR");
405-
}
406-
eat(&mut chars);
407-
} else {
408-
// otherwise, a normal escape
409-
let (c, n) = char_lit(&lit[i..], diag);
410-
for _ in 0..n - 1 { // we don't need to move past the first \
411-
chars.next();
412-
}
413-
res.push(c);
414-
}
415-
},
416-
'\r' => {
417-
let ch = chars.peek().unwrap_or_else(|| {
418-
panic!("{}", error(i))
419-
}).1;
420-
421-
if ch != '\n' {
422-
panic!("lexer accepted bare CR");
423-
}
424-
chars.next();
425-
res.push('\n');
426-
}
427-
c => res.push(c),
428-
}
429-
}
430-
431-
res.shrink_to_fit(); // probably not going to do anything, unless there was an escape.
432-
debug!("parse_str_lit: returning {}", res);
433-
res
434-
}
435-
436313
/// Parses a string representing a raw string literal into its final form. The
437314
/// only operation this does is convert embedded CRLF into a single LF.
438315
fn raw_str_lit(lit: &str) -> String {
@@ -475,9 +352,23 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
475352
use ast::LitKind;
476353

477354
match lit {
478-
token::Byte(i) => (true, Some(LitKind::Byte(byte_lit(&i.as_str()).0))),
479-
token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str(), diag).0))),
480-
token::Err(i) => (true, Some(LitKind::Err(i))),
355+
token::Byte(i) => {
356+
let lit_kind = match unescape_byte(&i.as_str()) {
357+
Ok(c) => LitKind::Byte(c),
358+
Err((_, EscapeError::MoreThanOneChar)) => LitKind::Err(i),
359+
Err(_) => LitKind::Byte(0),
360+
};
361+
(true, Some(lit_kind))
362+
},
363+
token::Char(i) => {
364+
let lit_kind = match unescape_char(&i.as_str()) {
365+
Ok(c) => LitKind::Char(c),
366+
Err((_, EscapeError::MoreThanOneChar)) => LitKind::Err(i),
367+
Err(_) => LitKind::Char('\u{FFFD}'),
368+
};
369+
(true, Some(lit_kind))
370+
},
371+
token::Err(i) => (true, Some(LitKind::Err(i))),
481372

482373
// There are some valid suffixes for integer and float literals,
483374
// so all the handling is done internally.
@@ -491,7 +382,14 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
491382
// string in the Token.
492383
let s = &sym.as_str();
493384
if s.as_bytes().iter().any(|&c| c == b'\\' || c == b'\r') {
494-
sym = Symbol::intern(&str_lit(s, diag));
385+
let mut buf = String::with_capacity(s.len());
386+
unescape_str(s, &mut |_, unescaped_char| {
387+
match unescaped_char {
388+
Ok(c) => buf.push(c),
389+
Err(_) => buf.push('\u{FFFD}'),
390+
}
391+
});
392+
sym = Symbol::intern(&buf)
495393
}
496394
(true, Some(LitKind::Str(sym, ast::StrStyle::Cooked)))
497395
}
@@ -504,7 +402,16 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
504402
(true, Some(LitKind::Str(sym, ast::StrStyle::Raw(n))))
505403
}
506404
token::ByteStr(i) => {
507-
(true, Some(LitKind::ByteStr(byte_str_lit(&i.as_str()))))
405+
let s = &i.as_str();
406+
let mut buf = Vec::with_capacity(s.len());
407+
unescape_byte_str(s, &mut |_, unescaped_byte| {
408+
match unescaped_byte {
409+
Ok(c) => buf.push(c),
410+
Err(_) => buf.push(0),
411+
}
412+
});
413+
buf.shrink_to_fit();
414+
(true, Some(LitKind::ByteStr(Lrc::new(buf))))
508415
}
509416
token::ByteStrRaw(i, _) => {
510417
(true, Some(LitKind::ByteStr(Lrc::new(i.to_string().into_bytes()))))
@@ -559,95 +466,6 @@ fn float_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
559466
filtered_float_lit(Symbol::intern(s), suffix, diag)
560467
}
561468

562-
/// Parses a string representing a byte literal into its final form. Similar to `char_lit`.
563-
fn byte_lit(lit: &str) -> (u8, usize) {
564-
let err = |i| format!("lexer accepted invalid byte literal {} step {}", lit, i);
565-
566-
if lit.len() == 1 {
567-
(lit.as_bytes()[0], 1)
568-
} else {
569-
assert_eq!(lit.as_bytes()[0], b'\\', "{}", err(0));
570-
let b = match lit.as_bytes()[1] {
571-
b'"' => b'"',
572-
b'n' => b'\n',
573-
b'r' => b'\r',
574-
b't' => b'\t',
575-
b'\\' => b'\\',
576-
b'\'' => b'\'',
577-
b'0' => b'\0',
578-
_ => {
579-
match u64::from_str_radix(&lit[2..4], 16).ok() {
580-
Some(c) =>
581-
if c > 0xFF {
582-
panic!(err(2))
583-
} else {
584-
return (c as u8, 4)
585-
},
586-
None => panic!(err(3))
587-
}
588-
}
589-
};
590-
(b, 2)
591-
}
592-
}
593-
594-
fn byte_str_lit(lit: &str) -> Lrc<Vec<u8>> {
595-
let mut res = Vec::with_capacity(lit.len());
596-
597-
let error = |i| panic!("lexer should have rejected {} at {}", lit, i);
598-
599-
/// Eat everything up to a non-whitespace.
600-
fn eat<I: Iterator<Item=(usize, u8)>>(it: &mut iter::Peekable<I>) {
601-
loop {
602-
match it.peek().map(|x| x.1) {
603-
Some(b' ') | Some(b'\n') | Some(b'\r') | Some(b'\t') => {
604-
it.next();
605-
},
606-
_ => { break; }
607-
}
608-
}
609-
}
610-
611-
// byte string literals *must* be ASCII, but the escapes don't have to be
612-
let mut chars = lit.bytes().enumerate().peekable();
613-
loop {
614-
match chars.next() {
615-
Some((i, b'\\')) => {
616-
match chars.peek().unwrap_or_else(|| error(i)).1 {
617-
b'\n' => eat(&mut chars),
618-
b'\r' => {
619-
chars.next();
620-
if chars.peek().unwrap_or_else(|| error(i)).1 != b'\n' {
621-
panic!("lexer accepted bare CR");
622-
}
623-
eat(&mut chars);
624-
}
625-
_ => {
626-
// otherwise, a normal escape
627-
let (c, n) = byte_lit(&lit[i..]);
628-
// we don't need to move past the first \
629-
for _ in 0..n - 1 {
630-
chars.next();
631-
}
632-
res.push(c);
633-
}
634-
}
635-
},
636-
Some((i, b'\r')) => {
637-
if chars.peek().unwrap_or_else(|| error(i)).1 != b'\n' {
638-
panic!("lexer accepted bare CR");
639-
}
640-
chars.next();
641-
res.push(b'\n');
642-
}
643-
Some((_, c)) => res.push(c),
644-
None => break,
645-
}
646-
}
647-
648-
Lrc::new(res)
649-
}
650-
651469
fn integer_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
652470
-> Option<ast::LitKind> {
653471
// s can only be ascii, byte indexing is fine

0 commit comments

Comments
 (0)