Skip to content

Commit 43b0a00

Browse files
committed
Use Peekable in lexer_unescape
Use a `Peekable<CharIndices<'_>>` instead of going back and forth between a string slice and a `Chars` iterator. This gets rid of most position computations and allows removal of the double traversal that was needed to handle backslash-newline escapes correctly in `skip_ascii_whitespace`. Also improves documentation.
1 parent 8239a37 commit 43b0a00

File tree

1 file changed

+37
-41
lines changed

1 file changed

+37
-41
lines changed

compiler/rustc_lexer/src/unescape.rs

+37-41
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
//! Utilities for validating string and char literals and turning them into
22
//! values they represent.
33
4+
use std::iter::{Peekable, from_fn};
45
use std::ops::Range;
5-
use std::str::Chars;
6+
use std::str::CharIndices;
67

78
use Mode::*;
89

@@ -231,7 +232,7 @@ impl Mode {
231232
}
232233

233234
fn scan_escape<T: From<char> + From<u8>>(
234-
chars: &mut Chars<'_>,
235+
chars: &mut impl Iterator<Item = char>,
235236
mode: Mode,
236237
) -> Result<T, EscapeError> {
237238
// Previous character was '\\', unescape what follows.
@@ -268,7 +269,10 @@ fn scan_escape<T: From<char> + From<u8>>(
268269
Ok(T::from(res))
269270
}
270271

271-
fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
272+
fn scan_unicode(
273+
chars: &mut impl Iterator<Item = char>,
274+
allow_unicode_escapes: bool,
275+
) -> Result<char, EscapeError> {
272276
// We've parsed '\u', now we have to parse '{..}'.
273277

274278
if chars.next() != Some('{') {
@@ -326,7 +330,10 @@ fn ascii_check(c: char, allow_unicode_chars: bool) -> Result<char, EscapeError>
326330
if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) }
327331
}
328332

329-
fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
333+
fn unescape_char_or_byte(
334+
chars: &mut impl Iterator<Item = char>,
335+
mode: Mode,
336+
) -> Result<char, EscapeError> {
330337
let c = chars.next().ok_or(EscapeError::ZeroChars)?;
331338
let res = match c {
332339
'\\' => scan_escape(chars, mode),
@@ -346,63 +353,52 @@ fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, c
346353
where
347354
F: FnMut(Range<usize>, Result<T, EscapeError>),
348355
{
349-
let mut chars = src.chars();
350356
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
351357

352-
// The `start` and `end` computation here is complicated because
353-
// `skip_ascii_whitespace` makes us to skip over chars without counting
354-
// them in the range computation.
355-
while let Some(c) = chars.next() {
356-
let start = src.len() - chars.as_str().len() - c.len_utf8();
358+
let mut chars = src.char_indices().peekable();
359+
while let Some((start, c)) = chars.next() {
357360
let res = match c {
358-
'\\' => {
359-
match chars.clone().next() {
360-
Some('\n') => {
361-
// Rust language specification requires us to skip whitespaces
362-
// if unescaped '\' character is followed by '\n'.
363-
// For details see [Rust language reference]
364-
// (https://doc.rust-lang.org/reference/tokens.html#string-literals).
365-
skip_ascii_whitespace(&mut chars, start, &mut |range, err| {
366-
callback(range, Err(err))
367-
});
368-
continue;
369-
}
370-
_ => scan_escape::<T>(&mut chars, mode),
371-
}
361+
// skip whitespace for backslash newline, see [Rust language reference]
362+
// (https://doc.rust-lang.org/reference/tokens.html#string-literals).
363+
'\\' if chars.next_if(|&(_, c)| c == '\n').is_some() => {
364+
let mut callback_err = |range, err| callback(range, Err(err));
365+
skip_ascii_whitespace(&mut chars, start, &mut callback_err);
366+
continue;
372367
}
368+
'\\' => scan_escape::<T>(&mut from_fn(|| chars.next().map(|i| i.1)), mode),
373369
'"' => Err(EscapeError::EscapeOnlyChar),
374370
'\r' => Err(EscapeError::BareCarriageReturn),
375371
_ => ascii_check(c, allow_unicode_chars).map(T::from),
376372
};
377-
let end = src.len() - chars.as_str().len();
373+
let end = chars.peek().map(|&(end, _)| end).unwrap_or(src.len());
378374
callback(start..end, res);
379375
}
380376
}
381377

382-
fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
378+
/// Skip ASCII whitespace, except for the formfeed character
379+
/// (see [this issue](https://github.com/rust-lang/rust/issues/136600)).
380+
/// Warns on unescaped newline and following non-ASCII whitespace.
381+
fn skip_ascii_whitespace<F>(chars: &mut Peekable<CharIndices<'_>>, start: usize, callback: &mut F)
383382
where
384383
F: FnMut(Range<usize>, EscapeError),
385384
{
386-
let tail = chars.as_str();
387-
let first_non_space = tail
388-
.bytes()
389-
.position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
390-
.unwrap_or(tail.len());
391-
if tail[1..first_non_space].contains('\n') {
392-
// The +1 accounts for the escaping slash.
393-
let end = start + first_non_space + 1;
385+
// the escaping slash and newline characters add 2 bytes
386+
let mut end = start + 2;
387+
let mut contains_nl = false;
388+
while let Some((_, c)) = chars.next_if(|&(_, c)| c.is_ascii_whitespace() && c != '\x0c') {
389+
end += 1;
390+
contains_nl = contains_nl || c == '\n';
391+
}
392+
393+
if contains_nl {
394394
callback(start..end, EscapeError::MultipleSkippedLinesWarning);
395395
}
396-
let tail = &tail[first_non_space..];
397-
if let Some(c) = tail.chars().next() {
396+
if let Some((_, c)) = chars.peek() {
398397
if c.is_whitespace() {
399-
// For error reporting, we would like the span to contain the character that was not
400-
// skipped. The +1 is necessary to account for the leading \ that started the escape.
401-
let end = start + first_non_space + c.len_utf8() + 1;
402-
callback(start..end, EscapeError::UnskippedWhitespaceWarning);
398+
// for error reporting, include the character that was not skipped in the span
399+
callback(start..end + c.len_utf8(), EscapeError::UnskippedWhitespaceWarning);
403400
}
404401
}
405-
*chars = tail.chars();
406402
}
407403

408404
/// Takes a contents of a string literal (without quotes) and produces a

0 commit comments

Comments
 (0)