@@ -18,7 +18,6 @@ use log::debug;
18
18
19
19
use rustc_data_structures:: fx:: FxHashSet ;
20
20
use std:: borrow:: Cow ;
21
- use std:: iter;
22
21
use std:: path:: { Path , PathBuf } ;
23
22
use std:: str;
24
23
@@ -33,6 +32,11 @@ pub mod attr;
33
32
34
33
pub mod classify;
35
34
35
+ pub ( crate ) mod unescape;
36
+ use unescape:: { unescape_str, unescape_char, unescape_byte_str, unescape_byte, EscapeError } ;
37
+
38
+ pub ( crate ) mod unescape_error_reporting;
39
+
36
40
/// Info about a parsing session.
37
41
pub struct ParseSess {
38
42
pub span_diagnostic : Handler ,
@@ -306,133 +310,6 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser<'_> {
306
310
Parser :: new ( sess, stream, None , true , false )
307
311
}
308
312
309
- /// Parses a string representing a character literal into its final form.
310
- /// Rather than just accepting/rejecting a given literal, unescapes it as
311
- /// well. Can take any slice prefixed by a character escape. Returns the
312
- /// character and the number of characters consumed.
313
- fn char_lit ( lit : & str , diag : Option < ( Span , & Handler ) > ) -> ( char , isize ) {
314
- use std:: char;
315
-
316
- // Handle non-escaped chars first.
317
- if lit. as_bytes ( ) [ 0 ] != b'\\' {
318
- // If the first byte isn't '\\' it might part of a multi-byte char, so
319
- // get the char with chars().
320
- let c = lit. chars ( ) . next ( ) . unwrap ( ) ;
321
- return ( c, 1 ) ;
322
- }
323
-
324
- // Handle escaped chars.
325
- match lit. as_bytes ( ) [ 1 ] as char {
326
- '"' => ( '"' , 2 ) ,
327
- 'n' => ( '\n' , 2 ) ,
328
- 'r' => ( '\r' , 2 ) ,
329
- 't' => ( '\t' , 2 ) ,
330
- '\\' => ( '\\' , 2 ) ,
331
- '\'' => ( '\'' , 2 ) ,
332
- '0' => ( '\0' , 2 ) ,
333
- 'x' => {
334
- let v = u32:: from_str_radix ( & lit[ 2 ..4 ] , 16 ) . unwrap ( ) ;
335
- let c = char:: from_u32 ( v) . unwrap ( ) ;
336
- ( c, 4 )
337
- }
338
- 'u' => {
339
- assert_eq ! ( lit. as_bytes( ) [ 2 ] , b'{' ) ;
340
- let idx = lit. find ( '}' ) . unwrap ( ) ;
341
-
342
- // All digits and '_' are ascii, so treat each byte as a char.
343
- let mut v: u32 = 0 ;
344
- for c in lit[ 3 ..idx] . bytes ( ) {
345
- let c = char:: from ( c) ;
346
- if c != '_' {
347
- let x = c. to_digit ( 16 ) . unwrap ( ) ;
348
- v = v. checked_mul ( 16 ) . unwrap ( ) . checked_add ( x) . unwrap ( ) ;
349
- }
350
- }
351
- let c = char:: from_u32 ( v) . unwrap_or_else ( || {
352
- if let Some ( ( span, diag) ) = diag {
353
- let mut diag = diag. struct_span_err ( span, "invalid unicode character escape" ) ;
354
- if v > 0x10FFFF {
355
- diag. help ( "unicode escape must be at most 10FFFF" ) . emit ( ) ;
356
- } else {
357
- diag. help ( "unicode escape must not be a surrogate" ) . emit ( ) ;
358
- }
359
- }
360
- '\u{FFFD}'
361
- } ) ;
362
- ( c, ( idx + 1 ) as isize )
363
- }
364
- _ => panic ! ( "lexer should have rejected a bad character escape {}" , lit)
365
- }
366
- }
367
-
368
- /// Parses a string representing a string literal into its final form. Does unescaping.
369
- fn str_lit ( lit : & str , diag : Option < ( Span , & Handler ) > ) -> String {
370
- debug ! ( "str_lit: given {}" , lit. escape_default( ) ) ;
371
- let mut res = String :: with_capacity ( lit. len ( ) ) ;
372
-
373
- let error = |i| format ! ( "lexer should have rejected {} at {}" , lit, i) ;
374
-
375
- /// Eat everything up to a non-whitespace.
376
- fn eat < ' a > ( it : & mut iter:: Peekable < str:: CharIndices < ' a > > ) {
377
- loop {
378
- match it. peek ( ) . map ( |x| x. 1 ) {
379
- Some ( ' ' ) | Some ( '\n' ) | Some ( '\r' ) | Some ( '\t' ) => {
380
- it. next ( ) ;
381
- } ,
382
- _ => { break ; }
383
- }
384
- }
385
- }
386
-
387
- let mut chars = lit. char_indices ( ) . peekable ( ) ;
388
- while let Some ( ( i, c) ) = chars. next ( ) {
389
- match c {
390
- '\\' => {
391
- let ch = chars. peek ( ) . unwrap_or_else ( || {
392
- panic ! ( "{}" , error( i) )
393
- } ) . 1 ;
394
-
395
- if ch == '\n' {
396
- eat ( & mut chars) ;
397
- } else if ch == '\r' {
398
- chars. next ( ) ;
399
- let ch = chars. peek ( ) . unwrap_or_else ( || {
400
- panic ! ( "{}" , error( i) )
401
- } ) . 1 ;
402
-
403
- if ch != '\n' {
404
- panic ! ( "lexer accepted bare CR" ) ;
405
- }
406
- eat ( & mut chars) ;
407
- } else {
408
- // otherwise, a normal escape
409
- let ( c, n) = char_lit ( & lit[ i..] , diag) ;
410
- for _ in 0 ..n - 1 { // we don't need to move past the first \
411
- chars. next ( ) ;
412
- }
413
- res. push ( c) ;
414
- }
415
- } ,
416
- '\r' => {
417
- let ch = chars. peek ( ) . unwrap_or_else ( || {
418
- panic ! ( "{}" , error( i) )
419
- } ) . 1 ;
420
-
421
- if ch != '\n' {
422
- panic ! ( "lexer accepted bare CR" ) ;
423
- }
424
- chars. next ( ) ;
425
- res. push ( '\n' ) ;
426
- }
427
- c => res. push ( c) ,
428
- }
429
- }
430
-
431
- res. shrink_to_fit ( ) ; // probably not going to do anything, unless there was an escape.
432
- debug ! ( "parse_str_lit: returning {}" , res) ;
433
- res
434
- }
435
-
436
313
/// Parses a string representing a raw string literal into its final form. The
437
314
/// only operation this does is convert embedded CRLF into a single LF.
438
315
fn raw_str_lit ( lit : & str ) -> String {
@@ -475,9 +352,23 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
475
352
use ast:: LitKind ;
476
353
477
354
match lit {
478
- token:: Byte ( i) => ( true , Some ( LitKind :: Byte ( byte_lit ( & i. as_str ( ) ) . 0 ) ) ) ,
479
- token:: Char ( i) => ( true , Some ( LitKind :: Char ( char_lit ( & i. as_str ( ) , diag) . 0 ) ) ) ,
480
- token:: Err ( i) => ( true , Some ( LitKind :: Err ( i) ) ) ,
355
+ token:: Byte ( i) => {
356
+ let lit_kind = match unescape_byte ( & i. as_str ( ) ) {
357
+ Ok ( c) => LitKind :: Byte ( c) ,
358
+ Err ( ( _, EscapeError :: MoreThanOneChar ) ) => LitKind :: Err ( i) ,
359
+ Err ( _) => LitKind :: Byte ( 0 ) ,
360
+ } ;
361
+ ( true , Some ( lit_kind) )
362
+ } ,
363
+ token:: Char ( i) => {
364
+ let lit_kind = match unescape_char ( & i. as_str ( ) ) {
365
+ Ok ( c) => LitKind :: Char ( c) ,
366
+ Err ( ( _, EscapeError :: MoreThanOneChar ) ) => LitKind :: Err ( i) ,
367
+ Err ( _) => LitKind :: Char ( '\u{FFFD}' ) ,
368
+ } ;
369
+ ( true , Some ( lit_kind) )
370
+ } ,
371
+ token:: Err ( i) => ( true , Some ( LitKind :: Err ( i) ) ) ,
481
372
482
373
// There are some valid suffixes for integer and float literals,
483
374
// so all the handling is done internally.
@@ -491,7 +382,14 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
491
382
// string in the Token.
492
383
let s = & sym. as_str ( ) ;
493
384
if s. as_bytes ( ) . iter ( ) . any ( |& c| c == b'\\' || c == b'\r' ) {
494
- sym = Symbol :: intern ( & str_lit ( s, diag) ) ;
385
+ let mut buf = String :: with_capacity ( s. len ( ) ) ;
386
+ unescape_str ( s, & mut |_, unescaped_char| {
387
+ match unescaped_char {
388
+ Ok ( c) => buf. push ( c) ,
389
+ Err ( _) => buf. push ( '\u{FFFD}' ) ,
390
+ }
391
+ } ) ;
392
+ sym = Symbol :: intern ( & buf)
495
393
}
496
394
( true , Some ( LitKind :: Str ( sym, ast:: StrStyle :: Cooked ) ) )
497
395
}
@@ -504,7 +402,16 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
504
402
( true , Some ( LitKind :: Str ( sym, ast:: StrStyle :: Raw ( n) ) ) )
505
403
}
506
404
token:: ByteStr ( i) => {
507
- ( true , Some ( LitKind :: ByteStr ( byte_str_lit ( & i. as_str ( ) ) ) ) )
405
+ let s = & i. as_str ( ) ;
406
+ let mut buf = Vec :: with_capacity ( s. len ( ) ) ;
407
+ unescape_byte_str ( s, & mut |_, unescaped_byte| {
408
+ match unescaped_byte {
409
+ Ok ( c) => buf. push ( c) ,
410
+ Err ( _) => buf. push ( 0 ) ,
411
+ }
412
+ } ) ;
413
+ buf. shrink_to_fit ( ) ;
414
+ ( true , Some ( LitKind :: ByteStr ( Lrc :: new ( buf) ) ) )
508
415
}
509
416
token:: ByteStrRaw ( i, _) => {
510
417
( true , Some ( LitKind :: ByteStr ( Lrc :: new ( i. to_string ( ) . into_bytes ( ) ) ) ) )
@@ -559,95 +466,6 @@ fn float_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
559
466
filtered_float_lit ( Symbol :: intern ( s) , suffix, diag)
560
467
}
561
468
562
/// Parses a string representing a byte literal into its final form. Similar to `char_lit`.
///
/// `lit` is either exactly one byte, which is returned as is, or a slice that
/// starts with a backslash escape; anything after the escape is ignored. The
/// result pairs the decoded byte with the number of bytes of `lit` that were
/// consumed (1, 2 or 4).
///
/// Recognised escapes are `\"`, `\n`, `\r`, `\t`, `\\`, `\'`, `\0` and `\xHH`
/// with exactly two hexadecimal digits.
///
/// # Panics
///
/// Panics with a "lexer accepted invalid byte literal" message when `lit` is
/// empty, is longer than one byte without starting with a backslash, uses an
/// unknown escape letter, or has a `\x` escape that is not followed by two
/// hexadecimal digits.
fn byte_lit(lit: &str) -> (u8, usize) {
    let err = |step: u32| format!("lexer accepted invalid byte literal {} step {}", lit, step);

    let bytes = lit.as_bytes();
    if bytes.len() == 1 {
        return (bytes[0], 1);
    }

    assert!(bytes.first() == Some(&b'\\'), "{}", err(0));

    // The length is neither 0 nor 1 at this point, so the escape letter exists.
    match bytes[1] {
        b'"' => (b'"', 2),
        b'n' => (b'\n', 2),
        b'r' => (b'\r', 2),
        b't' => (b'\t', 2),
        b'\\' => (b'\\', 2),
        b'\'' => (b'\'', 2),
        b'0' => (b'\0', 2),
        b'x' => {
            // `from_str_radix` tolerates a leading '+', so the digits are validated
            // explicitly; `get` also covers a truncated escape.
            let digits = match lit.get(2..4) {
                Some(d) if d.bytes().all(|b| b.is_ascii_hexdigit()) => d,
                _ => panic!("{}", err(2)),
            };
            let value = u8::from_str_radix(digits, 16).unwrap_or_else(|_| panic!("{}", err(3)));
            (value, 4)
        }
        _ => panic!("{}", err(1)),
    }
}
593
-
594
- fn byte_str_lit ( lit : & str ) -> Lrc < Vec < u8 > > {
595
- let mut res = Vec :: with_capacity ( lit. len ( ) ) ;
596
-
597
- let error = |i| panic ! ( "lexer should have rejected {} at {}" , lit, i) ;
598
-
599
- /// Eat everything up to a non-whitespace.
600
- fn eat < I : Iterator < Item =( usize , u8 ) > > ( it : & mut iter:: Peekable < I > ) {
601
- loop {
602
- match it. peek ( ) . map ( |x| x. 1 ) {
603
- Some ( b' ' ) | Some ( b'\n' ) | Some ( b'\r' ) | Some ( b'\t' ) => {
604
- it. next ( ) ;
605
- } ,
606
- _ => { break ; }
607
- }
608
- }
609
- }
610
-
611
- // byte string literals *must* be ASCII, but the escapes don't have to be
612
- let mut chars = lit. bytes ( ) . enumerate ( ) . peekable ( ) ;
613
- loop {
614
- match chars. next ( ) {
615
- Some ( ( i, b'\\' ) ) => {
616
- match chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1 {
617
- b'\n' => eat ( & mut chars) ,
618
- b'\r' => {
619
- chars. next ( ) ;
620
- if chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1 != b'\n' {
621
- panic ! ( "lexer accepted bare CR" ) ;
622
- }
623
- eat ( & mut chars) ;
624
- }
625
- _ => {
626
- // otherwise, a normal escape
627
- let ( c, n) = byte_lit ( & lit[ i..] ) ;
628
- // we don't need to move past the first \
629
- for _ in 0 ..n - 1 {
630
- chars. next ( ) ;
631
- }
632
- res. push ( c) ;
633
- }
634
- }
635
- } ,
636
- Some ( ( i, b'\r' ) ) => {
637
- if chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1 != b'\n' {
638
- panic ! ( "lexer accepted bare CR" ) ;
639
- }
640
- chars. next ( ) ;
641
- res. push ( b'\n' ) ;
642
- }
643
- Some ( ( _, c) ) => res. push ( c) ,
644
- None => break ,
645
- }
646
- }
647
-
648
- Lrc :: new ( res)
649
- }
650
-
651
469
fn integer_lit ( s : & str , suffix : Option < Symbol > , diag : Option < ( Span , & Handler ) > )
652
470
-> Option < ast:: LitKind > {
653
471
// s can only be ascii, byte indexing is fine
0 commit comments