@@ -35,8 +35,8 @@ pub use unicode_xid::UNICODE_VERSION as UNICODE_XID_VERSION;
35
35
36
36
use self :: LiteralKind :: * ;
37
37
use self :: TokenKind :: * ;
38
- pub use crate :: cursor:: Cursor ;
39
38
use crate :: cursor:: EOF_CHAR ;
39
+ pub use crate :: cursor:: { Cursor , FrontmatterAllowed } ;
40
40
41
41
/// Parsed token.
42
42
/// It doesn't contain information about data that has been parsed,
@@ -57,17 +57,27 @@ impl Token {
57
57
#[ derive( Clone , Copy , Debug , PartialEq , Eq ) ]
58
58
pub enum TokenKind {
59
59
/// A line comment, e.g. `// comment`.
60
- LineComment { doc_style : Option < DocStyle > } ,
60
+ LineComment {
61
+ doc_style : Option < DocStyle > ,
62
+ } ,
61
63
62
64
/// A block comment, e.g. `/* block comment */`.
63
65
///
64
66
/// Block comments can be recursive, so a sequence like `/* /* */`
65
67
/// will not be considered terminated and will result in a parsing error.
66
- BlockComment { doc_style : Option < DocStyle > , terminated : bool } ,
68
+ BlockComment {
69
+ doc_style : Option < DocStyle > ,
70
+ terminated : bool ,
71
+ } ,
67
72
68
73
/// Any whitespace character sequence.
69
74
Whitespace ,
70
75
76
+ Frontmatter {
77
+ has_invalid_preceding_whitespace : bool ,
78
+ invalid_infostring : bool ,
79
+ } ,
80
+
71
81
/// An identifier or keyword, e.g. `ident` or `continue`.
72
82
Ident ,
73
83
@@ -109,10 +119,15 @@ pub enum TokenKind {
109
119
/// this type will need to check for and reject that case.
110
120
///
111
121
/// See [LiteralKind] for more details.
112
- Literal { kind : LiteralKind , suffix_start : u32 } ,
122
+ Literal {
123
+ kind : LiteralKind ,
124
+ suffix_start : u32 ,
125
+ } ,
113
126
114
127
/// A lifetime, e.g. `'a`.
115
- Lifetime { starts_with_number : bool } ,
128
+ Lifetime {
129
+ starts_with_number : bool ,
130
+ } ,
116
131
117
132
/// `;`
118
133
Semi ,
@@ -280,7 +295,7 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
280
295
#[ inline]
281
296
pub fn validate_raw_str ( input : & str , prefix_len : u32 ) -> Result < ( ) , RawStrError > {
282
297
debug_assert ! ( !input. is_empty( ) ) ;
283
- let mut cursor = Cursor :: new ( input) ;
298
+ let mut cursor = Cursor :: new ( input, FrontmatterAllowed :: No ) ;
284
299
// Move past the leading `r` or `br`.
285
300
for _ in 0 ..prefix_len {
286
301
cursor. bump ( ) . unwrap ( ) ;
@@ -290,7 +305,7 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
290
305
291
306
/// Creates an iterator that produces tokens from the input string.
292
307
pub fn tokenize ( input : & str ) -> impl Iterator < Item = Token > {
293
- let mut cursor = Cursor :: new ( input) ;
308
+ let mut cursor = Cursor :: new ( input, FrontmatterAllowed :: No ) ;
294
309
std:: iter:: from_fn ( move || {
295
310
let token = cursor. advance_token ( ) ;
296
311
if token. kind != TokenKind :: Eof { Some ( token) } else { None }
@@ -361,7 +376,34 @@ impl Cursor<'_> {
361
376
Some ( c) => c,
362
377
None => return Token :: new ( TokenKind :: Eof , 0 ) ,
363
378
} ;
379
+
364
380
let token_kind = match first_char {
381
+ c if matches ! ( self . frontmatter_allowed, FrontmatterAllowed :: Yes )
382
+ && is_whitespace ( c) =>
383
+ {
384
+ let mut last = first_char;
385
+ while is_whitespace ( self . first ( ) ) {
386
+ let Some ( c) = self . bump ( ) else {
387
+ break ;
388
+ } ;
389
+ last = c;
390
+ }
391
+ // invalid frontmatter opening as whitespace preceding it isn't newline.
392
+ // combine the whitespace and the frontmatter to a single token as we shall
393
+ // error later.
394
+ if last != '\n' && self . as_str ( ) . starts_with ( "---" ) {
395
+ self . bump ( ) ;
396
+ self . frontmatter ( true )
397
+ } else {
398
+ Whitespace
399
+ }
400
+ }
401
+ '-' if matches ! ( self . frontmatter_allowed, FrontmatterAllowed :: Yes )
402
+ && self . as_str ( ) . starts_with ( "--" ) =>
403
+ {
404
+ // happy path
405
+ self . frontmatter ( false )
406
+ }
365
407
// Slash, comment or block comment.
366
408
'/' => match self . first ( ) {
367
409
'/' => self . line_comment ( ) ,
@@ -464,11 +506,107 @@ impl Cursor<'_> {
464
506
c if !c. is_ascii ( ) && c. is_emoji_char ( ) => self . invalid_ident ( ) ,
465
507
_ => Unknown ,
466
508
} ;
509
+ if matches ! ( self . frontmatter_allowed, FrontmatterAllowed :: Yes )
510
+ && !matches ! ( token_kind, Whitespace )
511
+ {
512
+ // stop allowing frontmatters after first non-whitespace token
513
+ self . frontmatter_allowed = FrontmatterAllowed :: No ;
514
+ }
467
515
let res = Token :: new ( token_kind, self . pos_within_token ( ) ) ;
468
516
self . reset_pos_within_token ( ) ;
469
517
res
470
518
}
471
519
520
+ /// Given that one `-` was eaten, eat the rest of the frontmatter.
521
+ fn frontmatter ( & mut self , has_invalid_preceding_whitespace : bool ) -> TokenKind {
522
+ debug_assert_eq ! ( '-' , self . prev( ) ) ;
523
+
524
+ let pos = self . pos_within_token ( ) ;
525
+ self . eat_while ( |c| c == '-' ) ;
526
+
527
+ // one `-` is eaten by the caller.
528
+ let length_opening = self . pos_within_token ( ) - pos + 1 ;
529
+
530
+ // must be ensured by the caller
531
+ debug_assert ! ( length_opening >= 3 ) ;
532
+
533
+ // copied from `eat_identifier`, but allows `.` in infostring to allow something like
534
+ // `---Cargo.toml` as a valid opener
535
+ if is_id_start ( self . first ( ) ) {
536
+ self . bump ( ) ;
537
+ self . eat_while ( |c| is_id_continue ( c) || c == '.' ) ;
538
+ }
539
+
540
+ self . eat_while ( |ch| ch != '\n' && is_whitespace ( ch) ) ;
541
+ let invalid_infostring = self . first ( ) != '\n' ;
542
+
543
+ let mut s = self . as_str ( ) ;
544
+ let mut found = false ;
545
+ while let Some ( closing) = s. find ( & "-" . repeat ( length_opening as usize ) ) {
546
+ let preceding_chars_start = s[ ..closing] . rfind ( "\n " ) . map_or ( 0 , |i| i + 1 ) ;
547
+ if s[ preceding_chars_start..closing] . chars ( ) . all ( is_whitespace) {
548
+ // candidate found
549
+ self . bump_bytes ( closing) ;
550
+ // in case like
551
+ // ---cargo
552
+ // --- blahblah
553
+ // or
554
+ // ---cargo
555
+ // ----
556
+ // combine those stuff into this frontmatter token such that it gets detected later.
557
+ self . eat_until ( b'\n' ) ;
558
+ found = true ;
559
+ break ;
560
+ } else {
561
+ s = & s[ closing + length_opening as usize ..] ;
562
+ }
563
+ }
564
+
565
+ if !found {
566
+ // recovery strategy: a closing statement might have precending whitespace/newline
567
+ // but not have enough dashes to properly close. In this case, we eat until there,
568
+ // and report a mismatch in the parser.
569
+ let mut rest = self . as_str ( ) ;
570
+ // We can look for a shorter closing (starting with four dashes but closing with three)
571
+ // and other indications that Rust has started and the infostring has ended.
572
+ let mut potential_closing = rest
573
+ . find ( "\n ---" )
574
+ // n.b. only in the case where there are dashes, we move the index to the line where
575
+ // the dashes start as we eat to include that line. For other cases those are Rust code
576
+ // and not included in the frontmatter.
577
+ . map ( |x| x + 1 )
578
+ . or_else ( || rest. find ( "\n use" ) )
579
+ . or_else ( || rest. find ( "\n //!" ) )
580
+ . or_else ( || rest. find ( "\n #![" ) ) ;
581
+
582
+ if potential_closing. is_none ( ) {
583
+ // a less fortunate recovery if all else fails which finds any dashes preceded by whitespace
584
+ // on a standalone line. Might be wrong.
585
+ while let Some ( closing) = rest. find ( "---" ) {
586
+ let preceding_chars_start = rest[ ..closing] . rfind ( "\n " ) . map_or ( 0 , |i| i + 1 ) ;
587
+ if rest[ preceding_chars_start..closing] . chars ( ) . all ( is_whitespace) {
588
+ // candidate found
589
+ potential_closing = Some ( closing) ;
590
+ break ;
591
+ } else {
592
+ rest = & rest[ closing + 3 ..] ;
593
+ }
594
+ }
595
+ }
596
+
597
+ if let Some ( potential_closing) = potential_closing {
598
+ // bump to the potential closing, and eat everything on that line.
599
+ self . bump_bytes ( potential_closing) ;
600
+ self . eat_until ( b'\n' ) ;
601
+ } else {
602
+ // eat everything. this will get reported as an unclosed frontmatter.
603
+ self . eat_while ( |_| true ) ;
604
+ }
605
+ }
606
+
607
+ Frontmatter { has_invalid_preceding_whitespace, invalid_infostring }
608
+ }
609
+
472
610
fn line_comment ( & mut self ) -> TokenKind {
473
611
debug_assert ! ( self . prev( ) == '/' && self . first( ) == '/' ) ;
474
612
self . bump ( ) ;
0 commit comments