@@ -57,11 +57,10 @@ impl Token {
57
57
/// Enum representing common lexeme types.
58
58
#[ derive( Clone , Copy , Debug , PartialEq , Eq ) ]
59
59
pub enum TokenKind {
60
- // Multi-char tokens:
61
- /// "// comment"
60
+ /// A line comment, e.g. `// comment`.
62
61
LineComment { doc_style : Option < DocStyle > } ,
63
62
64
- /// `/* block comment */`
63
+ /// A block comment, e.g. `/* block comment */`.
65
64
///
66
65
/// Block comments can be recursive, so a sequence like `/* /* */`
67
66
/// will not be considered terminated and will result in a parsing error.
@@ -70,18 +69,17 @@ pub enum TokenKind {
70
69
/// Any whitespace character sequence.
71
70
Whitespace ,
72
71
73
- /// "ident" or "continue"
74
- ///
75
- /// At this step, keywords are also considered identifiers.
72
+ /// An identifier or keyword, e.g. `ident` or `continue`.
76
73
Ident ,
77
74
78
- /// Like the above, but containing invalid unicode codepoints .
75
+ /// An identifier that is invalid because it contains emoji.
79
76
InvalidIdent ,
80
77
81
- /// "r#ident"
78
+ /// A raw identifier, e.g. `r#ident`.
82
79
RawIdent ,
83
80
84
- /// An unknown prefix, like `foo#`, `foo'`, `foo"`.
81
+ /// An unknown literal prefix, like `foo#`, `foo'`, `foo"`. Excludes
82
+ /// literal prefixes that contain emoji, which are considered "invalid".
85
83
///
86
84
/// Note that only the
87
85
/// prefix (`foo`) is included in the token, not the separator (which is
@@ -93,87 +91,83 @@ pub enum TokenKind {
93
91
94
92
/// An unknown prefix in a lifetime, like `'foo#`.
95
93
///
96
- /// Note that like above , only the `'` and prefix are included in the token
94
+ /// Like `UnknownPrefix`, only the `'` and prefix are included in the token
97
95
/// and not the separator.
98
96
UnknownPrefixLifetime ,
99
97
100
- /// `'r#lt`, which in edition < 2021 is split into several tokens: `'r # lt`.
98
+ /// A raw lifetime, e.g. `'r#foo`. In edition < 2021 it will be split into
99
+ /// several tokens: `'r` and `#` and `foo`.
101
100
RawLifetime ,
102
101
103
- /// Similar to the above, but *always* an error on every edition. This is used
104
- /// for emoji identifier recovery, as those are not meant to be ever accepted.
105
- InvalidPrefix ,
106
-
107
102
/// Guarded string literal prefix: `#"` or `##`.
108
103
///
109
104
/// Used for reserving "guarded strings" (RFC 3598) in edition 2024.
110
105
/// Split into the component tokens on older editions.
111
106
GuardedStrPrefix ,
112
107
113
- /// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
108
+ /// Literals, e.g. `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
114
109
/// suffix, but may be present here on string and float literals. Users of
115
110
/// this type will need to check for and reject that case.
116
111
///
117
112
/// See [LiteralKind] for more details.
118
113
Literal { kind : LiteralKind , suffix_start : u32 } ,
119
114
120
- /// "'a"
115
+ /// A lifetime, e.g. `'a`.
121
116
Lifetime { starts_with_number : bool } ,
122
117
123
- // One-char tokens:
124
- /// ";"
118
+ /// `;`
125
119
Semi ,
126
- /// ","
120
+ /// `,`
127
121
Comma ,
128
- /// "."
122
+ /// `.`
129
123
Dot ,
130
- /// "("
124
+ /// `(`
131
125
OpenParen ,
132
- /// ")"
126
+ /// `)`
133
127
CloseParen ,
134
- /// "{"
128
+ /// `{`
135
129
OpenBrace ,
136
- /// "}"
130
+ /// `}`
137
131
CloseBrace ,
138
- /// "["
132
+ /// `[`
139
133
OpenBracket ,
140
- /// "]"
134
+ /// `]`
141
135
CloseBracket ,
142
- /// "@"
136
+ /// `@`
143
137
At ,
144
- /// "#"
138
+ /// `#`
145
139
Pound ,
146
- /// "~"
140
+ /// `~`
147
141
Tilde ,
148
- /// "?"
142
+ /// `?`
149
143
Question ,
150
- /// ":"
144
+ /// `:`
151
145
Colon ,
152
- /// "$"
146
+ /// `$`
153
147
Dollar ,
154
- /// "="
148
+ /// `=`
155
149
Eq ,
156
- /// "!"
150
+ /// `!`
157
151
Bang ,
158
- /// "<"
152
+ /// `<`
159
153
Lt ,
160
- /// ">"
154
+ /// `>`
161
155
Gt ,
162
- /// "-"
156
+ /// `-`
163
157
Minus ,
164
- /// "&"
158
+ /// `&`
165
159
And ,
166
- /// "|"
160
+ /// `|`
167
161
Or ,
168
- /// "+"
162
+ /// `+`
169
163
Plus ,
170
- /// "*"
164
+ /// `*`
171
165
Star ,
172
- /// "/"
166
+ /// `/`
173
167
Slash ,
174
- /// "^"
168
+ /// `^`
175
169
Caret ,
176
- /// "%"
170
+ /// `%`
177
171
Percent ,
178
172
179
173
/// Unknown token, not expected by the lexer, e.g. "№"
@@ -468,7 +462,7 @@ impl Cursor<'_> {
468
462
Literal { kind, suffix_start }
469
463
}
470
464
// Identifier starting with an emoji. Only lexed for graceful error recovery.
471
- c if !c. is_ascii ( ) && c. is_emoji_char ( ) => self . fake_ident_or_unknown_prefix ( ) ,
465
+ c if !c. is_ascii ( ) && c. is_emoji_char ( ) => self . invalid_ident ( ) ,
472
466
_ => Unknown ,
473
467
} ;
474
468
let res = Token :: new ( token_kind, self . pos_within_token ( ) ) ;
@@ -552,24 +546,22 @@ impl Cursor<'_> {
552
546
// we see a prefix here, it is definitely an unknown prefix.
553
547
match self . first ( ) {
554
548
'#' | '"' | '\'' => UnknownPrefix ,
555
- c if !c. is_ascii ( ) && c. is_emoji_char ( ) => self . fake_ident_or_unknown_prefix ( ) ,
549
+ c if !c. is_ascii ( ) && c. is_emoji_char ( ) => self . invalid_ident ( ) ,
556
550
_ => Ident ,
557
551
}
558
552
}
559
553
560
- fn fake_ident_or_unknown_prefix ( & mut self ) -> TokenKind {
554
+ fn invalid_ident ( & mut self ) -> TokenKind {
561
555
// The first character is already consumed; eat the rest of the emoji-containing identifier.
562
556
self . eat_while ( |c| {
563
- unicode_xid:: UnicodeXID :: is_xid_continue ( c)
564
- || ( !c. is_ascii ( ) && c. is_emoji_char ( ) )
565
- || c == '\u{200d}'
557
+ const ZERO_WIDTH_JOINER : char = '\u{200d}' ;
558
+ is_id_continue ( c) || ( !c. is_ascii ( ) && c. is_emoji_char ( ) ) || c == ZERO_WIDTH_JOINER
566
559
} ) ;
567
- // Known prefixes must have been handled earlier. So if
568
- // we see a prefix here, it is definitely an unknown prefix.
569
- match self . first ( ) {
570
- '#' | '"' | '\'' => InvalidPrefix ,
571
- _ => InvalidIdent ,
572
- }
560
+ // An invalid identifier followed by '#' or '"' or '\'' could be
561
+ // interpreted as an invalid literal prefix. We don't bother doing that
562
+ // because the treatment of invalid identifiers and invalid prefixes
563
+ // would be the same.
564
+ InvalidIdent
573
565
}
574
566
575
567
fn c_or_byte_string (
0 commit comments