Skip to content

Commit 43ee522

Browse files
authored
Unrolled build for rust-lang#133201
Rollup merge of rust-lang#133201 - nnethercote:rm-TokenKind-InvalidPrefix, r=compiler-errors: Remove `TokenKind::InvalidPrefix`. It's not needed. Best reviewed one commit at a time. r? `@estebank`
2 parents ee612c4 + e9a0c3c commit 43ee522

File tree

4 files changed

+55
-65
lines changed

4 files changed

+55
-65
lines changed

compiler/rustc_lexer/src/lib.rs

+49-57
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,10 @@ impl Token {
5757
/// Enum representing common lexeme types.
5858
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
5959
pub enum TokenKind {
60-
// Multi-char tokens:
61-
/// "// comment"
60+
/// A line comment, e.g. `// comment`.
6261
LineComment { doc_style: Option<DocStyle> },
6362

64-
/// `/* block comment */`
63+
/// A block comment, e.g. `/* block comment */`.
6564
///
6665
/// Block comments can be recursive, so a sequence like `/* /* */`
6766
/// will not be considered terminated and will result in a parsing error.
@@ -70,18 +69,17 @@ pub enum TokenKind {
7069
/// Any whitespace character sequence.
7170
Whitespace,
7271

73-
/// "ident" or "continue"
74-
///
75-
/// At this step, keywords are also considered identifiers.
72+
/// An identifier or keyword, e.g. `ident` or `continue`.
7673
Ident,
7774

78-
/// Like the above, but containing invalid unicode codepoints.
75+
/// An identifier that is invalid because it contains emoji.
7976
InvalidIdent,
8077

81-
/// "r#ident"
78+
/// A raw identifier, e.g. `r#ident`.
8279
RawIdent,
8380

84-
/// An unknown prefix, like `foo#`, `foo'`, `foo"`.
81+
/// An unknown literal prefix, like `foo#`, `foo'`, `foo"`. Excludes
82+
/// literal prefixes that contain emoji, which are considered "invalid".
8583
///
8684
/// Note that only the
8785
/// prefix (`foo`) is included in the token, not the separator (which is
@@ -93,87 +91,83 @@ pub enum TokenKind {
9391

9492
/// An unknown prefix in a lifetime, like `'foo#`.
9593
///
96-
/// Note that like above, only the `'` and prefix are included in the token
94+
/// Like `UnknownPrefix`, only the `'` and prefix are included in the token
9795
/// and not the separator.
9896
UnknownPrefixLifetime,
9997

100-
/// `'r#lt`, which in edition < 2021 is split into several tokens: `'r # lt`.
98+
/// A raw lifetime, e.g. `'r#foo`. In edition < 2021 it will be split into
99+
/// several tokens: `'r` and `#` and `foo`.
101100
RawLifetime,
102101

103-
/// Similar to the above, but *always* an error on every edition. This is used
104-
/// for emoji identifier recovery, as those are not meant to be ever accepted.
105-
InvalidPrefix,
106-
107102
/// Guarded string literal prefix: `#"` or `##`.
108103
///
109104
/// Used for reserving "guarded strings" (RFC 3598) in edition 2024.
110105
/// Split into the component tokens on older editions.
111106
GuardedStrPrefix,
112107

113-
/// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
108+
/// Literals, e.g. `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
114109
/// suffix, but may be present here on string and float literals. Users of
115110
/// this type will need to check for and reject that case.
116111
///
117112
/// See [LiteralKind] for more details.
118113
Literal { kind: LiteralKind, suffix_start: u32 },
119114

120-
/// "'a"
115+
/// A lifetime, e.g. `'a`.
121116
Lifetime { starts_with_number: bool },
122117

123-
// One-char tokens:
124-
/// ";"
118+
/// `;`
125119
Semi,
126-
/// ","
120+
/// `,`
127121
Comma,
128-
/// "."
122+
/// `.`
129123
Dot,
130-
/// "("
124+
/// `(`
131125
OpenParen,
132-
/// ")"
126+
/// `)`
133127
CloseParen,
134-
/// "{"
128+
/// `{`
135129
OpenBrace,
136-
/// "}"
130+
/// `}`
137131
CloseBrace,
138-
/// "["
132+
/// `[`
139133
OpenBracket,
140-
/// "]"
134+
/// `]`
141135
CloseBracket,
142-
/// "@"
136+
/// `@`
143137
At,
144-
/// "#"
138+
/// `#`
145139
Pound,
146-
/// "~"
140+
/// `~`
147141
Tilde,
148-
/// "?"
142+
/// `?`
149143
Question,
150-
/// ":"
144+
/// `:`
151145
Colon,
152-
/// "$"
146+
/// `$`
153147
Dollar,
154-
/// "="
148+
/// `=`
155149
Eq,
156-
/// "!"
150+
/// `!`
157151
Bang,
158-
/// "<"
152+
/// `<`
159153
Lt,
160-
/// ">"
154+
/// `>`
161155
Gt,
162-
/// "-"
156+
/// `-`
163157
Minus,
164-
/// "&"
158+
/// `&`
165159
And,
166-
/// "|"
160+
/// `|`
167161
Or,
168-
/// "+"
162+
/// `+`
169163
Plus,
170-
/// "*"
164+
/// `*`
171165
Star,
172-
/// "/"
166+
/// `/`
173167
Slash,
174-
/// "^"
168+
/// `^`
175169
Caret,
176-
/// "%"
170+
/// `%`
177171
Percent,
178172

179173
/// Unknown token, not expected by the lexer, e.g. `№`.
@@ -468,7 +462,7 @@ impl Cursor<'_> {
468462
Literal { kind, suffix_start }
469463
}
470464
// Identifier starting with an emoji. Only lexed for graceful error recovery.
471-
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
465+
c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
472466
_ => Unknown,
473467
};
474468
let res = Token::new(token_kind, self.pos_within_token());
@@ -552,24 +546,22 @@ impl Cursor<'_> {
552546
// we see a prefix here, it is definitely an unknown prefix.
553547
match self.first() {
554548
'#' | '"' | '\'' => UnknownPrefix,
555-
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
549+
c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
556550
_ => Ident,
557551
}
558552
}
559553

560-
fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
554+
fn invalid_ident(&mut self) -> TokenKind {
561555
// Start is already eaten, eat the rest of identifier.
562556
self.eat_while(|c| {
563-
unicode_xid::UnicodeXID::is_xid_continue(c)
564-
|| (!c.is_ascii() && c.is_emoji_char())
565-
|| c == '\u{200d}'
557+
const ZERO_WIDTH_JOINER: char = '\u{200d}';
558+
is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER
566559
});
567-
// Known prefixes must have been handled earlier. So if
568-
// we see a prefix here, it is definitely an unknown prefix.
569-
match self.first() {
570-
'#' | '"' | '\'' => InvalidPrefix,
571-
_ => InvalidIdent,
572-
}
560+
// An invalid identifier followed by '#' or '"' or '\'' could be
561+
// interpreted as an invalid literal prefix. We don't bother doing that
562+
// because the treatment of invalid identifiers and invalid prefixes
563+
// would be the same.
564+
InvalidIdent
573565
}
574566

575567
fn c_or_byte_string(

compiler/rustc_parse/src/lexer/mod.rs

+2-3
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
213213
let ident = Symbol::intern(lifetime_name);
214214
token::Lifetime(ident, IdentIsRaw::No)
215215
}
216-
rustc_lexer::TokenKind::InvalidIdent | rustc_lexer::TokenKind::InvalidPrefix
216+
rustc_lexer::TokenKind::InvalidIdent
217217
// Do not recover an identifier with emoji if the codepoint is a confusable
218218
// with a recoverable substitution token, like `➖`.
219219
if !UNICODE_ARRAY.iter().any(|&(c, _, _)| {
@@ -359,8 +359,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
359359
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
360360

361361
rustc_lexer::TokenKind::Unknown
362-
| rustc_lexer::TokenKind::InvalidIdent
363-
| rustc_lexer::TokenKind::InvalidPrefix => {
362+
| rustc_lexer::TokenKind::InvalidIdent => {
364363
// Don't emit diagnostics for sequences of the same invalid token
365364
if swallow_next_invalid > 0 {
366365
swallow_next_invalid -= 1;

src/librustdoc/html/highlight.rs

+3-4
Original file line numberDiff line numberDiff line change
@@ -861,10 +861,9 @@ impl<'src> Classifier<'src> {
861861
},
862862
Some(c) => c,
863863
},
864-
TokenKind::RawIdent
865-
| TokenKind::UnknownPrefix
866-
| TokenKind::InvalidPrefix
867-
| TokenKind::InvalidIdent => Class::Ident(self.new_span(before, text)),
864+
TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
865+
Class::Ident(self.new_span(before, text))
866+
}
868867
TokenKind::Lifetime { .. }
869868
| TokenKind::RawLifetime
870869
| TokenKind::UnknownPrefixLifetime => Class::Lifetime,

src/tools/rust-analyzer/crates/parser/src/lexed_str.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ impl<'a> Converter<'a> {
183183
rustc_lexer::TokenKind::Ident => {
184184
SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT)
185185
}
186-
rustc_lexer::TokenKind::InvalidPrefix | rustc_lexer::TokenKind::InvalidIdent => {
186+
rustc_lexer::TokenKind::InvalidIdent => {
187187
err = "Ident contains invalid characters";
188188
IDENT
189189
}

0 commit comments

Comments
 (0)