Skip to content

Commit f5d463a

Browse files
committed
Report text_direction_codepoint_in_literal when parsing
- The lint is now reported in code that gets removed/modified/duplicated by macro expansion.
- Spans are more accurate.
- Fixes #140281.
1 parent 414482f commit f5d463a

File tree

11 files changed

+293
-159
lines changed

11 files changed

+293
-159
lines changed

compiler/rustc_lint/src/early/diagnostics.rs

+21
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,27 @@ pub(super) fn decorate_lint(
187187
lints::ReservedMultihash { suggestion }.decorate_lint(diag);
188188
}
189189
}
190+
BuiltinLintDiag::HiddenUnicodeCodepoints {
191+
label,
192+
count,
193+
span_label,
194+
labels,
195+
escape,
196+
spans,
197+
} => {
198+
lints::HiddenUnicodeCodepointsDiag {
199+
label: &label,
200+
count,
201+
span_label,
202+
labels: labels.map(|spans| lints::HiddenUnicodeCodepointsDiagLabels { spans }),
203+
sub: if escape {
204+
lints::HiddenUnicodeCodepointsDiagSub::Escape { spans }
205+
} else {
206+
lints::HiddenUnicodeCodepointsDiagSub::NoEscape { spans }
207+
},
208+
}
209+
.decorate_lint(diag);
210+
}
190211
BuiltinLintDiag::UnusedBuiltinAttribute { attr_name, macro_name, invoc_span } => {
191212
lints::UnusedBuiltinAttribute { invoc_span, attr_name, macro_name }.decorate_lint(diag);
192213
}

compiler/rustc_lint/src/hidden_unicode_codepoints.rs

-136
This file was deleted.

compiler/rustc_lint/src/lib.rs

-3
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ mod errors;
4848
mod expect;
4949
mod for_loops_over_fallibles;
5050
mod foreign_modules;
51-
pub mod hidden_unicode_codepoints;
5251
mod if_let_rescope;
5352
mod impl_trait_overcaptures;
5453
mod internal;
@@ -91,7 +90,6 @@ use deref_into_dyn_supertrait::*;
9190
use drop_forget_useless::*;
9291
use enum_intrinsics_non_enums::EnumIntrinsicsNonEnums;
9392
use for_loops_over_fallibles::*;
94-
use hidden_unicode_codepoints::*;
9593
use if_let_rescope::IfLetRescope;
9694
use impl_trait_overcaptures::ImplTraitOvercaptures;
9795
use internal::*;
@@ -174,7 +172,6 @@ early_lint_methods!(
174172
DeprecatedAttr: DeprecatedAttr::default(),
175173
WhileTrue: WhileTrue,
176174
NonAsciiIdents: NonAsciiIdents,
177-
HiddenUnicodeCodepoints: HiddenUnicodeCodepoints,
178175
IncompleteInternalFeatures: IncompleteInternalFeatures,
179176
RedundantSemicolons: RedundantSemicolons,
180177
UnusedDocComment: UnusedDocComment,

compiler/rustc_lint_defs/src/builtin.rs

+34
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ declare_lint_pass! {
103103
TAIL_EXPR_DROP_ORDER,
104104
TEST_UNSTABLE_LINT,
105105
TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
106+
TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
106107
TRIVIAL_CASTS,
107108
TRIVIAL_NUMERIC_CASTS,
108109
TYVAR_BEHIND_RAW_POINTER,
@@ -3811,6 +3812,39 @@ declare_lint! {
38113812
"invisible directionality-changing codepoints in comment"
38123813
}
38133814

3815+
declare_lint! {
3816+
// NOTE(review): presumably needed because the example in the docs below embeds a raw
// text-direction codepoint, which this very lint would otherwise flag — confirm.
#[allow(text_direction_codepoint_in_literal)]
3817+
/// The `text_direction_codepoint_in_literal` lint detects Unicode codepoints that change the
3818+
/// visual representation of text on screen in a way that does not correspond to their
3819+
/// in-memory representation.
3820+
///
3821+
/// ### Explanation
3822+
///
3823+
/// The Unicode characters `\u{202A}`, `\u{202B}`, `\u{202D}`, `\u{202E}`, `\u{2066}`,
3824+
/// `\u{2067}`, `\u{2068}`, `\u{202C}` and `\u{2069}` make the flow of text on screen change
3825+
/// its direction on software that supports these codepoints. This makes the text "abc" display
3826+
/// as "cba" on screen. By leveraging software that supports these, people can write specially
3827+
/// crafted literals that make the surrounding code seem like it's performing one action, when
3828+
/// in reality it is performing another. Because of this, we proactively lint against their
3829+
/// presence to avoid surprises.
3830+
///
3831+
/// ### Example
3832+
///
3833+
/// ```rust,compile_fail
3834+
/// #![deny(text_direction_codepoint_in_literal)]
3835+
/// fn main() {
3836+
/// println!("{:?}", '‮');
3837+
/// }
3838+
/// ```
3839+
///
3840+
/// {{produces}}
3841+
///
3842+
pub TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
3843+
Deny,
3844+
"detect special Unicode codepoints that affect the visual representation of text on screen, \
3845+
changing the direction in which text flows",
3846+
}
3847+
38143848
declare_lint! {
38153849
/// The `duplicate_macro_attributes` lint detects when a `#[test]`-like built-in macro
38163850
/// attribute is duplicated on an item. This lint may trigger on `bench`, `cfg_eval`, `test`

compiler/rustc_lint_defs/src/lib.rs

+8
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,14 @@ pub enum BuiltinLintDiag {
698698
is_string: bool,
699699
suggestion: Span,
700700
},
701+
HiddenUnicodeCodepoints {
702+
label: String,
703+
count: usize,
704+
span_label: Span,
705+
labels: Option<Vec<(char, Span)>>,
706+
escape: bool,
707+
spans: Vec<(char, Span)>,
708+
},
701709
TrailingMacro(bool, Ident),
702710
BreakWithLabelAndLoop(Span),
703711
UnicodeTextFlow(Span, String),

compiler/rustc_parse/src/lexer/mod.rs

+87-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use diagnostics::make_unclosed_delims_error;
44
use rustc_ast::ast::{self, AttrStyle};
55
use rustc_ast::token::{self, CommentKind, Delimiter, IdentIsRaw, Token, TokenKind};
66
use rustc_ast::tokenstream::TokenStream;
7-
use rustc_ast::util::unicode::contains_text_flow_control_chars;
7+
use rustc_ast::util::unicode::{TEXT_FLOW_CONTROL_CHARS, contains_text_flow_control_chars};
88
use rustc_errors::codes::*;
99
use rustc_errors::{Applicability, Diag, DiagCtxtHandle, StashKey};
1010
use rustc_lexer::{
@@ -14,7 +14,7 @@ use rustc_literal_escaper::{EscapeError, Mode, unescape_mixed, unescape_unicode}
1414
use rustc_session::lint::BuiltinLintDiag;
1515
use rustc_session::lint::builtin::{
1616
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
17-
TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
17+
TEXT_DIRECTION_CODEPOINT_IN_COMMENT, TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
1818
};
1919
use rustc_session::parse::ParseSess;
2020
use rustc_span::{BytePos, Pos, Span, Symbol, sym};
@@ -174,6 +174,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
174174
// Opening delimiter of the length 3 is not included into the symbol.
175175
let content_start = start + BytePos(3);
176176
let content = self.str_from(content_start);
177+
self.lint_doc_comment_unicode_text_flow(start, content);
177178
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
178179
}
179180
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
@@ -193,6 +194,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
193194
let content_start = start + BytePos(3);
194195
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
195196
let content = self.str_from_to(content_start, content_end);
197+
self.lint_doc_comment_unicode_text_flow(start, content);
196198
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
197199
}
198200
rustc_lexer::TokenKind::Frontmatter { has_invalid_preceding_whitespace, invalid_infostring } => {
@@ -287,6 +289,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
287289
} else {
288290
None
289291
};
292+
self.lint_literal_unicode_text_flow(symbol, kind, self.mk_sp(start, self.pos), "literal");
290293
token::Literal(token::Lit { kind, symbol, suffix })
291294
}
292295
rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
@@ -481,6 +484,88 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
481484
}
482485
}
483486

487+
/// Reports text-flow control characters found in the body of a doc comment.
///
/// `start` is the position where the doc comment token begins and `content` is the
/// comment text. Nothing is reported when `content` contains no text-flow control
/// characters. Otherwise the whole comment, from `start` to the current lexer
/// position, is handed to `report_text_direction_codepoint` with the label
/// "doc comment", zero padding, and per-character labels disabled.
fn lint_doc_comment_unicode_text_flow(&mut self, start: BytePos, content: &str) {
488+
if contains_text_flow_control_chars(content) {
489+
self.report_text_direction_codepoint(
490+
content,
491+
self.mk_sp(start, self.pos),
492+
// Padding of 0: offsets into `content` are applied directly to the start of the span.
// NOTE(review): both visible call sites slice `content` from `start + BytePos(3)`, so
// per-character spans derived with this padding are measured from `start` rather than
// from the first byte of `content`. Inner-span labels are disabled below; confirm the
// per-character spans are not displayed anywhere else.
0,
493+
false,
494+
"doc comment",
495+
);
496+
}
497+
}
498+
499+
/// Reports text-flow control characters found inside a literal token.
///
/// `text` is the literal's symbol, `lit_kind` its kind, `span` the span passed by the
/// caller for the literal, and `label` the description forwarded to the diagnostic.
///
/// Returns without reporting when `text` has no text-flow control characters or when
/// the literal is `LitKind::Err`. For string-like and char literals, `padding` is the
/// byte length of the opening prefix/delimiter, so that offsets into `text` can be
/// translated into positions inside `span`; for those kinds per-character labels are
/// enabled. Any other literal kind is reported with zero padding and without
/// per-character labels.
fn lint_literal_unicode_text_flow(
500+
&mut self,
501+
text: Symbol,
502+
lit_kind: token::LitKind,
503+
span: Span,
504+
label: &'static str,
505+
) {
506+
// Fast path: nothing to lint.
if !contains_text_flow_control_chars(text.as_str()) {
507+
return;
508+
}
509+
// `padding` = bytes preceding the literal's contents within `span`;
// `point_at_inner_spans` = whether each offending character gets its own label.
let (padding, point_at_inner_spans) = match lit_kind {
510+
// account for `"` or `'`
511+
token::LitKind::Str | token::LitKind::Char => (1, true),
512+
// account for `c"`
513+
token::LitKind::CStr => (2, true),
514+
// account for `r###"`
515+
token::LitKind::StrRaw(n) => (n as u32 + 2, true),
516+
// account for `cr###"`
517+
token::LitKind::CStrRaw(n) => (n as u32 + 3, true),
518+
// suppress bad literals.
519+
token::LitKind::Err(_) => return,
520+
// Be conservative just in case new literals do support these.
521+
_ => (0, false),
522+
};
523+
self.report_text_direction_codepoint(
524+
text.as_str(),
525+
span,
526+
padding,
527+
point_at_inner_spans,
528+
label,
529+
);
530+
}
531+
532+
/// Buffers a `TEXT_DIRECTION_CODEPOINT_IN_LITERAL` lint for the text-flow control
/// characters contained in `text`.
///
/// * `text` - the text to scan.
/// * `span` - the span the lint is buffered at; also used as `span_label`.
/// * `padding` - byte offset added to each character index in `text` when computing
///   that character's position relative to `span.lo()`.
/// * `point_at_inner_spans` - when `true`, the per-character spans are also passed as
///   `labels` and `escape` is set (provided at least one character was found).
/// * `label` - description of the linted item, stored in the diagnostic.
fn report_text_direction_codepoint(
533+
&self,
534+
text: &str,
535+
span: Span,
536+
padding: u32,
537+
point_at_inner_spans: bool,
538+
label: &str,
539+
) {
540+
// Obtain the `Span`s for each of the forbidden chars.
541+
let spans: Vec<_> = text
542+
.char_indices()
543+
.filter_map(|(i, c)| {
544+
TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
545+
// Start of this character: byte index within `text`, shifted by `padding`,
// measured from the start of `span`.
let lo = span.lo() + BytePos(i as u32 + padding);
546+
// The character's span covers exactly its UTF-8 encoded length.
(c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
547+
})
548+
})
549+
.collect();
550+
551+
let count = spans.len();
552+
// NOTE(review): `then_some` evaluates its argument eagerly, so `spans` is cloned even
// when `point_at_inner_spans` is false; `.then(|| spans.clone())` would avoid that.
let labels = point_at_inner_spans.then_some(spans.clone());
553+
554+
// The lint is buffered against the crate root node id rather than a specific AST node.
self.psess.buffer_lint(
555+
TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
556+
span,
557+
ast::CRATE_NODE_ID,
558+
BuiltinLintDiag::HiddenUnicodeCodepoints {
559+
label: label.to_string(),
560+
count,
561+
span_label: span,
562+
labels,
563+
// Only select the "escape" variant when inner spans are pointed at and at least
// one offending character was actually found.
escape: point_at_inner_spans && !spans.is_empty(),
564+
spans,
565+
},
566+
);
567+
}
568+
484569
fn validate_frontmatter(
485570
&self,
486571
start: BytePos,

0 commit comments

Comments
 (0)