Skip to content

[Clang][Comments] Allow HTML tags across multiple lines #120843

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -873,6 +873,8 @@ Bug Fixes to AST Handling
Miscellaneous Bug Fixes
^^^^^^^^^^^^^^^^^^^^^^^

- HTML tags in comments that span multiple lines are now parsed correctly by Clang's comment parser. (#GH120843)

Miscellaneous Clang Crashes Fixed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
69 changes: 63 additions & 6 deletions clang/lib/AST/CommentLexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,15 @@ const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
return BufferEnd;
}

/// Scans forward from \p BufferPtr over characters for which
/// isHorizontalWhitespace() is true.
///
/// \returns a pointer to the first character that is not horizontal
/// whitespace, or \p BufferEnd if the whole range consists of such characters
/// (including when the range is empty).
const char *skipHorizontalWhitespace(const char *BufferPtr,
                                     const char *BufferEnd) {
  const char *Cursor = BufferPtr;
  while (Cursor != BufferEnd && isHorizontalWhitespace(*Cursor))
    ++Cursor;
  return Cursor;
}

/// \returns true when skipWhitespace() consumes the entire range
/// [\p BufferPtr, \p BufferEnd), i.e. it stops only at \p BufferEnd.
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
  const char *FirstNonWhitespace = skipWhitespace(BufferPtr, BufferEnd);
  return FirstNonWhitespace == BufferEnd;
}
Expand Down Expand Up @@ -637,17 +646,41 @@ void Lexer::setupAndLexHTMLStartTag(Token &T) {
formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
T.setHTMLTagStartName(Name);

BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
if (BufferPtr == CommentEnd) { // in BCPL comments
State = LS_HTMLStartTag;
return;
}

const char C = *BufferPtr;
if (BufferPtr != CommentEnd &&
(C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
(C == '>' || C == '/' || isVerticalWhitespace(C) ||
isHTMLIdentifierStartingCharacter(C)))
State = LS_HTMLStartTag;
}

void Lexer::lexHTMLStartTag(Token &T) {
assert(State == LS_HTMLStartTag);

// Skip leading whitespace and comment decorations
while (isVerticalWhitespace(*BufferPtr)) {
BufferPtr = skipNewline(BufferPtr, CommentEnd);

if (CommentState == LCS_InsideCComment)
skipLineStartingDecorations();

BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
if (BufferPtr == CommentEnd) {
// HTML starting tags must be defined in a single comment block.
// It's likely a user-error where they forgot to terminate the comment.
State = LS_Normal;
// Since at least one newline was skipped and one token needs to be lexed,
// return a newline.
formTokenWithChars(T, BufferPtr, tok::newline);
return;
}
}

const char *TokenPtr = BufferPtr;
char C = *TokenPtr;
if (isHTMLIdentifierCharacter(C)) {
Expand Down Expand Up @@ -693,14 +726,13 @@ void Lexer::lexHTMLStartTag(Token &T) {

// Now look ahead and return to normal state if we don't see any HTML tokens
// ahead.
BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
if (BufferPtr == CommentEnd) {
State = LS_Normal;
return;
}

C = *BufferPtr;
if (!isHTMLIdentifierStartingCharacter(C) &&
if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(C) &&
C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
State = LS_Normal;
return;
Expand Down Expand Up @@ -774,8 +806,17 @@ void Lexer::lex(Token &T) {
BufferPtr++;

CommentState = LCS_InsideBCPLComment;
if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
switch (State) {
case LS_VerbatimBlockFirstLine:
case LS_VerbatimBlockBody:
break;
case LS_HTMLStartTag:
BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
break;
default:
State = LS_Normal;
break;
}
CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
goto again;
}
Expand Down Expand Up @@ -807,6 +848,14 @@ void Lexer::lex(Token &T) {
while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
EndWhitespace++;

// When lexing the start of an HTML tag (i.e. going through the attributes)
// there won't be any newlines generated.
if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
CommentState = LCS_BeforeComment;
BufferPtr = EndWhitespace;
goto again;
}

// Turn any whitespace between comments (and there is only whitespace
// between them -- guaranteed by comment extraction) into a newline. We
// have two newlines between C comments in total (first one was synthesized
Expand All @@ -829,6 +878,14 @@ void Lexer::lex(Token &T) {
BufferPtr += 2;
assert(BufferPtr <= BufferEnd);

// When lexing the start of an HTML tag (i.e. going through the
// attributes) there won't be any newlines generated - whitespace still
// needs to be skipped.
if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
CommentState = LCS_BetweenComments;
goto again;
}

// Synthesize a newline just after the C comment, regardless of whether there
// is actually a newline.
formTokenWithChars(T, BufferPtr, tok::newline);
Expand Down
13 changes: 13 additions & 0 deletions clang/test/AST/ast-dump-comment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,19 @@ int Test_HTMLTagComment;
// CHECK-NEXT: TextComment{{.*}} Text=" "
// CHECK-NEXT: HTMLStartTagComment{{.*}} Name="br" SelfClosing

/// <a
/// href="foo"
/// >Aaa</a>b
int Test_HTMLTagMultilineBCPL;
// CHECK: VarDecl{{.*}}Test_HTMLTagMultilineBCPL
// CHECK-NEXT: FullComment
// CHECK-NEXT: ParagraphComment
// CHECK-NEXT: TextComment{{.*}} Text=" "
// CHECK-NEXT: HTMLStartTagComment{{.*}} Name="a" Attrs: "href="foo"
// CHECK-NEXT: TextComment{{.*}} Text="Aaa"
// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="a"
// CHECK-NEXT: TextComment{{.*}} Text="b"

/// \verbatim
/// Aaa
/// \endverbatim
Expand Down
154 changes: 154 additions & 0 deletions clang/unittests/AST/CommentLexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1453,6 +1453,160 @@ TEST_F(CommentLexerTest, HTML19) {
ASSERT_EQ(tok::newline, Toks[2].getKind());
}

// An HTML start tag whose attributes are spread over several consecutive
// `//` comment lines, some of which contain only whitespace. The expected
// token stream contains no newline tokens between the tag name and the
// closing '>': the attributes are lexed as if the tag were on one line.
TEST_F(CommentLexerTest, HTML20) {
const char *Source = "// <a\n"
"// \n"
"// href=\"foo\"\n"
"// \n"
"// bar>text</a>";

std::vector<Token> Toks;

lexString(Source, Toks);

ASSERT_EQ(11U, Toks.size());

// Leading space between "//" and "<a".
ASSERT_EQ(tok::text, Toks[0].getKind());
ASSERT_EQ(StringRef(" "), Toks[0].getText());

// "<a" on the first line opens the tag.
ASSERT_EQ(tok::html_start_tag, Toks[1].getKind());
ASSERT_EQ(StringRef("a"), Toks[1].getHTMLTagStartName());

// href="foo" from the third line follows the tag name directly.
ASSERT_EQ(tok::html_ident, Toks[2].getKind());
ASSERT_EQ(StringRef("href"), Toks[2].getHTMLIdent());

ASSERT_EQ(tok::html_equals, Toks[3].getKind());

ASSERT_EQ(tok::html_quoted_string, Toks[4].getKind());
ASSERT_EQ(StringRef("foo"), Toks[4].getHTMLQuotedString());

// Value-less attribute "bar" from the last line.
ASSERT_EQ(tok::html_ident, Toks[5].getKind());
ASSERT_EQ(StringRef("bar"), Toks[5].getHTMLIdent());

// '>' closes the start tag.
ASSERT_EQ(tok::html_greater, Toks[6].getKind());

// Everything after the start tag: "text", then "</a" and its '>'.
ASSERT_EQ(tok::text, Toks[7].getKind());
ASSERT_EQ(StringRef("text"), Toks[7].getText());

ASSERT_EQ(tok::html_end_tag, Toks[8].getKind());
ASSERT_EQ(StringRef("a"), Toks[8].getHTMLTagEndName());

ASSERT_EQ(tok::html_greater, Toks[9].getKind());

// Final newline token at the end of the comment.
ASSERT_EQ(tok::newline, Toks[10].getKind());
}

// Same multi-line start tag as HTML20, but inside a `/** ... */` block
// comment whose lines begin with " * ". The expected token stream again has
// no newline tokens between the tag name and the closing '>'.
TEST_F(CommentLexerTest, HTML21) {
const char *Source = "/**\n"
" * <a\n"
" * \n"
" * href=\"foo\"\n"
" * \n"
" * bar>text</a>\n"
" */";

std::vector<Token> Toks;

lexString(Source, Toks);

ASSERT_EQ(15U, Toks.size());

// Newline for the end of the opening "/**" line.
ASSERT_EQ(tok::newline, Toks[0].getKind());

// Space before "<a" on the second line.
ASSERT_EQ(tok::text, Toks[1].getKind());
ASSERT_EQ(StringRef(" "), Toks[1].getText());

// "<a" opens the tag.
ASSERT_EQ(tok::html_start_tag, Toks[2].getKind());
ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName());

// href="foo" follows the tag name directly, with no newline token between.
ASSERT_EQ(tok::html_ident, Toks[3].getKind());
ASSERT_EQ(StringRef("href"), Toks[3].getHTMLIdent());

ASSERT_EQ(tok::html_equals, Toks[4].getKind());

ASSERT_EQ(tok::html_quoted_string, Toks[5].getKind());
ASSERT_EQ(StringRef("foo"), Toks[5].getHTMLQuotedString());

// Value-less attribute "bar".
ASSERT_EQ(tok::html_ident, Toks[6].getKind());
ASSERT_EQ(StringRef("bar"), Toks[6].getHTMLIdent());

// '>' closes the start tag.
ASSERT_EQ(tok::html_greater, Toks[7].getKind());

// "text", then "</a" and its '>'.
ASSERT_EQ(tok::text, Toks[8].getKind());
ASSERT_EQ(StringRef("text"), Toks[8].getText());

ASSERT_EQ(tok::html_end_tag, Toks[9].getKind());
ASSERT_EQ(StringRef("a"), Toks[9].getHTMLTagEndName());

ASSERT_EQ(tok::html_greater, Toks[10].getKind());

// Once the tag is complete, newline tokens appear again: one after the
// "</a>" line, then a single-space text token and two more newlines for the
// remainder of the comment.
ASSERT_EQ(tok::newline, Toks[11].getKind());

ASSERT_EQ(tok::text, Toks[12].getKind());
ASSERT_EQ(StringRef(" "), Toks[12].getText());

ASSERT_EQ(tok::newline, Toks[13].getKind());

ASSERT_EQ(tok::newline, Toks[14].getKind());
}

// A start tag that is opened ("<a") but never closed before the block comment
// ends. The expected token stream has the start tag followed only by newline
// tokens; no further HTML tokens are produced.
TEST_F(CommentLexerTest, HTML22) {
const char *Source = "/**\n"
" * <a\n"
" */";

std::vector<Token> Toks;

lexString(Source, Toks);

ASSERT_EQ(6U, Toks.size());

// Newline for the end of the opening "/**" line.
ASSERT_EQ(tok::newline, Toks[0].getKind());

// Space before "<a".
ASSERT_EQ(tok::text, Toks[1].getKind());
ASSERT_EQ(StringRef(" "), Toks[1].getText());

// The unterminated start tag is still reported as a start tag.
ASSERT_EQ(tok::html_start_tag, Toks[2].getKind());
ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName());

// Only newline tokens follow until the end of the comment.
ASSERT_EQ(tok::newline, Toks[3].getKind());

ASSERT_EQ(tok::newline, Toks[4].getKind());

ASSERT_EQ(tok::newline, Toks[5].getKind());
}

// A '<' that ends its line, with the tag name and '>' on the following lines,
// is NOT recognised as an HTML tag: every line is expected to lex as plain
// text followed by a newline token.
TEST_F(CommentLexerTest, HTML23) {
// NOTE: "//<" is considered a comment start
const char *Source = "// <\n"
"// a\n"
"// >";

std::vector<Token> Toks;

lexString(Source, Toks);

ASSERT_EQ(7U, Toks.size());

// First line: the leading space and the lone '<' are separate text tokens.
ASSERT_EQ(tok::text, Toks[0].getKind());
ASSERT_EQ(StringRef(" "), Toks[0].getText());

ASSERT_EQ(tok::text, Toks[1].getKind());
ASSERT_EQ(StringRef("<"), Toks[1].getText());

ASSERT_EQ(tok::newline, Toks[2].getKind());

// Second line: " a" is ordinary text, not a tag name.
ASSERT_EQ(tok::text, Toks[3].getKind());
ASSERT_EQ(StringRef(" a"), Toks[3].getText());

ASSERT_EQ(tok::newline, Toks[4].getKind());

// Third line: " >" is ordinary text, not the end of a tag.
ASSERT_EQ(tok::text, Toks[5].getKind());
ASSERT_EQ(StringRef(" >"), Toks[5].getText());

ASSERT_EQ(tok::newline, Toks[6].getKind());
}

TEST_F(CommentLexerTest, NotAKnownHTMLTag1) {
const char *Source = "// <tag>";

Expand Down
23 changes: 12 additions & 11 deletions clang/unittests/AST/CommentParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1065,9 +1065,10 @@ TEST_F(CommentParserTest, InlineCommand5) {

TEST_F(CommentParserTest, HTML1) {
const char *Sources[] = {
"// <a",
"// <a>",
"// <a >"
"// <a",
"// <a>",
"// <a >",
"// <a\n// >",
};

for (size_t i = 0, e = std::size(Sources); i != e; i++) {
Expand All @@ -1088,8 +1089,9 @@ TEST_F(CommentParserTest, HTML1) {

TEST_F(CommentParserTest, HTML2) {
const char *Sources[] = {
"// <br/>",
"// <br />"
"// <br/>",
"// <br />",
"// <br \n// />",
};

for (size_t i = 0, e = std::size(Sources); i != e; i++) {
Expand All @@ -1110,10 +1112,8 @@ TEST_F(CommentParserTest, HTML2) {

TEST_F(CommentParserTest, HTML3) {
const char *Sources[] = {
"// <a href",
"// <a href ",
"// <a href>",
"// <a href >",
"// <a href", "// <a href ", "// <a href>",
"// <a href >", "// <a \n// href >",
};

for (size_t i = 0, e = std::size(Sources); i != e; i++) {
Expand All @@ -1134,8 +1134,9 @@ TEST_F(CommentParserTest, HTML3) {

TEST_F(CommentParserTest, HTML4) {
const char *Sources[] = {
"// <a href=\"bbb\"",
"// <a href=\"bbb\">",
"// <a href=\"bbb\"",
"// <a href=\"bbb\">",
"// <a \n// href=\"bbb\">",
};

for (size_t i = 0, e = std::size(Sources); i != e; i++) {
Expand Down
Loading