Skip to content

Commit 7d669b7

Browse files
authored
[Clang][Comments] Allow HTML tags across multiple lines (llvm#120843)
HTML starting tags that span multiple lines were previously not allowed (or rather, only the starting line was lexed as HTML). Doxygen allows those tags. This PR allows starting tags to span multiple lines. They can't span multiple C comments, though (that is likely a user error, such as a forgotten comment terminator). Spanning multiple consecutive BCPL (`//`) comments is fine, since each of those is a single line (shown below).

Example:

```c
/// <a
/// href="foo"
/// >Aaa</a>b
int Test;
```

Fixes llvm#28321.
1 parent bcfd9f8 commit 7d669b7

File tree

5 files changed

+244
-17
lines changed

5 files changed

+244
-17
lines changed

clang/docs/ReleaseNotes.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,8 @@ Bug Fixes to AST Handling
149149
Miscellaneous Bug Fixes
150150
^^^^^^^^^^^^^^^^^^^^^^^
151151

152+
- HTML tags that span multiple lines in comments are now parsed correctly by Clang's comment parser. (#GH120843)
153+
152154
Miscellaneous Clang Crashes Fixed
153155
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
154156

clang/lib/AST/CommentLexer.cpp

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,15 @@ const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
196196
return BufferEnd;
197197
}
198198

199+
const char *skipHorizontalWhitespace(const char *BufferPtr,
200+
const char *BufferEnd) {
201+
for (; BufferPtr != BufferEnd; ++BufferPtr) {
202+
if (!isHorizontalWhitespace(*BufferPtr))
203+
return BufferPtr;
204+
}
205+
return BufferEnd;
206+
}
207+
199208
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
200209
return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
201210
}
@@ -637,17 +646,41 @@ void Lexer::setupAndLexHTMLStartTag(Token &T) {
637646
formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
638647
T.setHTMLTagStartName(Name);
639648

640-
BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
649+
BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
650+
if (BufferPtr == CommentEnd) { // in BCPL comments
651+
State = LS_HTMLStartTag;
652+
return;
653+
}
641654

642655
const char C = *BufferPtr;
643656
if (BufferPtr != CommentEnd &&
644-
(C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
657+
(C == '>' || C == '/' || isVerticalWhitespace(C) ||
658+
isHTMLIdentifierStartingCharacter(C)))
645659
State = LS_HTMLStartTag;
646660
}
647661

648662
void Lexer::lexHTMLStartTag(Token &T) {
649663
assert(State == LS_HTMLStartTag);
650664

665+
// Skip leading whitespace and comment decorations
666+
while (isVerticalWhitespace(*BufferPtr)) {
667+
BufferPtr = skipNewline(BufferPtr, CommentEnd);
668+
669+
if (CommentState == LCS_InsideCComment)
670+
skipLineStartingDecorations();
671+
672+
BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
673+
if (BufferPtr == CommentEnd) {
674+
// HTML starting tags must be defined in a single comment block.
675+
// It's likely a user-error where they forgot to terminate the comment.
676+
State = LS_Normal;
677+
// Since at least one newline was skipped and one token needs to be lexed,
678+
// return a newline.
679+
formTokenWithChars(T, BufferPtr, tok::newline);
680+
return;
681+
}
682+
}
683+
651684
const char *TokenPtr = BufferPtr;
652685
char C = *TokenPtr;
653686
if (isHTMLIdentifierCharacter(C)) {
@@ -693,14 +726,13 @@ void Lexer::lexHTMLStartTag(Token &T) {
693726

694727
// Now look ahead and return to normal state if we don't see any HTML tokens
695728
// ahead.
696-
BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
729+
BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
697730
if (BufferPtr == CommentEnd) {
698-
State = LS_Normal;
699731
return;
700732
}
701733

702734
C = *BufferPtr;
703-
if (!isHTMLIdentifierStartingCharacter(C) &&
735+
if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(C) &&
704736
C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
705737
State = LS_Normal;
706738
return;
@@ -774,8 +806,17 @@ void Lexer::lex(Token &T) {
774806
BufferPtr++;
775807

776808
CommentState = LCS_InsideBCPLComment;
777-
if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
809+
switch (State) {
810+
case LS_VerbatimBlockFirstLine:
811+
case LS_VerbatimBlockBody:
812+
break;
813+
case LS_HTMLStartTag:
814+
BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
815+
break;
816+
default:
778817
State = LS_Normal;
818+
break;
819+
}
779820
CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
780821
goto again;
781822
}
@@ -807,6 +848,14 @@ void Lexer::lex(Token &T) {
807848
while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
808849
EndWhitespace++;
809850

851+
// When lexing the start of an HTML tag (i.e. going through the attributes)
852+
// there won't be any newlines generated.
853+
if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
854+
CommentState = LCS_BeforeComment;
855+
BufferPtr = EndWhitespace;
856+
goto again;
857+
}
858+
810859
// Turn any whitespace between comments (and there is only whitespace
811860
// between them -- guaranteed by comment extraction) into a newline. We
812861
// have two newlines between C comments in total (first one was synthesized
@@ -829,6 +878,14 @@ void Lexer::lex(Token &T) {
829878
BufferPtr += 2;
830879
assert(BufferPtr <= BufferEnd);
831880

881+
// When lexing the start of an HTML tag (i.e. going through the
882+
// attributes) there won't be any newlines generated - whitespace still
883+
// needs to be skipped.
884+
if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
885+
CommentState = LCS_BetweenComments;
886+
goto again;
887+
}
888+
832889
// Synthesize newline just after the C comment, regardless if there is
833890
// actually a newline.
834891
formTokenWithChars(T, BufferPtr, tok::newline);

clang/test/AST/ast-dump-comment.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,19 @@ int Test_HTMLTagComment;
9191
// CHECK-NEXT: TextComment{{.*}} Text=" "
9292
// CHECK-NEXT: HTMLStartTagComment{{.*}} Name="br" SelfClosing
9393

94+
/// <a
95+
/// href="foo"
96+
/// >Aaa</a>b
97+
int Test_HTMLTagMultilineBCPL;
98+
// CHECK: VarDecl{{.*}}Test_HTMLTagMultilineBCPL
99+
// CHECK-NEXT: FullComment
100+
// CHECK-NEXT: ParagraphComment
101+
// CHECK-NEXT: TextComment{{.*}} Text=" "
102+
// CHECK-NEXT: HTMLStartTagComment{{.*}} Name="a" Attrs: "href="foo"
103+
// CHECK-NEXT: TextComment{{.*}} Text="Aaa"
104+
// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="a"
105+
// CHECK-NEXT: TextComment{{.*}} Text="b"
106+
94107
/// \verbatim
95108
/// Aaa
96109
/// \endverbatim

clang/unittests/AST/CommentLexer.cpp

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1453,6 +1453,160 @@ TEST_F(CommentLexerTest, HTML19) {
14531453
ASSERT_EQ(tok::newline, Toks[2].getKind());
14541454
}
14551455

1456+
TEST_F(CommentLexerTest, HTML20) {
1457+
const char *Source = "// <a\n"
1458+
"// \n"
1459+
"// href=\"foo\"\n"
1460+
"// \n"
1461+
"// bar>text</a>";
1462+
1463+
std::vector<Token> Toks;
1464+
1465+
lexString(Source, Toks);
1466+
1467+
ASSERT_EQ(11U, Toks.size());
1468+
1469+
ASSERT_EQ(tok::text, Toks[0].getKind());
1470+
ASSERT_EQ(StringRef(" "), Toks[0].getText());
1471+
1472+
ASSERT_EQ(tok::html_start_tag, Toks[1].getKind());
1473+
ASSERT_EQ(StringRef("a"), Toks[1].getHTMLTagStartName());
1474+
1475+
ASSERT_EQ(tok::html_ident, Toks[2].getKind());
1476+
ASSERT_EQ(StringRef("href"), Toks[2].getHTMLIdent());
1477+
1478+
ASSERT_EQ(tok::html_equals, Toks[3].getKind());
1479+
1480+
ASSERT_EQ(tok::html_quoted_string, Toks[4].getKind());
1481+
ASSERT_EQ(StringRef("foo"), Toks[4].getHTMLQuotedString());
1482+
1483+
ASSERT_EQ(tok::html_ident, Toks[5].getKind());
1484+
ASSERT_EQ(StringRef("bar"), Toks[5].getHTMLIdent());
1485+
1486+
ASSERT_EQ(tok::html_greater, Toks[6].getKind());
1487+
1488+
ASSERT_EQ(tok::text, Toks[7].getKind());
1489+
ASSERT_EQ(StringRef("text"), Toks[7].getText());
1490+
1491+
ASSERT_EQ(tok::html_end_tag, Toks[8].getKind());
1492+
ASSERT_EQ(StringRef("a"), Toks[8].getHTMLTagEndName());
1493+
1494+
ASSERT_EQ(tok::html_greater, Toks[9].getKind());
1495+
1496+
ASSERT_EQ(tok::newline, Toks[10].getKind());
1497+
}
1498+
1499+
TEST_F(CommentLexerTest, HTML21) {
1500+
const char *Source = "/**\n"
1501+
" * <a\n"
1502+
" * \n"
1503+
" * href=\"foo\"\n"
1504+
" * \n"
1505+
" * bar>text</a>\n"
1506+
" */";
1507+
1508+
std::vector<Token> Toks;
1509+
1510+
lexString(Source, Toks);
1511+
1512+
ASSERT_EQ(15U, Toks.size());
1513+
1514+
ASSERT_EQ(tok::newline, Toks[0].getKind());
1515+
1516+
ASSERT_EQ(tok::text, Toks[1].getKind());
1517+
ASSERT_EQ(StringRef(" "), Toks[1].getText());
1518+
1519+
ASSERT_EQ(tok::html_start_tag, Toks[2].getKind());
1520+
ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName());
1521+
1522+
ASSERT_EQ(tok::html_ident, Toks[3].getKind());
1523+
ASSERT_EQ(StringRef("href"), Toks[3].getHTMLIdent());
1524+
1525+
ASSERT_EQ(tok::html_equals, Toks[4].getKind());
1526+
1527+
ASSERT_EQ(tok::html_quoted_string, Toks[5].getKind());
1528+
ASSERT_EQ(StringRef("foo"), Toks[5].getHTMLQuotedString());
1529+
1530+
ASSERT_EQ(tok::html_ident, Toks[6].getKind());
1531+
ASSERT_EQ(StringRef("bar"), Toks[6].getHTMLIdent());
1532+
1533+
ASSERT_EQ(tok::html_greater, Toks[7].getKind());
1534+
1535+
ASSERT_EQ(tok::text, Toks[8].getKind());
1536+
ASSERT_EQ(StringRef("text"), Toks[8].getText());
1537+
1538+
ASSERT_EQ(tok::html_end_tag, Toks[9].getKind());
1539+
ASSERT_EQ(StringRef("a"), Toks[9].getHTMLTagEndName());
1540+
1541+
ASSERT_EQ(tok::html_greater, Toks[10].getKind());
1542+
1543+
ASSERT_EQ(tok::newline, Toks[11].getKind());
1544+
1545+
ASSERT_EQ(tok::text, Toks[12].getKind());
1546+
ASSERT_EQ(StringRef(" "), Toks[12].getText());
1547+
1548+
ASSERT_EQ(tok::newline, Toks[13].getKind());
1549+
1550+
ASSERT_EQ(tok::newline, Toks[14].getKind());
1551+
}
1552+
1553+
TEST_F(CommentLexerTest, HTML22) {
1554+
const char *Source = "/**\n"
1555+
" * <a\n"
1556+
" */";
1557+
1558+
std::vector<Token> Toks;
1559+
1560+
lexString(Source, Toks);
1561+
1562+
ASSERT_EQ(6U, Toks.size());
1563+
1564+
ASSERT_EQ(tok::newline, Toks[0].getKind());
1565+
1566+
ASSERT_EQ(tok::text, Toks[1].getKind());
1567+
ASSERT_EQ(StringRef(" "), Toks[1].getText());
1568+
1569+
ASSERT_EQ(tok::html_start_tag, Toks[2].getKind());
1570+
ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName());
1571+
1572+
ASSERT_EQ(tok::newline, Toks[3].getKind());
1573+
1574+
ASSERT_EQ(tok::newline, Toks[4].getKind());
1575+
1576+
ASSERT_EQ(tok::newline, Toks[5].getKind());
1577+
}
1578+
1579+
TEST_F(CommentLexerTest, HTML23) {
1580+
// NOTE: "//<" is considered a comment start
1581+
const char *Source = "// <\n"
1582+
"// a\n"
1583+
"// >";
1584+
1585+
std::vector<Token> Toks;
1586+
1587+
lexString(Source, Toks);
1588+
1589+
ASSERT_EQ(7U, Toks.size());
1590+
1591+
ASSERT_EQ(tok::text, Toks[0].getKind());
1592+
ASSERT_EQ(StringRef(" "), Toks[0].getText());
1593+
1594+
ASSERT_EQ(tok::text, Toks[1].getKind());
1595+
ASSERT_EQ(StringRef("<"), Toks[1].getText());
1596+
1597+
ASSERT_EQ(tok::newline, Toks[2].getKind());
1598+
1599+
ASSERT_EQ(tok::text, Toks[3].getKind());
1600+
ASSERT_EQ(StringRef(" a"), Toks[3].getText());
1601+
1602+
ASSERT_EQ(tok::newline, Toks[4].getKind());
1603+
1604+
ASSERT_EQ(tok::text, Toks[5].getKind());
1605+
ASSERT_EQ(StringRef(" >"), Toks[5].getText());
1606+
1607+
ASSERT_EQ(tok::newline, Toks[6].getKind());
1608+
}
1609+
14561610
TEST_F(CommentLexerTest, NotAKnownHTMLTag1) {
14571611
const char *Source = "// <tag>";
14581612

clang/unittests/AST/CommentParser.cpp

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,9 +1065,10 @@ TEST_F(CommentParserTest, InlineCommand5) {
10651065

10661066
TEST_F(CommentParserTest, HTML1) {
10671067
const char *Sources[] = {
1068-
"// <a",
1069-
"// <a>",
1070-
"// <a >"
1068+
"// <a",
1069+
"// <a>",
1070+
"// <a >",
1071+
"// <a\n// >",
10711072
};
10721073

10731074
for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1088,8 +1089,9 @@ TEST_F(CommentParserTest, HTML1) {
10881089

10891090
TEST_F(CommentParserTest, HTML2) {
10901091
const char *Sources[] = {
1091-
"// <br/>",
1092-
"// <br />"
1092+
"// <br/>",
1093+
"// <br />",
1094+
"// <br \n// />",
10931095
};
10941096

10951097
for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1110,10 +1112,8 @@ TEST_F(CommentParserTest, HTML2) {
11101112

11111113
TEST_F(CommentParserTest, HTML3) {
11121114
const char *Sources[] = {
1113-
"// <a href",
1114-
"// <a href ",
1115-
"// <a href>",
1116-
"// <a href >",
1115+
"// <a href", "// <a href ", "// <a href>",
1116+
"// <a href >", "// <a \n// href >",
11171117
};
11181118

11191119
for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1134,8 +1134,9 @@ TEST_F(CommentParserTest, HTML3) {
11341134

11351135
TEST_F(CommentParserTest, HTML4) {
11361136
const char *Sources[] = {
1137-
"// <a href=\"bbb\"",
1138-
"// <a href=\"bbb\">",
1137+
"// <a href=\"bbb\"",
1138+
"// <a href=\"bbb\">",
1139+
"// <a \n// href=\"bbb\">",
11391140
};
11401141

11411142
for (size_t i = 0, e = std::size(Sources); i != e; i++) {

0 commit comments

Comments
 (0)