Skip to content

Commit 589388b

Browse files
committed
[Clang][Comments] Allow HTML tags across multiple lines
1 parent 5b5b241 commit 589388b

File tree

4 files changed

+211
-17
lines changed

4 files changed

+211
-17
lines changed

clang/lib/AST/CommentLexer.cpp

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,15 @@ const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
196196
return BufferEnd;
197197
}
198198

199+
// Scans forward from BufferPtr and returns a pointer to the first character
// for which isHorizontalWhitespace() is false. If every character in
// [BufferPtr, BufferEnd) is accepted, BufferEnd itself is returned; the
// function never reads at or past BufferEnd.
// NOTE(review): presumably newlines are not "horizontal" whitespace, so the
// scan stops at a line break -- confirm against isHorizontalWhitespace().
const char *skipHorizontalWhitespace(const char *BufferPtr,
200+
const char *BufferEnd) {
201+
for (; BufferPtr != BufferEnd; ++BufferPtr) {
202+
if (!isHorizontalWhitespace(*BufferPtr))
203+
return BufferPtr;
204+
}
205+
return BufferEnd;
206+
}
207+
199208
// Returns true when skipWhitespace() runs all the way to BufferEnd, i.e. it
// finds no character to stop at in [BufferPtr, BufferEnd).
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
200209
return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
201210
}
@@ -637,17 +646,41 @@ void Lexer::setupAndLexHTMLStartTag(Token &T) {
637646
formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
638647
T.setHTMLTagStartName(Name);
639648

640-
BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
649+
BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
650+
if (BufferPtr == CommentEnd) { // in BCPL comments
651+
State = LS_HTMLStartTag;
652+
return;
653+
}
641654

642655
const char C = *BufferPtr;
643656
if (BufferPtr != CommentEnd &&
644-
(C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
657+
(C == '>' || C == '/' || isVerticalWhitespace(C) ||
658+
isHTMLIdentifierStartingCharacter(C)))
645659
State = LS_HTMLStartTag;
646660
}
647661

648662
void Lexer::lexHTMLStartTag(Token &T) {
649663
assert(State == LS_HTMLStartTag);
650664

665+
// Skip leading whitespace and comment decorations
666+
while (isVerticalWhitespace(*BufferPtr)) {
667+
BufferPtr = skipNewline(BufferPtr, CommentEnd);
668+
669+
if (CommentState == LCS_InsideCComment)
670+
skipLineStartingDecorations();
671+
672+
BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
673+
if (BufferPtr == CommentEnd) {
674+
// HTML starting tags must be defined in a single comment block.
675+
// It's likely a user-error where they forgot to terminate the comment.
676+
State = LS_Normal;
677+
// Since at least one newline was skipped and one token needs to be lexed,
678+
// return a newline.
679+
formTokenWithChars(T, BufferPtr, tok::newline);
680+
return;
681+
}
682+
}
683+
651684
const char *TokenPtr = BufferPtr;
652685
char C = *TokenPtr;
653686
if (isHTMLIdentifierCharacter(C)) {
@@ -693,14 +726,13 @@ void Lexer::lexHTMLStartTag(Token &T) {
693726

694727
// Now look ahead and return to normal state if we don't see any HTML tokens
695728
// ahead.
696-
BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
729+
BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
697730
if (BufferPtr == CommentEnd) {
698-
State = LS_Normal;
699731
return;
700732
}
701733

702734
C = *BufferPtr;
703-
if (!isHTMLIdentifierStartingCharacter(C) &&
735+
if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(C) &&
704736
C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
705737
State = LS_Normal;
706738
return;
@@ -774,8 +806,17 @@ void Lexer::lex(Token &T) {
774806
BufferPtr++;
775807

776808
CommentState = LCS_InsideBCPLComment;
777-
if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
809+
switch (State) {
810+
case LS_VerbatimBlockFirstLine:
811+
case LS_VerbatimBlockBody:
812+
break;
813+
case LS_HTMLStartTag:
814+
BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
815+
break;
816+
default:
778817
State = LS_Normal;
818+
break;
819+
}
779820
CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
780821
goto again;
781822
}
@@ -807,6 +848,14 @@ void Lexer::lex(Token &T) {
807848
while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
808849
EndWhitespace++;
809850

851+
// When lexing the start of an HTML tag (i.e. going through the attributes)
852+
// there won't be any newlines generated.
853+
if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
854+
CommentState = LCS_BeforeComment;
855+
BufferPtr = EndWhitespace;
856+
goto again;
857+
}
858+
810859
// Turn any whitespace between comments (and there is only whitespace
811860
// between them -- guaranteed by comment extraction) into a newline. We
812861
// have two newlines between C comments in total (first one was synthesized
@@ -829,6 +878,14 @@ void Lexer::lex(Token &T) {
829878
BufferPtr += 2;
830879
assert(BufferPtr <= BufferEnd);
831880

881+
// When lexing the start of an HTML tag (i.e. going through the
882+
// attributes) there won't be any newlines generated - whitespace still
883+
// needs to be skipped.
884+
if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
885+
CommentState = LCS_BetweenComments;
886+
goto again;
887+
}
888+
832889
// Synthesize newline just after the C comment, regardless of whether there is
833890
// actually a newline.
834891
formTokenWithChars(T, BufferPtr, tok::newline);

clang/test/AST/ast-dump-comment.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,19 @@ int Test_HTMLTagComment;
9191
// CHECK-NEXT: TextComment{{.*}} Text=" "
9292
// CHECK-NEXT: HTMLStartTagComment{{.*}} Name="br" SelfClosing
9393

94+
/// <a
95+
/// href="foo"
96+
/// >Aaa</a>b
97+
int Test_HTMLTagMultilineBCPL;
98+
// CHECK: VarDecl{{.*}}Test_HTMLTagMultilineBCPL
99+
// CHECK-NEXT: FullComment
100+
// CHECK-NEXT: ParagraphComment
101+
// CHECK-NEXT: TextComment{{.*}} Text=" "
102+
// CHECK-NEXT: HTMLStartTagComment{{.*}} Name="a" Attrs: "href="foo"
103+
// CHECK-NEXT: TextComment{{.*}} Text="Aaa"
104+
// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="a"
105+
// CHECK-NEXT: TextComment{{.*}} Text="b"
106+
94107
/// \verbatim
95108
/// Aaa
96109
/// \endverbatim

clang/unittests/AST/CommentLexer.cpp

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1453,6 +1453,129 @@ TEST_F(CommentLexerTest, HTML19) {
14531453
ASSERT_EQ(tok::newline, Toks[2].getKind());
14541454
}
14551455

1456+
// Lexes an <a> start tag whose attributes are spread over five consecutive
// line comments, two of which hold nothing but whitespace. The expected
// stream has 11 tokens: the leading " " text, the start tag, its attribute
// tokens, '>', the "text" body, the </a> end tag with its '>', and a single
// trailing newline. No newline token is expected between the attributes.
TEST_F(CommentLexerTest, HTML20) {
1457+
const char *Source = "// <a\n"
1458+
"// \n"
1459+
"// href=\"foo\"\n"
1460+
"// \n"
1461+
"// bar>text</a>";
1462+
1463+
std::vector<Token> Toks;
1464+
1465+
lexString(Source, Toks);
1466+
1467+
ASSERT_EQ(11U, Toks.size());
1468+
1469+
ASSERT_EQ(tok::text, Toks[0].getKind());
1470+
ASSERT_EQ(StringRef(" "), Toks[0].getText());
1471+
1472+
ASSERT_EQ(tok::html_start_tag, Toks[1].getKind());
1473+
ASSERT_EQ(StringRef("a"), Toks[1].getHTMLTagStartName());
1474+
1475+
ASSERT_EQ(tok::html_ident, Toks[2].getKind());
1476+
ASSERT_EQ(StringRef("href"), Toks[2].getHTMLIdent());
1477+
1478+
ASSERT_EQ(tok::html_equals, Toks[3].getKind());
1479+
1480+
ASSERT_EQ(tok::html_quoted_string, Toks[4].getKind());
1481+
ASSERT_EQ(StringRef("foo"), Toks[4].getHTMLQuotedString());
1482+
1483+
ASSERT_EQ(tok::html_ident, Toks[5].getKind());
1484+
ASSERT_EQ(StringRef("bar"), Toks[5].getHTMLIdent());
1485+
1486+
ASSERT_EQ(tok::html_greater, Toks[6].getKind());
1487+
1488+
ASSERT_EQ(tok::text, Toks[7].getKind());
1489+
ASSERT_EQ(StringRef("text"), Toks[7].getText());
1490+
1491+
ASSERT_EQ(tok::html_end_tag, Toks[8].getKind());
1492+
ASSERT_EQ(StringRef("a"), Toks[8].getHTMLTagEndName());
1493+
1494+
ASSERT_EQ(tok::html_greater, Toks[9].getKind());
1495+
1496+
ASSERT_EQ(tok::newline, Toks[10].getKind());
1497+
}
1498+
1499+
// Same multi-line <a> start tag as HTML20, but written inside one block
// comment whose continuation lines begin with a '*' decoration. 15 tokens are
// expected: a newline for the opening line, the " " text, the start tag with
// its attribute tokens and '>', the "text" body, the end tag with its '>',
// and then newline, " " text, newline, newline for the rest of the comment.
// Again no newline token is expected between the tag's attributes.
TEST_F(CommentLexerTest, HTML21) {
1500+
const char *Source = "/**\n"
1501+
" * <a\n"
1502+
" * \n"
1503+
" * href=\"foo\"\n"
1504+
" * \n"
1505+
" * bar>text</a>\n"
1506+
" */";
1507+
1508+
std::vector<Token> Toks;
1509+
1510+
lexString(Source, Toks);
1511+
1512+
ASSERT_EQ(15U, Toks.size());
1513+
1514+
ASSERT_EQ(tok::newline, Toks[0].getKind());
1515+
1516+
ASSERT_EQ(tok::text, Toks[1].getKind());
1517+
ASSERT_EQ(StringRef(" "), Toks[1].getText());
1518+
1519+
ASSERT_EQ(tok::html_start_tag, Toks[2].getKind());
1520+
ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName());
1521+
1522+
ASSERT_EQ(tok::html_ident, Toks[3].getKind());
1523+
ASSERT_EQ(StringRef("href"), Toks[3].getHTMLIdent());
1524+
1525+
ASSERT_EQ(tok::html_equals, Toks[4].getKind());
1526+
1527+
ASSERT_EQ(tok::html_quoted_string, Toks[5].getKind());
1528+
ASSERT_EQ(StringRef("foo"), Toks[5].getHTMLQuotedString());
1529+
1530+
ASSERT_EQ(tok::html_ident, Toks[6].getKind());
1531+
ASSERT_EQ(StringRef("bar"), Toks[6].getHTMLIdent());
1532+
1533+
ASSERT_EQ(tok::html_greater, Toks[7].getKind());
1534+
1535+
ASSERT_EQ(tok::text, Toks[8].getKind());
1536+
ASSERT_EQ(StringRef("text"), Toks[8].getText());
1537+
1538+
ASSERT_EQ(tok::html_end_tag, Toks[9].getKind());
1539+
ASSERT_EQ(StringRef("a"), Toks[9].getHTMLTagEndName());
1540+
1541+
ASSERT_EQ(tok::html_greater, Toks[10].getKind());
1542+
1543+
ASSERT_EQ(tok::newline, Toks[11].getKind());
1544+
1545+
ASSERT_EQ(tok::text, Toks[12].getKind());
1546+
ASSERT_EQ(StringRef(" "), Toks[12].getText());
1547+
1548+
ASSERT_EQ(tok::newline, Toks[13].getKind());
1549+
1550+
ASSERT_EQ(tok::newline, Toks[14].getKind());
1551+
}
1552+
1553+
// Covers a start tag that is never closed: the block comment ends right after
// the line holding the tag name. The lexer is expected to still emit the
// html_start_tag token for "a" and then produce only newline tokens
// (6 tokens in total), with no attribute or '>' tokens.
TEST_F(CommentLexerTest, HTML22) {
1554+
const char *Source = "/**\n"
1555+
" * <a\n"
1556+
" */";
1557+
1558+
std::vector<Token> Toks;
1559+
1560+
lexString(Source, Toks);
1561+
1562+
ASSERT_EQ(6U, Toks.size());
1563+
1564+
ASSERT_EQ(tok::newline, Toks[0].getKind());
1565+
1566+
ASSERT_EQ(tok::text, Toks[1].getKind());
1567+
ASSERT_EQ(StringRef(" "), Toks[1].getText());
1568+
1569+
ASSERT_EQ(tok::html_start_tag, Toks[2].getKind());
1570+
ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName());
1571+
1572+
ASSERT_EQ(tok::newline, Toks[3].getKind());
1573+
1574+
ASSERT_EQ(tok::newline, Toks[4].getKind());
1575+
1576+
ASSERT_EQ(tok::newline, Toks[5].getKind());
1577+
}
1578+
14561579
TEST_F(CommentLexerTest, NotAKnownHTMLTag1) {
14571580
const char *Source = "// <tag>";
14581581

clang/unittests/AST/CommentParser.cpp

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,9 +1065,10 @@ TEST_F(CommentParserTest, InlineCommand5) {
10651065

10661066
TEST_F(CommentParserTest, HTML1) {
10671067
const char *Sources[] = {
1068-
"// <a",
1069-
"// <a>",
1070-
"// <a >"
1068+
"// <a",
1069+
"// <a>",
1070+
"// <a >",
1071+
"// <a\n// >",
10711072
};
10721073

10731074
for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1088,8 +1089,9 @@ TEST_F(CommentParserTest, HTML1) {
10881089

10891090
TEST_F(CommentParserTest, HTML2) {
10901091
const char *Sources[] = {
1091-
"// <br/>",
1092-
"// <br />"
1092+
"// <br/>",
1093+
"// <br />",
1094+
"// <br \n// />",
10931095
};
10941096

10951097
for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1110,10 +1112,8 @@ TEST_F(CommentParserTest, HTML2) {
11101112

11111113
TEST_F(CommentParserTest, HTML3) {
11121114
const char *Sources[] = {
1113-
"// <a href",
1114-
"// <a href ",
1115-
"// <a href>",
1116-
"// <a href >",
1115+
"// <a href", "// <a href ", "// <a href>",
1116+
"// <a href >", "// <a \n// href >",
11171117
};
11181118

11191119
for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1134,8 +1134,9 @@ TEST_F(CommentParserTest, HTML3) {
11341134

11351135
TEST_F(CommentParserTest, HTML4) {
11361136
const char *Sources[] = {
1137-
"// <a href=\"bbb\"",
1138-
"// <a href=\"bbb\">",
1137+
"// <a href=\"bbb\"",
1138+
"// <a href=\"bbb\">",
1139+
"// <a \n// href=\"bbb\">",
11391140
};
11401141

11411142
for (size_t i = 0, e = std::size(Sources); i != e; i++) {

0 commit comments

Comments
 (0)