[Clang][Comments] Allow HTML tags across multiple lines (llvm#120843)

Nerixyz · web-flow · commit 7d669b7c25e1 · 2025-02-05T08:46:14.000-05:00
HTML starting tags that span multiple lines were previously not allowed (or rather, only the starting line was lexed as HTML). Doxygen allows those tags. This PR allows the starting tags to span multiple lines. They can't span multiple (C-)Comments, though (it's likely a user-error). Multiple BCPL comments are fine as those are single lines (shown below). Example: ```c /// <a /// href="foo" /// >Aaa</a>b int Test; ``` Fixes llvm#28321.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
@@ -149,6 +149,8 @@ Bug Fixes to AST Handling
 Miscellaneous Bug Fixes
 ^^^^^^^^^^^^^^^^^^^^^^^
 
+- HTML tags in comments that span multiple lines are now parsed correctly by Clang's comment parser. (#GH120843)
+
 Miscellaneous Clang Crashes Fixed
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/lib/AST/CommentLexer.cpp b/clang/lib/AST/CommentLexer.cpp
@@ -196,6 +196,15 @@ const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
   return BufferEnd;
 }
 
+const char *skipHorizontalWhitespace(const char *BufferPtr,
+                                     const char *BufferEnd) {
+  for (; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isHorizontalWhitespace(*BufferPtr))
+      return BufferPtr;
+  }
+  return BufferEnd;
+}
+
 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
 }
@@ -637,17 +646,41 @@ void Lexer::setupAndLexHTMLStartTag(Token &T) {
   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
   T.setHTMLTagStartName(Name);
 
-  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
+  BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
+  if (BufferPtr == CommentEnd) { // in BCPL comments
+    State = LS_HTMLStartTag;
+    return;
+  }
 
   const char C = *BufferPtr;
   if (BufferPtr != CommentEnd &&
-      (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
+      (C == '>' || C == '/' || isVerticalWhitespace(C) ||
+       isHTMLIdentifierStartingCharacter(C)))
     State = LS_HTMLStartTag;
 }
 
 void Lexer::lexHTMLStartTag(Token &T) {
   assert(State == LS_HTMLStartTag);
 
+  // Skip leading whitespace and comment decorations
+  while (isVerticalWhitespace(*BufferPtr)) {
+    BufferPtr = skipNewline(BufferPtr, CommentEnd);
+
+    if (CommentState == LCS_InsideCComment)
+      skipLineStartingDecorations();
+
+    BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
+    if (BufferPtr == CommentEnd) {
+      // HTML starting tags must be defined in a single comment block.
+      // It's likely a user-error where they forgot to terminate the comment.
+      State = LS_Normal;
+      // Since at least one newline was skipped and one token needs to be lexed,
+      // return a newline.
+      formTokenWithChars(T, BufferPtr, tok::newline);
+      return;
+    }
+  }
+
   const char *TokenPtr = BufferPtr;
   char C = *TokenPtr;
   if (isHTMLIdentifierCharacter(C)) {
@@ -693,14 +726,13 @@ void Lexer::lexHTMLStartTag(Token &T) {
 
   // Now look ahead and return to normal state if we don't see any HTML tokens
   // ahead.
-  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
+  BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
   if (BufferPtr == CommentEnd) {
-    State = LS_Normal;
     return;
   }
 
   C = *BufferPtr;
-  if (!isHTMLIdentifierStartingCharacter(C) &&
+  if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(C) &&
       C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
     State = LS_Normal;
     return;
@@ -774,8 +806,17 @@ void Lexer::lex(Token &T) {
         BufferPtr++;
 
       CommentState = LCS_InsideBCPLComment;
-      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
+      switch (State) {
+      case LS_VerbatimBlockFirstLine:
+      case LS_VerbatimBlockBody:
+        break;
+      case LS_HTMLStartTag:
+        BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
+        break;
+      default:
         State = LS_Normal;
+        break;
+      }
       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
       goto again;
     }
@@ -807,6 +848,14 @@ void Lexer::lex(Token &T) {
     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
       EndWhitespace++;
 
+    // When lexing the start of an HTML tag (i.e. going through the attributes)
+    // there won't be any newlines generated.
+    if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
+      CommentState = LCS_BeforeComment;
+      BufferPtr = EndWhitespace;
+      goto again;
+    }
+
     // Turn any whitespace between comments (and there is only whitespace
     // between them -- guaranteed by comment extraction) into a newline.  We
     // have two newlines between C comments in total (first one was synthesized
@@ -829,6 +878,14 @@ void Lexer::lex(Token &T) {
         BufferPtr += 2;
         assert(BufferPtr <= BufferEnd);
 
+        // When lexing the start of an HTML tag (i.e. going through the
+        // attributes) there won't be any newlines generated - whitespace still
+        // needs to be skipped.
+        if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
+          CommentState = LCS_BetweenComments;
+          goto again;
+        }
+
         // Synthenize newline just after the C comment, regardless if there is
         // actually a newline.
         formTokenWithChars(T, BufferPtr, tok::newline);
diff --git a/clang/test/AST/ast-dump-comment.cpp b/clang/test/AST/ast-dump-comment.cpp
@@ -91,6 +91,19 @@ int Test_HTMLTagComment;
 // CHECK-NEXT:       TextComment{{.*}} Text=" "
 // CHECK-NEXT:       HTMLStartTagComment{{.*}} Name="br" SelfClosing
 
+/// <a
+///     href="foo"
+/// >Aaa</a>b
+int Test_HTMLTagMultilineBCPL;
+// CHECK:      VarDecl{{.*}}Test_HTMLTagMultilineBCPL
+// CHECK-NEXT:   FullComment
+// CHECK-NEXT:     ParagraphComment
+// CHECK-NEXT:       TextComment{{.*}} Text=" "
+// CHECK-NEXT:       HTMLStartTagComment{{.*}} Name="a" Attrs:  "href="foo"
+// CHECK-NEXT:       TextComment{{.*}} Text="Aaa"
+// CHECK-NEXT:       HTMLEndTagComment{{.*}} Name="a"
+// CHECK-NEXT:       TextComment{{.*}} Text="b"
+
 /// \verbatim
 /// Aaa
 /// \endverbatim
diff --git a/clang/unittests/AST/CommentLexer.cpp b/clang/unittests/AST/CommentLexer.cpp
@@ -1453,6 +1453,160 @@ TEST_F(CommentLexerTest, HTML19) {
   ASSERT_EQ(tok::newline,      Toks[2].getKind());
 }
 
+TEST_F(CommentLexerTest, HTML20) {
+  const char *Source = "// <a\n"
+                       "// \n"
+                       "// href=\"foo\"\n"
+                       "// \n"
+                       "// bar>text</a>";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(11U, Toks.size());
+
+  ASSERT_EQ(tok::text, Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+  ASSERT_EQ(tok::html_start_tag, Toks[1].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[1].getHTMLTagStartName());
+
+  ASSERT_EQ(tok::html_ident, Toks[2].getKind());
+  ASSERT_EQ(StringRef("href"), Toks[2].getHTMLIdent());
+
+  ASSERT_EQ(tok::html_equals, Toks[3].getKind());
+
+  ASSERT_EQ(tok::html_quoted_string, Toks[4].getKind());
+  ASSERT_EQ(StringRef("foo"), Toks[4].getHTMLQuotedString());
+
+  ASSERT_EQ(tok::html_ident, Toks[5].getKind());
+  ASSERT_EQ(StringRef("bar"), Toks[5].getHTMLIdent());
+
+  ASSERT_EQ(tok::html_greater, Toks[6].getKind());
+
+  ASSERT_EQ(tok::text, Toks[7].getKind());
+  ASSERT_EQ(StringRef("text"), Toks[7].getText());
+
+  ASSERT_EQ(tok::html_end_tag, Toks[8].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[8].getHTMLTagEndName());
+
+  ASSERT_EQ(tok::html_greater, Toks[9].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[10].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML21) {
+  const char *Source = "/**\n"
+                       " * <a\n"
+                       " * \n"
+                       " * href=\"foo\"\n"
+                       " * \n"
+                       " * bar>text</a>\n"
+                       " */";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(15U, Toks.size());
+
+  ASSERT_EQ(tok::newline, Toks[0].getKind());
+
+  ASSERT_EQ(tok::text, Toks[1].getKind());
+  ASSERT_EQ(StringRef(" "), Toks[1].getText());
+
+  ASSERT_EQ(tok::html_start_tag, Toks[2].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName());
+
+  ASSERT_EQ(tok::html_ident, Toks[3].getKind());
+  ASSERT_EQ(StringRef("href"), Toks[3].getHTMLIdent());
+
+  ASSERT_EQ(tok::html_equals, Toks[4].getKind());
+
+  ASSERT_EQ(tok::html_quoted_string, Toks[5].getKind());
+  ASSERT_EQ(StringRef("foo"), Toks[5].getHTMLQuotedString());
+
+  ASSERT_EQ(tok::html_ident, Toks[6].getKind());
+  ASSERT_EQ(StringRef("bar"), Toks[6].getHTMLIdent());
+
+  ASSERT_EQ(tok::html_greater, Toks[7].getKind());
+
+  ASSERT_EQ(tok::text, Toks[8].getKind());
+  ASSERT_EQ(StringRef("text"), Toks[8].getText());
+
+  ASSERT_EQ(tok::html_end_tag, Toks[9].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[9].getHTMLTagEndName());
+
+  ASSERT_EQ(tok::html_greater, Toks[10].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[11].getKind());
+
+  ASSERT_EQ(tok::text, Toks[12].getKind());
+  ASSERT_EQ(StringRef(" "), Toks[12].getText());
+
+  ASSERT_EQ(tok::newline, Toks[13].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[14].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML22) {
+  const char *Source = "/**\n"
+                       " * <a\n"
+                       " */";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(6U, Toks.size());
+
+  ASSERT_EQ(tok::newline, Toks[0].getKind());
+
+  ASSERT_EQ(tok::text, Toks[1].getKind());
+  ASSERT_EQ(StringRef(" "), Toks[1].getText());
+
+  ASSERT_EQ(tok::html_start_tag, Toks[2].getKind());
+  ASSERT_EQ(StringRef("a"), Toks[2].getHTMLTagStartName());
+
+  ASSERT_EQ(tok::newline, Toks[3].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[4].getKind());
+
+  ASSERT_EQ(tok::newline, Toks[5].getKind());
+}
+
+TEST_F(CommentLexerTest, HTML23) {
+  // NOTE: "//<" is considered a comment start
+  const char *Source = "// <\n"
+                       "// a\n"
+                       "// >";
+
+  std::vector<Token> Toks;
+
+  lexString(Source, Toks);
+
+  ASSERT_EQ(7U, Toks.size());
+
+  ASSERT_EQ(tok::text, Toks[0].getKind());
+  ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+  ASSERT_EQ(tok::text, Toks[1].getKind());
+  ASSERT_EQ(StringRef("<"), Toks[1].getText());
+
+  ASSERT_EQ(tok::newline, Toks[2].getKind());
+
+  ASSERT_EQ(tok::text, Toks[3].getKind());
+  ASSERT_EQ(StringRef(" a"), Toks[3].getText());
+
+  ASSERT_EQ(tok::newline, Toks[4].getKind());
+
+  ASSERT_EQ(tok::text, Toks[5].getKind());
+  ASSERT_EQ(StringRef(" >"), Toks[5].getText());
+
+  ASSERT_EQ(tok::newline, Toks[6].getKind());
+}
+
 TEST_F(CommentLexerTest, NotAKnownHTMLTag1) {
   const char *Source = "// <tag>";
 
diff --git a/clang/unittests/AST/CommentParser.cpp b/clang/unittests/AST/CommentParser.cpp
@@ -1065,9 +1065,10 @@ TEST_F(CommentParserTest, InlineCommand5) {
 
 TEST_F(CommentParserTest, HTML1) {
   const char *Sources[] = {
-    "// <a",
-    "// <a>",
-    "// <a >"
+      "// <a",
+      "// <a>",
+      "// <a >",
+      "// <a\n// >",
   };
 
   for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1088,8 +1089,9 @@ TEST_F(CommentParserTest, HTML1) {
 
 TEST_F(CommentParserTest, HTML2) {
   const char *Sources[] = {
-    "// <br/>",
-    "// <br />"
+      "// <br/>",
+      "// <br />",
+      "// <br \n// />",
   };
 
   for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1110,10 +1112,8 @@ TEST_F(CommentParserTest, HTML2) {
 
 TEST_F(CommentParserTest, HTML3) {
   const char *Sources[] = {
-    "// <a href",
-    "// <a href ",
-    "// <a href>",
-    "// <a href >",
+      "// <a href",   "// <a href ",       "// <a href>",
+      "// <a href >", "// <a \n// href >",
   };
 
   for (size_t i = 0, e = std::size(Sources); i != e; i++) {
@@ -1134,8 +1134,9 @@ TEST_F(CommentParserTest, HTML3) {
 
 TEST_F(CommentParserTest, HTML4) {
   const char *Sources[] = {
-    "// <a href=\"bbb\"",
-    "// <a href=\"bbb\">",
+      "// <a href=\"bbb\"",
+      "// <a href=\"bbb\">",
+      "// <a \n// href=\"bbb\">",
   };
 
   for (size_t i = 0, e = std::size(Sources); i != e; i++) {