-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[demangle] Represent a char array initializer as a string literal. #109021
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This improves the demangling for non-type template arguments that contain string literals. Previously we'd produce char [4]{(char)65, (char)66, (char)67} (which isn't valid C or C++), and now we produce "ABC" The new demangling is always shorter, even when using an escape sequence for every character, and much more readable when the char array contains text.
@llvm/pr-subscribers-libcxxabi Author: Richard Smith (zygoloid) ChangesThis improves the demangling for non-type template arguments that
(which isn't valid C or C++), and now we produce The new demangling is always shorter, even when using an escape sequence Full diff: https://github.com/llvm/llvm-project/pull/109021.diff 3 Files Affected:
diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h
index 3b041efe3aac00..8032c893fbae6e 100644
--- a/libcxxabi/src/demangle/ItaniumDemangle.h
+++ b/libcxxabi/src/demangle/ItaniumDemangle.h
@@ -156,6 +156,8 @@ template <class T, size_t N> class PODSmallVector {
}
};
+class NodeArray;
+
// Base class of all AST nodes. The AST is built by the parser, then is
// traversed by the printLeft/Right functions to produce a demangled string.
class Node {
@@ -293,6 +295,13 @@ class Node {
// implementation.
virtual void printRight(OutputBuffer &) const {}
+ // Print an initializer list of this type. Returns true if we printed a custom
+ // representation, false if nothing has been printed and the default
+ // representation should be used.
+ virtual bool printInitListAsType(OutputBuffer &, const NodeArray &) const {
+ return false;
+ }
+
virtual std::string_view getBaseName() const { return {}; }
// Silence compiler warnings, this dtor will never be called.
@@ -339,6 +348,10 @@ class NodeArray {
FirstElement = false;
}
}
+
+ // Print an array of integer literals as a string literal. Returns whether we
+ // could do so.
+ bool printAsString(OutputBuffer &OB) const;
};
struct NodeArrayNode : Node {
@@ -796,6 +809,15 @@ class ArrayType final : public Node {
OB += "]";
Base->printRight(OB);
}
+
+ bool printInitListAsType(OutputBuffer &OB,
+ const NodeArray &Elements) const override {
+ if (Base->getKind() == KNameType &&
+ static_cast<const NameType *>(Base)->getName() == "char") {
+ return Elements.printAsString(OB);
+ }
+ return false;
+ }
};
class FunctionType final : public Node {
@@ -2225,8 +2247,11 @@ class InitListExpr : public Node {
template<typename Fn> void match(Fn F) const { F(Ty, Inits); }
void printLeft(OutputBuffer &OB) const override {
- if (Ty)
+ if (Ty) {
+ if (Ty->printInitListAsType(OB, Inits))
+ return;
Ty->print(OB);
+ }
OB += '{';
Inits.printWithComma(OB);
OB += '}';
@@ -2433,6 +2458,8 @@ class IntegerLiteral : public Node {
if (Type.size() <= 3)
OB += Type;
}
+
+ std::string_view value() const { return Value; }
};
class RequiresExpr : public Node {
@@ -2604,6 +2631,93 @@ template<typename NodeT> struct NodeKind;
};
#include "ItaniumNodes.def"
+bool NodeArray::printAsString(OutputBuffer &OB) const {
+ auto Fail = [&OB, StartPos = OB.getCurrentPosition()] {
+ OB.setCurrentPosition(StartPos);
+ return false;
+ };
+
+ OB += '"';
+ bool LastWasNumericEscape = false;
+ for (const Node *Element : *this) {
+ if (Element->getKind() != Node::KIntegerLiteral)
+ return Fail();
+ int integer_value = 0;
+ for (char c : static_cast<const IntegerLiteral *>(Element)->value()) {
+ if (c < '0' || c > '9' || integer_value > 25)
+ return Fail();
+ integer_value *= 10;
+ integer_value += c - '0';
+ }
+ if (integer_value > 255)
+ return Fail();
+
+ // Insert a `""` to avoid accidentally extending a numeric escape.
+ if (LastWasNumericEscape) {
+ if ((integer_value >= '0' && integer_value <= '9') ||
+ (integer_value >= 'a' && integer_value <= 'f') ||
+ (integer_value >= 'A' && integer_value <= 'F')) {
+ OB += "\"\"";
+ }
+ }
+
+ LastWasNumericEscape = false;
+
+ // Determine how to print this character.
+ switch (integer_value) {
+ case '\a':
+ OB += "\\a";
+ break;
+ case '\b':
+ OB += "\\b";
+ break;
+ case '\f':
+ OB += "\\f";
+ break;
+ case '\n':
+ OB += "\\n";
+ break;
+ case '\r':
+ OB += "\\r";
+ break;
+ case '\t':
+ OB += "\\t";
+ break;
+ case '\v':
+ OB += "\\v";
+ break;
+
+ case '"':
+ OB += "\\\"";
+ break;
+ case '\\':
+ OB += "\\\\";
+ break;
+
+ default:
+ // We assume that the character is ASCII, and use a numeric escape for all
+ // remaining non-printable ASCII characters.
+ if (integer_value < 32 || integer_value == 127) {
+ constexpr char Hex[] = "0123456789ABCDEF";
+ OB += '\\';
+ if (integer_value > 7)
+ OB += 'x';
+ if (integer_value >= 16)
+ OB += Hex[integer_value >> 4];
+ OB += Hex[integer_value & 0xF];
+ LastWasNumericEscape = true;
+ break;
+ }
+
+ // Assume all remaining characters are directly printable.
+ OB += (char)integer_value;
+ break;
+ }
+ }
+ OB += '"';
+ return true;
+}
+
template <typename Derived, typename Alloc> struct AbstractManglingParser {
const char *First;
const char *Last;
diff --git a/libcxxabi/test/test_demangle.pass.cpp b/libcxxabi/test/test_demangle.pass.cpp
index 77f79e0d40e84f..c8d4ca8637e8da 100644
--- a/libcxxabi/test/test_demangle.pass.cpp
+++ b/libcxxabi/test/test_demangle.pass.cpp
@@ -30037,7 +30037,32 @@ const char* cases[][2] =
// FIXME: This is not valid pointer-to-member syntax.
{"_Z1fIXtl1DmcM7DerivedKiadL_ZN11MoreDerived1zEEn8EEEEvv", "void f<D{(int const Derived::*)(&MoreDerived::z)}>()"},
{"_Z1fIXtl1Edi1nLi42EEEEvv", "void f<E{.n = 42}>()"},
- {"_ZTAXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEE", "template parameter object for S{char [32]{(char)104, (char)101, (char)108, (char)108, (char)111, (char)32, (char)119, (char)111, (char)114, (char)108, (char)100}}"},
+ // Arrays of char are formatted as string literals. Escape sequences are
+ // used for non-printable ASCII characters.
+ // FIXME: We should do the same for arrays of charN_t and wchar_t.
+ {"_ZTAXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEE", "template parameter object for S{\"hello world\"}"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc108ELc108ELc111EEEEEvv", "void f<Hello{\"Hello\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc108ELc111EEEEEvv", "void f<Hello{\"Helo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc0ELc108ELc111EEEEEvv", "void f<Hello{\"He\\0lo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc1ELc108ELc111EEEEEvv", "void f<Hello{\"He\\1lo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc6ELc108ELc111EEEEEvv", "void f<Hello{\"He\\6lo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc7ELc108ELc111EEEEEvv", "void f<Hello{\"He\\alo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc8ELc108ELc111EEEEEvv", "void f<Hello{\"He\\blo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc9ELc108ELc111EEEEEvv", "void f<Hello{\"He\\tlo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc10ELc108ELc111EEEEEvv", "void f<Hello{\"He\\nlo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc11ELc108ELc111EEEEEvv", "void f<Hello{\"He\\vlo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc12ELc108ELc111EEEEEvv", "void f<Hello{\"He\\flo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc13ELc108ELc111EEEEEvv", "void f<Hello{\"He\\rlo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc14ELc108ELc111EEEEEvv", "void f<Hello{\"He\\xElo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc15ELc108ELc111EEEEEvv", "void f<Hello{\"He\\xFlo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc16ELc108ELc111EEEEEvv", "void f<Hello{\"He\\x10lo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc34ELc108ELc111EEEEEvv", "void f<Hello{\"He\\\"lo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc72ELc101ELc92ELc108ELc111EEEEEvv", "void f<Hello{\"He\\\\lo\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc15ELc101ELc108ELc108ELc111EEEEEvv", "void f<Hello{\"\\xF\"\"ello\"}>()"},
+ {"_Z1fIXtl5HellotlA6_cLc240ELc159ELc152ELc138ELc33EEEEEvv", "void f<Hello{\"😊!\"}>()"},
+ // Even non-null-terminated strings get this treatment, even though this
+ // isn't valid C++ syntax to initialize an array of char.
+ {"_Z1fIXtl5HellotlA5_cLc72ELc101ELc108ELc108ELc111EEEEEvv", "void f<Hello{\"Hello\"}>()"},
// FIXME: This is wrong; the S2_ backref should expand to OT_ and then to
// "double&&". But we can't cope with a substitution that represents a
diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h
index 0af0224bc83fa8..401dc4f5a4878c 100644
--- a/llvm/include/llvm/Demangle/ItaniumDemangle.h
+++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h
@@ -156,6 +156,8 @@ template <class T, size_t N> class PODSmallVector {
}
};
+class NodeArray;
+
// Base class of all AST nodes. The AST is built by the parser, then is
// traversed by the printLeft/Right functions to produce a demangled string.
class Node {
@@ -293,6 +295,13 @@ class Node {
// implementation.
virtual void printRight(OutputBuffer &) const {}
+ // Print an initializer list of this type. Returns true if we printed a custom
+ // representation, false if nothing has been printed and the default
+ // representation should be used.
+ virtual bool printInitListAsType(OutputBuffer &, const NodeArray &) const {
+ return false;
+ }
+
virtual std::string_view getBaseName() const { return {}; }
// Silence compiler warnings, this dtor will never be called.
@@ -339,6 +348,10 @@ class NodeArray {
FirstElement = false;
}
}
+
+ // Print an array of integer literals as a string literal. Returns whether we
+ // could do so.
+ bool printAsString(OutputBuffer &OB) const;
};
struct NodeArrayNode : Node {
@@ -796,6 +809,15 @@ class ArrayType final : public Node {
OB += "]";
Base->printRight(OB);
}
+
+ bool printInitListAsType(OutputBuffer &OB,
+ const NodeArray &Elements) const override {
+ if (Base->getKind() == KNameType &&
+ static_cast<const NameType *>(Base)->getName() == "char") {
+ return Elements.printAsString(OB);
+ }
+ return false;
+ }
};
class FunctionType final : public Node {
@@ -2225,8 +2247,11 @@ class InitListExpr : public Node {
template<typename Fn> void match(Fn F) const { F(Ty, Inits); }
void printLeft(OutputBuffer &OB) const override {
- if (Ty)
+ if (Ty) {
+ if (Ty->printInitListAsType(OB, Inits))
+ return;
Ty->print(OB);
+ }
OB += '{';
Inits.printWithComma(OB);
OB += '}';
@@ -2433,6 +2458,8 @@ class IntegerLiteral : public Node {
if (Type.size() <= 3)
OB += Type;
}
+
+ std::string_view value() const { return Value; }
};
class RequiresExpr : public Node {
@@ -2604,6 +2631,93 @@ template<typename NodeT> struct NodeKind;
};
#include "ItaniumNodes.def"
+bool NodeArray::printAsString(OutputBuffer &OB) const {
+ auto Fail = [&OB, StartPos = OB.getCurrentPosition()] {
+ OB.setCurrentPosition(StartPos);
+ return false;
+ };
+
+ OB += '"';
+ bool LastWasNumericEscape = false;
+ for (const Node *Element : *this) {
+ if (Element->getKind() != Node::KIntegerLiteral)
+ return Fail();
+ int integer_value = 0;
+ for (char c : static_cast<const IntegerLiteral *>(Element)->value()) {
+ if (c < '0' || c > '9' || integer_value > 25)
+ return Fail();
+ integer_value *= 10;
+ integer_value += c - '0';
+ }
+ if (integer_value > 255)
+ return Fail();
+
+ // Insert a `""` to avoid accidentally extending a numeric escape.
+ if (LastWasNumericEscape) {
+ if ((integer_value >= '0' && integer_value <= '9') ||
+ (integer_value >= 'a' && integer_value <= 'f') ||
+ (integer_value >= 'A' && integer_value <= 'F')) {
+ OB += "\"\"";
+ }
+ }
+
+ LastWasNumericEscape = false;
+
+ // Determine how to print this character.
+ switch (integer_value) {
+ case '\a':
+ OB += "\\a";
+ break;
+ case '\b':
+ OB += "\\b";
+ break;
+ case '\f':
+ OB += "\\f";
+ break;
+ case '\n':
+ OB += "\\n";
+ break;
+ case '\r':
+ OB += "\\r";
+ break;
+ case '\t':
+ OB += "\\t";
+ break;
+ case '\v':
+ OB += "\\v";
+ break;
+
+ case '"':
+ OB += "\\\"";
+ break;
+ case '\\':
+ OB += "\\\\";
+ break;
+
+ default:
+ // We assume that the character is ASCII, and use a numeric escape for all
+ // remaining non-printable ASCII characters.
+ if (integer_value < 32 || integer_value == 127) {
+ constexpr char Hex[] = "0123456789ABCDEF";
+ OB += '\\';
+ if (integer_value > 7)
+ OB += 'x';
+ if (integer_value >= 16)
+ OB += Hex[integer_value >> 4];
+ OB += Hex[integer_value & 0xF];
+ LastWasNumericEscape = true;
+ break;
+ }
+
+ // Assume all remaining characters are directly printable.
+ OB += (char)integer_value;
+ break;
+ }
+ }
+ OB += '"';
+ return true;
+}
+
template <typename Derived, typename Alloc> struct AbstractManglingParser {
const char *First;
const char *Last;
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know the demangler very well, but I looked at the implementation and it makes sense to me. Others might have more strongly rooted opinions than me, so I'd suggest waiting for at least one other reviewer.
The functionality itself is amazing though, that greatly improves the output of the demangler.
@urnathan It looks like you've been working in the libc++abi demangler more than most folks recently, could you take a look? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good - thanks! (still happy to hear from @urnathan if he's got thoughts, but post-commit seems fine at this point)
…lvm#109021) This improves the demangling for non-type template arguments that contain string literals. Previously we'd produce char [4]{(char)65, (char)66, (char)67} (which isn't valid C or C++), and now we produce `"ABC"`. The new demangling is always shorter, even when using an escape sequence for every character, and much more readable when the char array contains text.
nice! |
This only sets the symbolizer for our more used targets; not sure if there's a great way to set it everywhere (I suppose I could try wrapping cc_binary etc rules if there's a strong preference). There is a downside here, symbolizing a fastbuild crash seems to take about 3s. Not sure if there's a good way to get a faster llvm-symbolizer execution...? I tried running with the new LLVM update without the LLVM_SYMBOLIZER_PATH, and it looks like that's insufficient. With the settings, I now get readable crashes: ``` #9 0x000055dc007d1724 void Carbon::Internal::CheckFail<Carbon::TemplateString<5>{"FATAL"}, Carbon::TemplateString<27>{"toolchain/driver/driver.cpp"}, 84, Carbon::TemplateString<0>{}, Carbon::TemplateString<3>{"err"}>() (/usr/local/google/home/jperkins/.cache/bazel/_bazel_jperkins/85deb7d9d96f7e0e80b42618a55969d7/sandbox/linux-sandbox/9383/execroot/_main/bazel-out/k8-fastbuild/bin/toolchain/testing/file_test.runfiles/_main/toolchain/testing/file_test+0x2894724) ``` Note the LLVM update is for llvm/llvm-project#109021
This only sets the symbolizer for our more used targets; not sure if there's a great way to set it everywhere (I suppose I could try wrapping cc_binary etc rules if there's a strong preference). There is a downside here, symbolizing a fastbuild crash seems to take about 3s. Not sure if there's a good way to get a faster llvm-symbolizer execution...? I tried running with the new LLVM update without the LLVM_SYMBOLIZER_PATH, and it looks like that's insufficient. With the settings, I now get readable crashes: ``` #9 0x000055dc007d1724 void Carbon::Internal::CheckFail<Carbon::TemplateString<5>{"FATAL"}, Carbon::TemplateString<27>{"toolchain/driver/driver.cpp"}, 84, Carbon::TemplateString<0>{}, Carbon::TemplateString<3>{"err"}>() (/usr/local/google/home/jperkins/.cache/bazel/_bazel_jperkins/85deb7d9d96f7e0e80b42618a55969d7/sandbox/linux-sandbox/9383/execroot/_main/bazel-out/k8-fastbuild/bin/toolchain/testing/file_test.runfiles/_main/toolchain/testing/file_test+0x2894724) ``` Note the LLVM update is for llvm/llvm-project#109021
This improves the demangling for non-type template arguments that
contain string literals. Previously we'd produce
(which isn't valid C or C++), and now we produce
"ABC"
.The new demangling is always shorter, even when using an escape sequence
for every character, and much more readable when the char array contains
text.