Skip to content

Commit d6082d4

Browse files
zygoloid authored and tmsri committed
[demangle] Represent a char array initializer as a string literal. (llvm#109021)
This improves the demangling for non-type template arguments that contain string literals. Previously we'd produce `char [4]{(char)65, (char)66, (char)67}` (which isn't valid C or C++), and now we produce `"ABC"`. The new demangling is always shorter, even when using an escape sequence for every character, and much more readable when the char array contains text.
1 parent 80649a8 commit d6082d4

File tree

3 files changed

+256
-3
lines changed

3 files changed

+256
-3
lines changed

libcxxabi/src/demangle/ItaniumDemangle.h

Lines changed: 115 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,8 @@ template <class T, size_t N> class PODSmallVector {
156156
}
157157
};
158158

159+
class NodeArray;
160+
159161
// Base class of all AST nodes. The AST is built by the parser, then is
160162
// traversed by the printLeft/Right functions to produce a demangled string.
161163
class Node {
@@ -293,6 +295,13 @@ class Node {
293295
// implementation.
294296
virtual void printRight(OutputBuffer &) const {}
295297

298+
// Print an initializer list of this type. Returns true if we printed a custom
299+
// representation, false if nothing has been printed and the default
300+
// representation should be used.
// This base implementation ignores both arguments and always returns false;
// node types that have a custom representation override it.
301+
virtual bool printInitListAsType(OutputBuffer &, const NodeArray &) const {
302+
return false;
303+
}
304+
296305
virtual std::string_view getBaseName() const { return {}; }
297306

298307
// Silence compiler warnings, this dtor will never be called.
@@ -339,6 +348,10 @@ class NodeArray {
339348
FirstElement = false;
340349
}
341350
}
351+
352+
// Print an array of integer literals as a string literal. Returns whether we
353+
// could do so.
354+
bool printAsString(OutputBuffer &OB) const;
342355
};
343356

344357
struct NodeArrayNode : Node {
@@ -796,6 +809,15 @@ class ArrayType final : public Node {
796809
OB += "]";
797810
Base->printRight(OB);
798811
}
812+
813+
// Custom initializer-list printing for arrays. When the element type (Base)
// is a NameType whose name is exactly "char", the elements are handed to
// NodeArray::printAsString and its result is returned. For every other
// element type this prints nothing and returns false.
bool printInitListAsType(OutputBuffer &OB,
814+
const NodeArray &Elements) const override {
815+
if (Base->getKind() == KNameType &&
816+
static_cast<const NameType *>(Base)->getName() == "char") {
817+
return Elements.printAsString(OB);
818+
}
819+
return false;
820+
}
799821
};
800822

801823
class FunctionType final : public Node {
@@ -2225,8 +2247,11 @@ class InitListExpr : public Node {
22252247
template<typename Fn> void match(Fn F) const { F(Ty, Inits); }
22262248

22272249
void printLeft(OutputBuffer &OB) const override {
2228-
if (Ty)
2250+
if (Ty) {
2251+
if (Ty->printInitListAsType(OB, Inits))
2252+
return;
22292253
Ty->print(OB);
2254+
}
22302255
OB += '{';
22312256
Inits.printWithComma(OB);
22322257
OB += '}';
@@ -2433,6 +2458,8 @@ class IntegerLiteral : public Node {
24332458
if (Type.size() <= 3)
24342459
OB += Type;
24352460
}
2461+
2462+
// Returns the stored Value member as-is, as a string_view.
std::string_view value() const { return Value; }
24362463
};
24372464

24382465
class RequiresExpr : public Node {
@@ -2604,6 +2631,93 @@ template<typename NodeT> struct NodeKind;
26042631
};
26052632
#include "ItaniumNodes.def"
26062633

2634+
// Prints this array as a double-quoted string literal, one character per
// element. Every element must be an IntegerLiteral whose value() consists only
// of decimal digits and is no greater than 255. If any element does not
// qualify, the output position is reset to where it was on entry and false is
// returned. Returns true after the closing quote has been appended.
inline bool NodeArray::printAsString(OutputBuffer &OB) const {
2635+
// Failure path: restore the output position captured here (presumably
// dropping whatever was appended since) and report false.
auto Fail = [&OB, StartPos = OB.getCurrentPosition()] {
2636+
OB.setCurrentPosition(StartPos);
2637+
return false;
2638+
};
2639+
2640+
OB += '"';
2641+
bool LastWasNumericEscape = false;
2642+
for (const Node *Element : *this) {
2643+
if (Element->getKind() != Node::KIntegerLiteral)
2644+
return Fail();
2645+
int integer_value = 0;
2646+
for (char c : static_cast<const IntegerLiteral *>(Element)->value()) {
2647+
// Any non-digit character fails. Bailing out once the running value
// exceeds 25 keeps the multiply-and-add below at 259 or less, so
// integer_value cannot overflow; the exact range check follows the loop.
if (c < '0' || c > '9' || integer_value > 25)
2648+
return Fail();
2649+
integer_value *= 10;
2650+
integer_value += c - '0';
2651+
}
2652+
if (integer_value > 255)
2653+
return Fail();
2654+
2655+
// Insert a `""` to avoid accidentally extending a numeric escape.
// The test covers every hex digit, so it also fires after a single-digit
// octal escape that is followed by '8', '9' or a letter a-f/A-F.
2656+
if (LastWasNumericEscape) {
2657+
if ((integer_value >= '0' && integer_value <= '9') ||
2658+
(integer_value >= 'a' && integer_value <= 'f') ||
2659+
(integer_value >= 'A' && integer_value <= 'F')) {
2660+
OB += "\"\"";
2661+
}
2662+
}
2663+
2664+
LastWasNumericEscape = false;
2665+
2666+
// Determine how to print this character.
2667+
switch (integer_value) {
2668+
case '\a':
2669+
OB += "\\a";
2670+
break;
2671+
case '\b':
2672+
OB += "\\b";
2673+
break;
2674+
case '\f':
2675+
OB += "\\f";
2676+
break;
2677+
case '\n':
2678+
OB += "\\n";
2679+
break;
2680+
case '\r':
2681+
OB += "\\r";
2682+
break;
2683+
case '\t':
2684+
OB += "\\t";
2685+
break;
2686+
case '\v':
2687+
OB += "\\v";
2688+
break;
2689+
2690+
case '"':
2691+
OB += "\\\"";
2692+
break;
2693+
case '\\':
2694+
OB += "\\\\";
2695+
break;
2696+
2697+
default:
2698+
// We assume that the character is ASCII, and use a numeric escape for all
2699+
// remaining non-printable ASCII characters.
// Values 0-7 that reach this branch are written as a backslash plus one
// digit (an octal escape); larger values are written as \x followed by
// one or two uppercase hex digits.
2700+
if (integer_value < 32 || integer_value == 127) {
2701+
constexpr char Hex[] = "0123456789ABCDEF";
2702+
OB += '\\';
2703+
if (integer_value > 7)
2704+
OB += 'x';
2705+
if (integer_value >= 16)
2706+
OB += Hex[integer_value >> 4];
2707+
OB += Hex[integer_value & 0xF];
2708+
LastWasNumericEscape = true;
2709+
break;
2710+
}
2711+
2712+
// Assume all remaining characters are directly printable.
// Values 128-255 also land here and are appended as a single raw byte.
2713+
OB += (char)integer_value;
2714+
break;
2715+
}
2716+
}
2717+
OB += '"';
2718+
return true;
2719+
}
2720+
26072721
template <typename Derived, typename Alloc> struct AbstractManglingParser {
26082722
const char *First;
26092723
const char *Last;

libcxxabi/test/test_demangle.pass.cpp

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30037,7 +30037,32 @@ const char* cases[][2] =
3003730037
// FIXME: This is not valid pointer-to-member syntax.
3003830038
{"_Z1fIXtl1DmcM7DerivedKiadL_ZN11MoreDerived1zEEn8EEEEvv", "void f<D{(int const Derived::*)(&MoreDerived::z)}>()"},
3003930039
{"_Z1fIXtl1Edi1nLi42EEEEvv", "void f<E{.n = 42}>()"},
30040-
{"_ZTAXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEE", "template parameter object for S{char [32]{(char)104, (char)101, (char)108, (char)108, (char)111, (char)32, (char)119, (char)111, (char)114, (char)108, (char)100}}"},
30040+
// Arrays of char are formatted as string literals. Escape sequences are
30041+
// used for non-printable ASCII characters.
30042+
// FIXME: We should do the same for arrays of charN_t and wchar_t.
30043+
{"_ZTAXtl1StlA32_cLc104ELc101ELc108ELc108ELc111ELc32ELc119ELc111ELc114ELc108ELc100EEEE", "template parameter object for S{\"hello world\"}"},
30044+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc108ELc108ELc111EEEEEvv", "void f<Hello{\"Hello\"}>()"},
30045+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc108ELc111EEEEEvv", "void f<Hello{\"Helo\"}>()"},
30046+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc0ELc108ELc111EEEEEvv", "void f<Hello{\"He\\0lo\"}>()"},
30047+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc1ELc108ELc111EEEEEvv", "void f<Hello{\"He\\1lo\"}>()"},
30048+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc6ELc108ELc111EEEEEvv", "void f<Hello{\"He\\6lo\"}>()"},
30049+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc7ELc108ELc111EEEEEvv", "void f<Hello{\"He\\alo\"}>()"},
30050+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc8ELc108ELc111EEEEEvv", "void f<Hello{\"He\\blo\"}>()"},
30051+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc9ELc108ELc111EEEEEvv", "void f<Hello{\"He\\tlo\"}>()"},
30052+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc10ELc108ELc111EEEEEvv", "void f<Hello{\"He\\nlo\"}>()"},
30053+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc11ELc108ELc111EEEEEvv", "void f<Hello{\"He\\vlo\"}>()"},
30054+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc12ELc108ELc111EEEEEvv", "void f<Hello{\"He\\flo\"}>()"},
30055+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc13ELc108ELc111EEEEEvv", "void f<Hello{\"He\\rlo\"}>()"},
30056+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc14ELc108ELc111EEEEEvv", "void f<Hello{\"He\\xElo\"}>()"},
30057+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc15ELc108ELc111EEEEEvv", "void f<Hello{\"He\\xFlo\"}>()"},
30058+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc16ELc108ELc111EEEEEvv", "void f<Hello{\"He\\x10lo\"}>()"},
30059+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc34ELc108ELc111EEEEEvv", "void f<Hello{\"He\\\"lo\"}>()"},
30060+
{"_Z1fIXtl5HellotlA6_cLc72ELc101ELc92ELc108ELc111EEEEEvv", "void f<Hello{\"He\\\\lo\"}>()"},
30061+
{"_Z1fIXtl5HellotlA6_cLc15ELc101ELc108ELc108ELc111EEEEEvv", "void f<Hello{\"\\xF\"\"ello\"}>()"},
30062+
{"_Z1fIXtl5HellotlA6_cLc240ELc159ELc152ELc138ELc33EEEEEvv", "void f<Hello{\"😊!\"}>()"},
30063+
// Even non-null-terminated strings get this treatment, even though this
30064+
// isn't valid C++ syntax to initialize an array of char.
30065+
{"_Z1fIXtl5HellotlA5_cLc72ELc101ELc108ELc108ELc111EEEEEvv", "void f<Hello{\"Hello\"}>()"},
3004130066

3004230067
// FIXME: This is wrong; the S2_ backref should expand to OT_ and then to
3004330068
// "double&&". But we can't cope with a substitution that represents a

llvm/include/llvm/Demangle/ItaniumDemangle.h

Lines changed: 115 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,8 @@ template <class T, size_t N> class PODSmallVector {
156156
}
157157
};
158158

159+
class NodeArray;
160+
159161
// Base class of all AST nodes. The AST is built by the parser, then is
160162
// traversed by the printLeft/Right functions to produce a demangled string.
161163
class Node {
@@ -293,6 +295,13 @@ class Node {
293295
// implementation.
294296
virtual void printRight(OutputBuffer &) const {}
295297

298+
// Print an initializer list of this type. Returns true if we printed a custom
299+
// representation, false if nothing has been printed and the default
300+
// representation should be used.
// This base implementation ignores both arguments and always returns false;
// node types that have a custom representation override it.
301+
virtual bool printInitListAsType(OutputBuffer &, const NodeArray &) const {
302+
return false;
303+
}
304+
296305
virtual std::string_view getBaseName() const { return {}; }
297306

298307
// Silence compiler warnings, this dtor will never be called.
@@ -339,6 +348,10 @@ class NodeArray {
339348
FirstElement = false;
340349
}
341350
}
351+
352+
// Print an array of integer literals as a string literal. Returns whether we
353+
// could do so.
354+
bool printAsString(OutputBuffer &OB) const;
342355
};
343356

344357
struct NodeArrayNode : Node {
@@ -796,6 +809,15 @@ class ArrayType final : public Node {
796809
OB += "]";
797810
Base->printRight(OB);
798811
}
812+
813+
// Custom initializer-list printing for arrays. When the element type (Base)
// is a NameType whose name is exactly "char", the elements are handed to
// NodeArray::printAsString and its result is returned. For every other
// element type this prints nothing and returns false.
bool printInitListAsType(OutputBuffer &OB,
814+
const NodeArray &Elements) const override {
815+
if (Base->getKind() == KNameType &&
816+
static_cast<const NameType *>(Base)->getName() == "char") {
817+
return Elements.printAsString(OB);
818+
}
819+
return false;
820+
}
799821
};
800822

801823
class FunctionType final : public Node {
@@ -2225,8 +2247,11 @@ class InitListExpr : public Node {
22252247
template<typename Fn> void match(Fn F) const { F(Ty, Inits); }
22262248

22272249
void printLeft(OutputBuffer &OB) const override {
2228-
if (Ty)
2250+
if (Ty) {
2251+
if (Ty->printInitListAsType(OB, Inits))
2252+
return;
22292253
Ty->print(OB);
2254+
}
22302255
OB += '{';
22312256
Inits.printWithComma(OB);
22322257
OB += '}';
@@ -2433,6 +2458,8 @@ class IntegerLiteral : public Node {
24332458
if (Type.size() <= 3)
24342459
OB += Type;
24352460
}
2461+
2462+
// Returns the stored Value member as-is, as a string_view.
std::string_view value() const { return Value; }
24362463
};
24372464

24382465
class RequiresExpr : public Node {
@@ -2604,6 +2631,93 @@ template<typename NodeT> struct NodeKind;
26042631
};
26052632
#include "ItaniumNodes.def"
26062633

2634+
// Prints this array as a double-quoted string literal, one character per
// element. Every element must be an IntegerLiteral whose value() consists only
// of decimal digits and is no greater than 255. If any element does not
// qualify, the output position is reset to where it was on entry and false is
// returned. Returns true after the closing quote has been appended.
inline bool NodeArray::printAsString(OutputBuffer &OB) const {
2635+
// Failure path: restore the output position captured here (presumably
// dropping whatever was appended since) and report false.
auto Fail = [&OB, StartPos = OB.getCurrentPosition()] {
2636+
OB.setCurrentPosition(StartPos);
2637+
return false;
2638+
};
2639+
2640+
OB += '"';
2641+
bool LastWasNumericEscape = false;
2642+
for (const Node *Element : *this) {
2643+
if (Element->getKind() != Node::KIntegerLiteral)
2644+
return Fail();
2645+
int integer_value = 0;
2646+
for (char c : static_cast<const IntegerLiteral *>(Element)->value()) {
2647+
// Any non-digit character fails. Bailing out once the running value
// exceeds 25 keeps the multiply-and-add below at 259 or less, so
// integer_value cannot overflow; the exact range check follows the loop.
if (c < '0' || c > '9' || integer_value > 25)
2648+
return Fail();
2649+
integer_value *= 10;
2650+
integer_value += c - '0';
2651+
}
2652+
if (integer_value > 255)
2653+
return Fail();
2654+
2655+
// Insert a `""` to avoid accidentally extending a numeric escape.
// The test covers every hex digit, so it also fires after a single-digit
// octal escape that is followed by '8', '9' or a letter a-f/A-F.
2656+
if (LastWasNumericEscape) {
2657+
if ((integer_value >= '0' && integer_value <= '9') ||
2658+
(integer_value >= 'a' && integer_value <= 'f') ||
2659+
(integer_value >= 'A' && integer_value <= 'F')) {
2660+
OB += "\"\"";
2661+
}
2662+
}
2663+
2664+
LastWasNumericEscape = false;
2665+
2666+
// Determine how to print this character.
2667+
switch (integer_value) {
2668+
case '\a':
2669+
OB += "\\a";
2670+
break;
2671+
case '\b':
2672+
OB += "\\b";
2673+
break;
2674+
case '\f':
2675+
OB += "\\f";
2676+
break;
2677+
case '\n':
2678+
OB += "\\n";
2679+
break;
2680+
case '\r':
2681+
OB += "\\r";
2682+
break;
2683+
case '\t':
2684+
OB += "\\t";
2685+
break;
2686+
case '\v':
2687+
OB += "\\v";
2688+
break;
2689+
2690+
case '"':
2691+
OB += "\\\"";
2692+
break;
2693+
case '\\':
2694+
OB += "\\\\";
2695+
break;
2696+
2697+
default:
2698+
// We assume that the character is ASCII, and use a numeric escape for all
2699+
// remaining non-printable ASCII characters.
// Values 0-7 that reach this branch are written as a backslash plus one
// digit (an octal escape); larger values are written as \x followed by
// one or two uppercase hex digits.
2700+
if (integer_value < 32 || integer_value == 127) {
2701+
constexpr char Hex[] = "0123456789ABCDEF";
2702+
OB += '\\';
2703+
if (integer_value > 7)
2704+
OB += 'x';
2705+
if (integer_value >= 16)
2706+
OB += Hex[integer_value >> 4];
2707+
OB += Hex[integer_value & 0xF];
2708+
LastWasNumericEscape = true;
2709+
break;
2710+
}
2711+
2712+
// Assume all remaining characters are directly printable.
// Values 128-255 also land here and are appended as a single raw byte.
2713+
OB += (char)integer_value;
2714+
break;
2715+
}
2716+
}
2717+
OB += '"';
2718+
return true;
2719+
}
2720+
26072721
template <typename Derived, typename Alloc> struct AbstractManglingParser {
26082722
const char *First;
26092723
const char *Last;

0 commit comments

Comments
 (0)