Skip to content

Commit 8c6f309

Browse files
authored
[clang] Introduce "binary" StringLiteral for #embed data (#127629)
StringLiteral is used as the internal data of EmbedExpr, and we use it directly as an initializer if a single EmbedExpr appears in the initializer list of a char array. This is fast and convenient, but it causes problems when string literal character values are checked, because #embed data values lie in the range [0, 2^(char width) - 1] while an ordinary StringLiteral has char type, which may be signed. This PR introduces a new kind of StringLiteral that holds binary data coming from an embedded resource to mitigate these problems. The new kind of StringLiteral is not assumed to have a signed char type. It also helps to prevent crashes when trying to find StringLiteral token locations, since these simply do not exist for binary data. Fixes #119256
1 parent 68180d8 commit 8c6f309

File tree

5 files changed

+42
-4
lines changed

5 files changed

+42
-4
lines changed

clang/include/clang/AST/Expr.h

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1756,7 +1756,14 @@ enum class StringLiteralKind {
17561756
UTF8,
17571757
UTF16,
17581758
UTF32,
1759-
Unevaluated
1759+
Unevaluated,
1760+
// Binary kind of string literal is used for the data coming via #embed
1761+
// directive. The file's binary contents are transformed into a special kind of
1762+
// string literal that in some cases may be used directly as an initializer
1763+
// and some features of classic string literals are not applicable to this
1764+
// kind of a string literal, for example finding a particular byte's source
1765+
// location for better diagnostics.
1766+
Binary
17601767
};
17611768

17621769
/// StringLiteral - This represents a string literal expression, e.g. "foo"
@@ -1888,6 +1895,8 @@ class StringLiteral final
18881895
int64_t getCodeUnitS(size_t I, uint64_t BitWidth) const {
18891896
int64_t V = getCodeUnit(I);
18901897
if (isOrdinary() || isWide()) {
1898+
// Ordinary and wide string literals have types that can be signed.
1899+
// It is important for checking C23 constexpr initializers.
18911900
unsigned Width = getCharByteWidth() * BitWidth;
18921901
llvm::APInt AInt(Width, (uint64_t)V);
18931902
V = AInt.getSExtValue();
@@ -5029,9 +5038,9 @@ class EmbedExpr final : public Expr {
50295038
assert(EExpr && CurOffset != ULLONG_MAX &&
50305039
"trying to dereference an invalid iterator");
50315040
IntegerLiteral *N = EExpr->FakeChildNode;
5032-
StringRef DataRef = EExpr->Data->BinaryData->getBytes();
50335041
N->setValue(*EExpr->Ctx,
5034-
llvm::APInt(N->getValue().getBitWidth(), DataRef[CurOffset],
5042+
llvm::APInt(N->getValue().getBitWidth(),
5043+
EExpr->Data->BinaryData->getCodeUnit(CurOffset),
50355044
N->getType()->isSignedIntegerType()));
50365045
// We want to return a reference to the fake child node in the
50375046
// EmbedExpr, not the local variable N.

clang/lib/AST/Expr.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1104,6 +1104,7 @@ unsigned StringLiteral::mapCharByteWidth(TargetInfo const &Target,
11041104
switch (SK) {
11051105
case StringLiteralKind::Ordinary:
11061106
case StringLiteralKind::UTF8:
1107+
case StringLiteralKind::Binary:
11071108
CharByteWidth = Target.getCharWidth();
11081109
break;
11091110
case StringLiteralKind::Wide:
@@ -1216,6 +1217,7 @@ void StringLiteral::outputString(raw_ostream &OS) const {
12161217
switch (getKind()) {
12171218
case StringLiteralKind::Unevaluated:
12181219
case StringLiteralKind::Ordinary:
1220+
case StringLiteralKind::Binary:
12191221
break; // no prefix.
12201222
case StringLiteralKind::Wide:
12211223
OS << 'L';
@@ -1332,6 +1334,11 @@ StringLiteral::getLocationOfByte(unsigned ByteNo, const SourceManager &SM,
13321334
const LangOptions &Features,
13331335
const TargetInfo &Target, unsigned *StartToken,
13341336
unsigned *StartTokenByteOffset) const {
1337+
// No source location of bytes for binary literals since they don't come from
1338+
// source.
1339+
if (getKind() == StringLiteralKind::Binary)
1340+
return getStrTokenLoc(0);
1341+
13351342
assert((getKind() == StringLiteralKind::Ordinary ||
13361343
getKind() == StringLiteralKind::UTF8 ||
13371344
getKind() == StringLiteralKind::Unevaluated) &&

clang/lib/Parse/ParseInit.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,7 @@ ExprResult Parser::createEmbedExpr() {
445445
Context.MakeIntValue(Str.size(), Context.getSizeType());
446446
QualType ArrayTy = Context.getConstantArrayType(
447447
Ty, ArraySize, nullptr, ArraySizeModifier::Normal, 0);
448-
return StringLiteral::Create(Context, Str, StringLiteralKind::Ordinary,
448+
return StringLiteral::Create(Context, Str, StringLiteralKind::Binary,
449449
false, ArrayTy, StartLoc);
450450
};
451451

clang/lib/Sema/SemaInit.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ static StringInitFailureKind IsStringInit(Expr *Init, const ArrayType *AT,
106106
return SIF_None;
107107
[[fallthrough]];
108108
case StringLiteralKind::Ordinary:
109+
case StringLiteralKind::Binary:
109110
// char array can be initialized with a narrow string.
110111
// Only allow char x[] = "foo"; not char x[] = L"foo";
111112
if (ElemTy->isCharType())
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%S/Inputs -verify -std=c23
2+
3+
static constexpr unsigned char data[] = {
4+
#embed "big_char.txt"
5+
};
6+
7+
static constexpr char data1[] = {
8+
#embed "big_char.txt" // expected-error {{constexpr initializer evaluates to 255 which is not exactly representable in type 'const char'}}
9+
};
10+
11+
static constexpr int data2[] = {
12+
#embed "big_char.txt"
13+
};
14+
15+
static constexpr unsigned data3[] = {
16+
#embed "big_char.txt" suffix(, -1) // expected-error {{constexpr initializer evaluates to -1 which is not exactly representable in type 'const unsigned int'}}
17+
};
18+
19+
static constexpr int data4[] = {
20+
#embed "big_char.txt" suffix(, -1)
21+
};

0 commit comments

Comments
 (0)