Skip to content

Commit 8842313

Browse files
committed
Preserve metadata and address other review comments
1 parent 2c93413 commit 8842313

File tree

5 files changed

+62
-33
lines changed

5 files changed

+62
-33
lines changed

llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1319,23 +1319,36 @@ void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
13191319
if (EQClasses.size() < 2)
13201320
return;
13211321

1322+
auto CopyMetaDataFromTo = [&](Instruction *Src, Instruction *Dst) {
1323+
SmallVector<std::pair<unsigned, MDNode *>, 4> MD;
1324+
Src->getAllMetadata(MD);
1325+
for (const auto [ID, Node] : MD) {
1326+
Dst->setMetadata(ID, Node);
1327+
}
1328+
};
1329+
13221330
// For each class, determine if all instructions are of type int, FP or ptr.
13231331
// This information will help us determine the type instructions should be
13241332
// cast into.
13251333
MapVector<EqClassKey, Bitset<3>> ClassAllTy;
1326-
for (auto C : EQClasses) {
1327-
if (all_of(EQClasses[C.first], [](Instruction *I) {
1328-
return I->getType()->isIntOrIntVectorTy();
1329-
}))
1330-
ClassAllTy[C.first].set(0);
1331-
else if (all_of(EQClasses[C.first], [](Instruction *I) {
1332-
return I->getType()->isFPOrFPVectorTy();
1333-
}))
1334-
ClassAllTy[C.first].set(1);
1335-
else if (all_of(EQClasses[C.first], [](Instruction *I) {
1336-
return I->getType()->isPtrOrPtrVectorTy();
1337-
}))
1338-
ClassAllTy[C.first].set(2);
1334+
for (const auto &C : EQClasses) {
1335+
auto CommonTypeKind = [](Instruction *I) {
1336+
if (I->getType()->isIntOrIntVectorTy())
1337+
return 0;
1338+
if (I->getType()->isFPOrFPVectorTy())
1339+
return 1;
1340+
if (I->getType()->isPtrOrPtrVectorTy())
1341+
return 2;
1342+
return -1; // Invalid type kind
1343+
};
1344+
1345+
int FirstTypeKind = CommonTypeKind(EQClasses[C.first][0]);
1346+
if (FirstTypeKind != -1 &&
1347+
all_of(EQClasses[C.first], [&](Instruction *I) {
1348+
return CommonTypeKind(I) == FirstTypeKind;
1349+
})) {
1350+
ClassAllTy[C.first].set(FirstTypeKind);
1351+
}
13391352
}
13401353

13411354
// Loop over all equivalence classes and try to merge them. Keep track of
@@ -1359,6 +1372,11 @@ void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
13591372
if (Ptr1 != Ptr2 || AS1 != AS2 || IsLoad1 != IsLoad2 || TySize1 < TySize2)
13601373
continue;
13611374

1375+
// An all-FP class (EC1) only accepts another all-FP class, and an all-pointer class (EC2) is only merged into another all-pointer class.
1376+
if ((ClassAllTy[EC1.first].test(1) && !ClassAllTy[EC2.first].test(1)) ||
1377+
(!ClassAllTy[EC1.first].test(2) && ClassAllTy[EC2.first].test(2)))
1378+
continue;
1379+
13621380
// Ensure all instructions in EC2 can be bitcasted into NewTy.
13631381
/// TODO: NewTyBits is needed as structured bindings cannot be
13641382
/// captured by a lambda until C++20.
@@ -1384,8 +1402,8 @@ void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
13841402
}
13851403

13861404
for (auto *Inst : EC2.second) {
1387-
auto *Ptr = getLoadStorePointerOperand(Inst);
1388-
auto *OrigTy = Inst->getType();
1405+
Value *Ptr = getLoadStorePointerOperand(Inst);
1406+
Type *OrigTy = Inst->getType();
13891407
if (OrigTy == NewTy)
13901408
continue;
13911409
if (auto *LI = dyn_cast<LoadInst>(Inst)) {
@@ -1404,6 +1422,7 @@ void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
14041422
SI->getValueOperand()->getName() + ".cast");
14051423
auto *NewStore = Builder.CreateStore(
14061424
Cast, getLoadStorePointerOperand(SI), SI->isVolatile());
1425+
CopyMetaDataFromTo(SI, NewStore);
14071426
SI->eraseFromParent();
14081427
EQClasses[EC1.first].emplace_back(NewStore);
14091428
}
@@ -1413,7 +1432,7 @@ void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
14131432
// basic block. This is important to ensure that the instructions are
14141433
// vectorized in the correct order.
14151434
std::sort(EQClasses[EC1.first].begin(), EQClasses[EC1.first].end(),
1416-
[](Instruction *A, Instruction *B) {
1435+
[](const Instruction *A, const Instruction *B) {
14171436
return A && B && A->comesBefore(B);
14181437
});
14191438
ClassesToErase.insert(EC2.first);

llvm/test/CodeGen/AMDGPU/bitop3.ll

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,15 @@ define amdgpu_ps float @and_and_not_and(i32 %a, i32 %b, i32 %c) {
113113
}
114114

115115
define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) {
116-
; GCN-LABEL: and_and_and:
117-
; GCN: ; %bb.0:
118-
; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
119-
; GCN-NEXT: ; return to shader part epilog
116+
; GFX950-SDAG-LABEL: and_and_and:
117+
; GFX950-SDAG: ; %bb.0:
118+
; GFX950-SDAG-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
119+
; GFX950-SDAG-NEXT: ; return to shader part epilog
120+
;
121+
; GFX950-GISEL-LABEL: and_and_and:
122+
; GFX950-GISEL: ; %bb.0:
123+
; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
124+
; GFX950-GISEL-NEXT: ; return to shader part epilog
120125
%and1 = and i32 %a, %c
121126
%and2 = and i32 %and1, %b
122127
%ret_cast = bitcast i32 %and2 to float
@@ -126,10 +131,15 @@ define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) {
126131
; ========= Multi bit functions =========
127132

128133
define amdgpu_ps float @test_12(i32 %a, i32 %b) {
129-
; GCN-LABEL: test_12:
130-
; GCN: ; %bb.0:
131-
; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
132-
; GCN-NEXT: ; return to shader part epilog
134+
; GFX950-SDAG-LABEL: test_12:
135+
; GFX950-SDAG: ; %bb.0:
136+
; GFX950-SDAG-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
137+
; GFX950-SDAG-NEXT: ; return to shader part epilog
138+
;
139+
; GFX950-GISEL-LABEL: test_12:
140+
; GFX950-GISEL: ; %bb.0:
141+
; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
142+
; GFX950-GISEL-NEXT: ; return to shader part epilog
133143
%nota = xor i32 %a, -1
134144
%and1 = and i32 %nota, %b
135145
%ret_cast = bitcast i32 %and1 to float

llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ define i32 @v_or_i32_disjoint(i32 %a, i32 %b) {
4848
; CHECK-NEXT: {{ $}}
4949
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
5050
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
51-
; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec
52-
; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
51+
; CHECK-NEXT: %10:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec
52+
; CHECK-NEXT: $vgpr0 = COPY %10
5353
; CHECK-NEXT: SI_RETURN implicit $vgpr0
5454
%result = or disjoint i32 %a, %b
5555
ret i32 %result
@@ -64,10 +64,10 @@ define <2 x i32> @v_or_v2i32_disjoint(<2 x i32> %a, <2 x i32> %b) {
6464
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
6565
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
6666
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
67-
; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
68-
; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
69-
; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]]
70-
; CHECK-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]]
67+
; CHECK-NEXT: %12:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
68+
; CHECK-NEXT: %13:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
69+
; CHECK-NEXT: $vgpr0 = COPY %12
70+
; CHECK-NEXT: $vgpr1 = COPY %13
7171
; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
7272
%result = or disjoint <2 x i32> %a, %b
7373
ret <2 x i32> %result

llvm/test/CodeGen/AMDGPU/divrem24-assume.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
define amdgpu_kernel void @divrem24_assume(ptr addrspace(1) %arg, i32 %arg1) {
55
; CHECK-LABEL: @divrem24_assume(
66
; CHECK-NEXT: bb:
7-
; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG0:![0-9]+]]
7+
; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
88
; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[ARG1:%.*]], 42
99
; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]])
1010
; CHECK-NEXT: [[TMP0:%.*]] = uitofp i32 [[TMP]] to float

llvm/test/CodeGen/NVPTX/dag-cse.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
; Verify that loads with different memory types are not subject to CSE
1010
; once they are promoted to the same type.
1111
;
12-
; CHECK: ld.global.u16 %[[B1:rs[0-9]+]], [a];
13-
; CHECK: st.global.u16 [b], %[[B1]];
12+
; CHECK: ld.global.v2.u8 {%[[B1:rs[0-9]+]], %[[B2:rs[0-9]+]]}, [a];
13+
; CHECK: st.global.v2.u8 [b], {%[[B1]], %[[B2]]};
1414
;
1515
; CHECK: ld.global.u32 %[[C:r[0-9]+]], [a];
1616
; CHECK: st.global.u32 [c], %[[C]];

0 commit comments

Comments
 (0)