Commit 45759fe

[AMDGPU] Filter candidates of LiveRegOptimizer for profitable cases (#124624)
It is known that vectors whose elements fit in i16 will be split and scalarized in SelectionDAG's type legalizer (see SIISelLowering::getPreferredVectorAction). LRO attempts to undo that scalarization across basic-block boundaries and shoehorn the values into packed VGPRs. LRO is beneficial for operations that natively work on illegal vector types, since it prevents flip-flopping between unpacked and packed forms. If we know the operations on a vector will be split and scalarized anyway, we don't want to shoehorn it back into a packed VGPR. Operations known to work natively on illegal vector types usually come in the form of intrinsics (MFMA, DOT8), buffer stores, shuffles, and phi nodes, to name a few.
1 parent aea7403 commit 45759fe
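
For illustration, here is a minimal IR sketch of the profitable case (the kernel, value names, and control flow are hypothetical, not taken from this patch or its tests): a <4 x i8> defined in one block reaches, through a look-through bitcast, an MFMA intrinsic in another block that consumes the packed 8-bit data natively, so keeping the value packed in an i32 VGPR avoids unpack/repack traffic.

```llvm
define amdgpu_kernel void @profitable(ptr addrspace(1) %in, i32 %b,
                                      <4 x i32> %c, i1 %cc,
                                      ptr addrspace(1) %out) {
entry:
  %a = load <4 x i8>, ptr addrspace(1) %in, align 4
  br i1 %cc, label %use, label %exit

use:                                              ; cross-block user of %a
  ; The bitcast is a look-through cast, so the profitability walk continues
  ; through it to the MFMA, which is on the native-intrinsic list.
  %a.packed = bitcast <4 x i8> %a to i32
  %mai = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a.packed, i32 %b,
                                                         <4 x i32> %c,
                                                         i32 0, i32 0, i32 0)
  store <4 x i32> %mai, ptr addrspace(1) %out, align 16
  br label %exit

exit:
  ret void
}

declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
```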

File tree

5 files changed (+356, -220 lines)


llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

Lines changed: 130 additions & 1 deletion
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -75,6 +76,7 @@ class LiveRegOptimizer {
   Module &Mod;
   const DataLayout &DL;
   const GCNSubtarget &ST;
+
   /// The scalar type to convert to
   Type *const ConvertToScalar;
   /// The set of visited Instructions
@@ -125,6 +127,131 @@ class LiveRegOptimizer {
     return LK.first != TargetLoweringBase::TypeLegal;
   }
 
+  /// Check if intrinsic natively operates on 8-bit or 16-bit
+  bool isNativeIntrinsic(Intrinsic::ID ID) {
+    switch (ID) {
+    case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
+    case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
+    case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
+    case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
+    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
+    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
+    case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
+    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
+    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
+    case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
+    case Intrinsic::amdgcn_raw_buffer_store_format:
+    case Intrinsic::amdgcn_raw_buffer_store:
+    case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
+    case Intrinsic::amdgcn_raw_ptr_buffer_store:
+    case Intrinsic::amdgcn_struct_buffer_store_format:
+    case Intrinsic::amdgcn_struct_buffer_store:
+    case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
+    case Intrinsic::amdgcn_struct_ptr_buffer_store:
+    case Intrinsic::amdgcn_raw_tbuffer_store:
+    case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
+    case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
+    case Intrinsic::amdgcn_struct_tbuffer_store:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  bool isOpLegal(Instruction *I) {
+    if (const auto *Intr = dyn_cast<IntrinsicInst>(I)) {
+      Intrinsic::ID ID = Intr->getIntrinsicID();
+      if (isNativeIntrinsic(ID))
+        return true;
+    }
+    // Stores
+    if (isa<StoreInst>(I))
+      return true;
+    return false;
+  }
+
+  bool isCoercionProfitable(Instruction *II) {
+    SmallPtrSet<Instruction *, 4> CVisited;
+    SmallVector<Instruction *, 4> UserList;
+
+    // Check users for profitable conditions (across block user which can
+    // natively handle the illegal vector).
+    for (User *V : II->users())
+      if (auto *UseInst = dyn_cast<Instruction>(V))
+        UserList.push_back(UseInst);
+
+    auto IsLookThru = [](Instruction *II) {
+      if (const auto *Intr = dyn_cast<IntrinsicInst>(II))
+        return Intr->getIntrinsicID() == Intrinsic::amdgcn_perm;
+      return isa<PHINode>(II) || isa<ShuffleVectorInst>(II) ||
+             isa<InsertElementInst>(II) || isa<ExtractElementInst>(II) ||
+             isa<CastInst>(II);
+    };
+
+    while (!UserList.empty()) {
+      auto CII = UserList.pop_back_val();
+      if (!CVisited.insert(CII).second)
+        continue;
+
+      if (CII->getParent() == II->getParent() && !IsLookThru(II))
+        continue;
+
+      if (isOpLegal(CII))
+        return true;
+
+      if (IsLookThru(CII))
+        for (User *V : CII->users())
+          if (auto *UseInst = dyn_cast<Instruction>(V))
+            UserList.push_back(UseInst);
+    }
+    return false;
+  }
+
   LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
       : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
         ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
@@ -259,6 +386,9 @@ bool LiveRegOptimizer::optimizeLiveType(
     if (!shouldReplace(II->getType()))
       continue;
 
+    if (!isCoercionProfitable(II))
+      continue;
+
     if (PHINode *Phi = dyn_cast<PHINode>(II)) {
       PhiNodes.insert(Phi);
       // Collect all the incoming values of problematic PHI nodes.
@@ -478,7 +608,6 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
 PreservedAnalyses
 AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-
   AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
   UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);

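Conversely, a sketch of the kind of candidate the new filter rejects (again with hypothetical names, not from this patch's tests): the only cross-block users scalarize the vector anyway, so coercing it into a packed i32 VGPR would merely add pack/unpack instructions around values the type legalizer splits regardless.

```llvm
define amdgpu_kernel void @unprofitable(ptr addrspace(1) %in, i1 %cc,
                                        ptr addrspace(1) %out) {
entry:
  %v = load <4 x i8>, ptr addrspace(1) %in, align 4
  br i1 %cc, label %use, label %exit

use:                                              ; cross-block users of %v
  ; extractelement is only a look-through; the walk continues to the add,
  ; which is neither a store nor a native intrinsic and is not itself a
  ; look-through, so the walk stops and isCoercionProfitable returns false.
  %e0 = extractelement <4 x i8> %v, i64 0
  %e1 = extractelement <4 x i8> %v, i64 1
  %s = add i8 %e0, %e1
  store i8 %s, ptr addrspace(1) %out, align 1
  br label %exit

exit:
  ret void
}
```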