@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -75,6 +76,7 @@ class LiveRegOptimizer {
   Module &Mod;
   const DataLayout &DL;
   const GCNSubtarget &ST;
+
   /// The scalar type to convert to
   Type *const ConvertToScalar;
   /// The set of visited Instructions
@@ -125,6 +127,131 @@ class LiveRegOptimizer {
     return LK.first != TargetLoweringBase::TypeLegal;
   }
 
+  /// Check whether the intrinsic natively operates on 8-bit or 16-bit values.
+  bool isNativeIntrinsic(Intrinsic::ID ID) {
+    switch (ID) {
+    case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
+    case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
+    case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
+    case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
+    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x64_i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x32_i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
+    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
+    case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
+    case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
+    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
+    case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
+    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
+    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
+    case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
+    case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
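+    // Buffer and tbuffer stores consume the packed value directly.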
+    case Intrinsic::amdgcn_raw_buffer_store_format:
+    case Intrinsic::amdgcn_raw_buffer_store:
+    case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
+    case Intrinsic::amdgcn_raw_ptr_buffer_store:
+    case Intrinsic::amdgcn_struct_buffer_store_format:
+    case Intrinsic::amdgcn_struct_buffer_store:
+    case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
+    case Intrinsic::amdgcn_struct_ptr_buffer_store:
+    case Intrinsic::amdgcn_raw_tbuffer_store:
+    case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
+    case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
+    case Intrinsic::amdgcn_struct_tbuffer_store:
+      return true;
+    default:
+      return false;
+    }
+  }
+
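+  /// Check whether \p I can consume the (possibly illegal) vector value
+  /// directly, either as a natively operating intrinsic or as a store.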
+  bool isOpLegal(Instruction *I) {
+    if (const auto *Intr = dyn_cast<IntrinsicInst>(I)) {
+      Intrinsic::ID ID = Intr->getIntrinsicID();
+      if (isNativeIntrinsic(ID))
+        return true;
+    }
+    // Plain stores can always take the value as-is.
+    if (isa<StoreInst>(I))
+      return true;
+    return false;
+  }
+
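+  /// Check whether coercing \p II's illegal vector type is likely to pay
+  /// off, i.e. some (possibly transitive) user can handle the value
+  /// natively.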
+  bool isCoercionProfitable(Instruction *II) {
+    SmallPtrSet<Instruction *, 4> CVisited;
+    SmallVector<Instruction *, 4> UserList;
+
+    // Check the users for a profitable condition: a cross-block user that
+    // can natively handle the illegal vector.
+    for (User *V : II->users())
+      if (auto *UseInst = dyn_cast<Instruction>(V))
+        UserList.push_back(UseInst);
+
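+    // Look-through instructions merely move bits around (PHIs, shuffles,
+    // insert/extractelement, casts, and amdgcn.perm), so the search
+    // continues through their users.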
+    auto IsLookThru = [](Instruction *II) {
+      if (const auto *Intr = dyn_cast<IntrinsicInst>(II))
+        return Intr->getIntrinsicID() == Intrinsic::amdgcn_perm;
+      return isa<PHINode>(II) || isa<ShuffleVectorInst>(II) ||
+             isa<InsertElementInst>(II) || isa<ExtractElementInst>(II) ||
+             isa<CastInst>(II);
+    };
+
+    while (!UserList.empty()) {
+      auto CII = UserList.pop_back_val();
+      if (!CVisited.insert(CII).second)
+        continue;
+
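+      // A user in the defining block does not keep the value live across
+      // blocks, so it only counts when the def itself can be looked
+      // through.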
+      if (CII->getParent() == II->getParent() && !IsLookThru(II))
+        continue;
+
+      if (isOpLegal(CII))
+        return true;
+
+      if (IsLookThru(CII))
+        for (User *V : CII->users())
+          if (auto *UseInst = dyn_cast<Instruction>(V))
+            UserList.push_back(UseInst);
+    }
+    return false;
+  }
+
   LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
       : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
         ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
@@ -259,6 +386,9 @@ bool LiveRegOptimizer::optimizeLiveType(
       if (!shouldReplace(II->getType()))
         continue;
 
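+      // Only coerce the value when some user profits from it.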
+      if (!isCoercionProfitable(II))
+        continue;
+
       if (PHINode *Phi = dyn_cast<PHINode>(II)) {
         PhiNodes.insert(Phi);
         // Collect all the incoming values of problematic PHI nodes.
@@ -478,7 +608,6 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
 PreservedAnalyses
 AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-
   AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
   UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);
 