Skip to content

Commit a637e56

Browse files
AMDGPU/GlobalISel: Temporal divergence lowering (non i1)
Record all uses outside a cycle with a divergent exit during propagateTemporalDivergence in uniformity analysis. Using this list of candidates for temporal divergence lowering, and excluding known lane masks from control flow intrinsics, find sources inside the cycle that are uniform and not of type i1. Temporal divergence lowering (non-i1): create a copy (v_mov) to a vgpr with an implicit use of exec (to stop other passes from moving this copy outside of the cycle), and use this vgpr outside of the cycle instead of the original uniform source.
1 parent 11a9bd2 commit a637e56

12 files changed

+179
-54
lines changed

llvm/include/llvm/ADT/GenericUniformityImpl.h

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,10 @@
5151
#include "llvm/ADT/SmallPtrSet.h"
5252
#include "llvm/ADT/SparseBitVector.h"
5353
#include "llvm/ADT/StringExtras.h"
54+
#include "llvm/CodeGen/MachineInstr.h"
55+
#include "llvm/Support/Debug.h"
5456
#include "llvm/Support/raw_ostream.h"
57+
#include <string>
5558

5659
#define DEBUG_TYPE "uniformity"
5760

@@ -342,6 +345,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
342345
typename SyncDependenceAnalysisT::DivergenceDescriptor;
343346
using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap;
344347

348+
using TemporalDivergenceTuple =
349+
std::tuple<ConstValueRefT, InstructionT *, const CycleT *>;
350+
345351
GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI,
346352
const TargetTransformInfo *TTI)
347353
: Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI),
@@ -396,6 +402,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
396402

397403
void print(raw_ostream &out) const;
398404

405+
SmallVector<TemporalDivergenceTuple, 8> TemporalDivergenceList;
406+
407+
void recordTemporalDivergence(ConstValueRefT, const InstructionT *,
408+
const CycleT *);
409+
399410
protected:
400411
/// \brief Value/block pair representing a single phi input.
401412
struct PhiInput {
@@ -1129,6 +1140,13 @@ void GenericUniformityAnalysisImpl<ContextT>::compute() {
11291140
}
11301141
}
11311142

1143+
template <typename ContextT>
1144+
void GenericUniformityAnalysisImpl<ContextT>::recordTemporalDivergence(
1145+
ConstValueRefT Val, const InstructionT *User, const CycleT *Cycle) {
1146+
TemporalDivergenceList.emplace_back(Val, const_cast<InstructionT *>(User),
1147+
Cycle);
1148+
}
1149+
11321150
template <typename ContextT>
11331151
bool GenericUniformityAnalysisImpl<ContextT>::isAlwaysUniform(
11341152
const InstructionT &Instr) const {
@@ -1146,6 +1164,12 @@ template <typename ContextT>
11461164
void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
11471165
bool haveDivergentArgs = false;
11481166

1167+
// When printing a Value (an LLVM IR instruction), we need to print an extra new line.
1168+
// In LLVM IR, the print function for Value does not print a new line at the end.
1169+
// In MIR, the print function for MachineInstr prints a new line at the end.
1170+
constexpr bool IsMIR = std::is_same<InstructionT, MachineInstr>::value;
1171+
std::string NewLine = IsMIR ? "" : "\n";
1172+
11491173
// Control flow instructions may be divergent even if their inputs are
11501174
// uniform. Thus, although exceedingly rare, it is possible to have a program
11511175
// with no divergent values but with divergent control structures.
@@ -1180,6 +1204,16 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
11801204
}
11811205
}
11821206

1207+
if (!TemporalDivergenceList.empty()) {
1208+
OS << "\nTEMPORAL DIVERGENCE LIST:\n";
1209+
1210+
for (auto [Val, UseInst, Cycle] : TemporalDivergenceList) {
1211+
OS << "Value :" << Context.print(Val) << NewLine
1212+
<< "Used by :" << Context.print(UseInst) << NewLine
1213+
<< "Outside cycle :" << Cycle->print(Context) << "\n\n";
1214+
}
1215+
}
1216+
11831217
for (auto &block : F) {
11841218
OS << "\nBLOCK " << Context.print(&block) << '\n';
11851219

@@ -1191,7 +1225,7 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
11911225
OS << " DIVERGENT: ";
11921226
else
11931227
OS << " ";
1194-
OS << Context.print(value) << '\n';
1228+
OS << Context.print(value) << NewLine;
11951229
}
11961230

11971231
OS << "TERMINATORS\n";
@@ -1203,13 +1237,21 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
12031237
OS << " DIVERGENT: ";
12041238
else
12051239
OS << " ";
1206-
OS << Context.print(T) << '\n';
1240+
OS << Context.print(T) << NewLine;
12071241
}
12081242

12091243
OS << "END BLOCK\n";
12101244
}
12111245
}
12121246

1247+
template <typename ContextT>
1248+
iterator_range<
1249+
typename GenericUniformityInfo<ContextT>::TemporalDivergenceTuple *>
1250+
GenericUniformityInfo<ContextT>::getTemporalDivergenceList() const {
1251+
return make_range(DA->TemporalDivergenceList.begin(),
1252+
DA->TemporalDivergenceList.end());
1253+
}
1254+
12131255
template <typename ContextT>
12141256
bool GenericUniformityInfo<ContextT>::hasDivergence() const {
12151257
return DA->hasDivergence();

llvm/include/llvm/ADT/GenericUniformityInfo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ template <typename ContextT> class GenericUniformityInfo {
4040
using CycleInfoT = GenericCycleInfo<ContextT>;
4141
using CycleT = typename CycleInfoT::CycleT;
4242

43+
using TemporalDivergenceTuple =
44+
std::tuple<ConstValueRefT, InstructionT *, const CycleT *>;
45+
4346
GenericUniformityInfo(const DominatorTreeT &DT, const CycleInfoT &CI,
4447
const TargetTransformInfo *TTI = nullptr);
4548
GenericUniformityInfo() = default;
@@ -78,6 +81,8 @@ template <typename ContextT> class GenericUniformityInfo {
7881

7982
void print(raw_ostream &Out) const;
8083

84+
iterator_range<TemporalDivergenceTuple *> getTemporalDivergenceList() const;
85+
8186
private:
8287
using ImplT = GenericUniformityAnalysisImpl<ContextT>;
8388

llvm/lib/Analysis/UniformityAnalysis.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,12 @@ template <>
7979
void llvm::GenericUniformityAnalysisImpl<
8080
SSAContext>::propagateTemporalDivergence(const Instruction &I,
8181
const Cycle &DefCycle) {
82-
if (isDivergent(I))
83-
return;
8482
for (auto *User : I.users()) {
8583
auto *UserInstr = cast<Instruction>(User);
8684
if (DefCycle.contains(UserInstr->getParent()))
8785
continue;
8886
markDivergent(*UserInstr);
87+
recordTemporalDivergence(&I, UserInstr, &DefCycle);
8988
}
9089
}
9190

llvm/lib/CodeGen/MachineUniformityAnalysis.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,12 +117,12 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::
117117
if (!Op.getReg().isVirtual())
118118
continue;
119119
auto Reg = Op.getReg();
120-
if (isDivergent(Reg))
121-
continue;
122120
for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
123121
if (DefCycle.contains(UserInstr.getParent()))
124122
continue;
125123
markDivergent(UserInstr);
124+
125+
recordTemporalDivergence(Reg, &UserInstr, &DefCycle);
126126
}
127127
}
128128
}
@@ -193,7 +193,7 @@ INITIALIZE_PASS_END(MachineUniformityAnalysisPass, "machine-uniformity",
193193

194194
void MachineUniformityAnalysisPass::getAnalysisUsage(AnalysisUsage &AU) const {
195195
AU.setPreservesAll();
196-
AU.addRequired<MachineCycleInfoWrapperPass>();
196+
AU.addRequiredTransitive<MachineCycleInfoWrapperPass>();
197197
AU.addRequired<MachineDominatorTreeWrapperPass>();
198198
MachineFunctionPass::getAnalysisUsage(AU);
199199
}

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
//===----------------------------------------------------------------------===//
1717

1818
#include "AMDGPU.h"
19+
#include "AMDGPUGlobalISelUtils.h"
1920
#include "SILowerI1Copies.h"
21+
#include "llvm/ADT/DenseMap.h"
2022
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
2123
#include "llvm/CodeGen/MachineFunctionPass.h"
2224
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
@@ -78,6 +80,8 @@ class DivergenceLoweringHelper : public PhiLoweringHelper {
7880
Register DstReg, Register PrevReg,
7981
Register CurReg) override;
8082
void constrainAsLaneMask(Incoming &In) override;
83+
84+
bool lowerTemporalDivergence();
8185
};
8286

8387
DivergenceLoweringHelper::DivergenceLoweringHelper(
@@ -188,6 +192,43 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
188192
In.Reg = Copy.getReg(0);
189193
}
190194

195+
// Rewrite every register operand of Inst that currently refers to Reg so that
// it refers to NewReg instead. Operands that are not registers, or that name
// a different register, are left untouched.
void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
                                Register NewReg) {
  for (MachineOperand &MO : Inst->operands()) {
    const bool RefersToReg = MO.isReg() && MO.getReg() == Reg;
    if (!RefersToReg)
      continue;
    MO.setReg(NewReg);
  }
}
202+
203+
bool DivergenceLoweringHelper::lowerTemporalDivergence() {
204+
AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
205+
DenseMap<Register, Register> TDCache;
206+
207+
for (auto [Reg, UseInst, _] : MUI->getTemporalDivergenceList()) {
208+
if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
209+
ILMA.isS32S64LaneMask(Reg))
210+
continue;
211+
212+
Register CachedTDCopy = TDCache.lookup(Reg);
213+
if (CachedTDCopy) {
214+
replaceUsesOfRegInInstWith(Reg, UseInst, CachedTDCopy);
215+
continue;
216+
}
217+
218+
MachineInstr *Inst = MRI->getVRegDef(Reg);
219+
MachineBasicBlock *MBB = Inst->getParent();
220+
B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));
221+
222+
Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
223+
B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
224+
.addUse(ExecReg, RegState::Implicit);
225+
226+
replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
227+
TDCache[Reg] = VgprReg;
228+
}
229+
return false;
230+
}
231+
191232
} // End anonymous namespace.
192233

193234
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
@@ -218,5 +259,15 @@ bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
218259

219260
DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);
220261

221-
return Helper.lowerPhis();
262+
bool Changed = false;
263+
// Temporal divergence lowering needs to inspect list of instructions used
264+
// outside cycle with divergent exit provided by uniformity analysis. Uniform
265+
// instructions from the list require lowering, no instruction is deleted.
266+
// Thus it needs to be run before lowerPhis that deletes phis that require
267+
// lowering and replaces them with new instructions.
268+
269+
// Non-i1 temporal divergence lowering.
270+
Changed |= Helper.lowerTemporalDivergence();
271+
Changed |= Helper.lowerPhis();
272+
return Changed;
222273
}

llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ class RegBankSelectHelper {
8383
MachineRegisterInfo &MRI;
8484
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA;
8585
const MachineUniformityInfo &MUI;
86+
const SIRegisterInfo &TRI;
8687
const RegisterBank *SgprRB;
8788
const RegisterBank *VgprRB;
8889
const RegisterBank *VccRB;
@@ -91,14 +92,29 @@ class RegBankSelectHelper {
9192
RegBankSelectHelper(MachineIRBuilder &B,
9293
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA,
9394
const MachineUniformityInfo &MUI,
94-
const RegisterBankInfo &RBI)
95-
: B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI),
95+
const SIRegisterInfo &TRI, const RegisterBankInfo &RBI)
96+
: B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI),
9697
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
9798
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
9899
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
99100

101+
// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of
102+
// the cycle
103+
// Note: uniformity analysis does not consider that registers with vgpr def
104+
// are divergent (you can have uniform value in vgpr).
105+
// - TODO: implicit use of $exec could be implemented as indicator that
106+
// instruction is divergent
107+
bool isTemporalDivergenceCopy(Register Reg) {
108+
MachineInstr *MI = MRI.getVRegDef(Reg);
109+
if (!MI->isCopy() || MI->getNumImplicitOperands() != 1)
110+
return false;
111+
112+
return MI->implicit_operands().begin()->getReg() == TRI.getExec();
113+
}
114+
100115
const RegisterBank *getRegBankToAssign(Register Reg) {
101-
if (MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg))
116+
if (!isTemporalDivergenceCopy(Reg) &&
117+
(MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg)))
102118
return SgprRB;
103119
if (MRI.getType(Reg) == LLT::scalar(1))
104120
return VccRB;
@@ -209,7 +225,8 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
209225
getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
210226
MachineRegisterInfo &MRI = *B.getMRI();
211227
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
212-
RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegBankInfo());
228+
RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegisterInfo(),
229+
*ST.getRegBankInfo());
213230
// Virtual registers at this point don't have register banks.
214231
// Virtual registers in def and use operands of already inst-selected
215232
// instruction have register class.

llvm/lib/Target/AMDGPU/SILowerI1Copies.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "GCNSubtarget.h"
1616
#include "llvm/CodeGen/MachineBasicBlock.h"
1717
#include "llvm/CodeGen/MachinePostDominators.h"
18+
#include "llvm/CodeGen/MachineRegisterInfo.h"
1819
#include "llvm/CodeGen/MachineSSAUpdater.h"
1920

2021
namespace llvm {
@@ -72,6 +73,11 @@ class PhiLoweringHelper {
7273
LaneMaskRegAttrs = MRI->getVRegAttrs(LaneMask);
7374
}
7475

76+
void
77+
initializeLaneMaskRegisterAttributes(MachineRegisterInfo::VRegAttrs Attrs) {
78+
LaneMaskRegAttrs = Attrs;
79+
}
80+
7581
bool isLaneMaskReg(Register Reg) const {
7682
return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
7783
TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ body: |
471471
; GFX10-NEXT: bb.2:
472472
; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
473473
; GFX10-NEXT: {{ $}}
474-
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %56(s1), %bb.4
474+
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %57(s1), %bb.4
475475
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %29(s32), %bb.4, [[DEF]](s32), %bb.0
476476
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
477477
; GFX10-NEXT: G_BRCOND [[COPY4]](s1), %bb.5
@@ -486,6 +486,7 @@ body: |
486486
; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
487487
; GFX10-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:_(s32) = G_AMDGPU_BUFFER_LOAD [[UV]](<4 x s32>), [[C7]](s32), [[PHI2]], [[C7]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8)
488488
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AMDGPU_BUFFER_LOAD1]], [[PHI4]]
489+
; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD]](s32), implicit $exec_lo
489490
; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
490491
; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C8]]
491492
; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
@@ -497,11 +498,11 @@ body: |
497498
; GFX10-NEXT: bb.4:
498499
; GFX10-NEXT: successors: %bb.2(0x80000000)
499500
; GFX10-NEXT: {{ $}}
500-
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[AMDGPU_BUFFER_LOAD]]
501+
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY5]](s32), [[AMDGPU_BUFFER_LOAD]]
501502
; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s1) = G_OR [[ICMP]], [[ICMP2]]
502503
; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s1)
503504
; GFX10-NEXT: [[C10:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
504-
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
505+
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
505506
; GFX10-NEXT: G_BR %bb.2
506507
; GFX10-NEXT: {{ $}}
507508
; GFX10-NEXT: bb.5:

0 commit comments

Comments
 (0)