Skip to content

Commit 3e04401

Browse files
AMDGPU/GlobalISel: Temporal divergence lowering (non i1)
During propagateTemporalDivergence in the Uniformity analysis, record all uses outside a cycle that has a divergent exit. From this list of candidates for temporal divergence lowering — excluding known lane masks coming from control-flow intrinsics — find sources inside the cycle that are uniform and not i1. Temporal divergence lowering (non i1): create a copy (v_mov) to a vgpr with an implicit use of exec (to stop other passes from moving this copy outside of the cycle), and use this vgpr outside of the cycle instead of the original uniform source.
1 parent 1728ab4 commit 3e04401

12 files changed

+153
-42
lines changed

llvm/include/llvm/ADT/GenericUniformityImpl.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,10 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
342342
typename SyncDependenceAnalysisT::DivergenceDescriptor;
343343
using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap;
344344

345+
// Use outside cycle with divergent exit
346+
using UOCWDE =
347+
std::tuple<const InstructionT *, const InstructionT *, const CycleT *>;
348+
345349
GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI,
346350
const TargetTransformInfo *TTI)
347351
: Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI),
@@ -395,6 +399,14 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
395399
}
396400

397401
void print(raw_ostream &out) const;
402+
SmallVector<UOCWDE, 8> UsesOutsideCycleWithDivergentExit;
403+
void recordUseOutsideCycleWithDivergentExit(const InstructionT *,
404+
const InstructionT *,
405+
const CycleT *);
406+
inline iterator_range<UOCWDE *> getUsesOutsideCycleWithDivergentExit() const {
407+
return make_range(UsesOutsideCycleWithDivergentExit.begin(),
408+
UsesOutsideCycleWithDivergentExit.end());
409+
}
398410

399411
protected:
400412
/// \brief Value/block pair representing a single phi input.
@@ -1129,6 +1141,14 @@ void GenericUniformityAnalysisImpl<ContextT>::compute() {
11291141
}
11301142
}
11311143

1144+
template <typename ContextT>
1145+
void GenericUniformityAnalysisImpl<
1146+
ContextT>::recordUseOutsideCycleWithDivergentExit(const InstructionT *Inst,
1147+
const InstructionT *User,
1148+
const CycleT *Cycle) {
1149+
UsesOutsideCycleWithDivergentExit.emplace_back(Inst, User, Cycle);
1150+
}
1151+
11321152
template <typename ContextT>
11331153
bool GenericUniformityAnalysisImpl<ContextT>::isAlwaysUniform(
11341154
const InstructionT &Instr) const {
@@ -1180,6 +1200,16 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
11801200
}
11811201
}
11821202

1203+
if (!UsesOutsideCycleWithDivergentExit.empty()) {
1204+
OS << "\nUSES OUTSIDE CYCLES WITH DIVERGENT EXIT:\n";
1205+
1206+
for (auto [Inst, UseInst, Cycle] : UsesOutsideCycleWithDivergentExit) {
1207+
OS << "Inst :" << Context.print(Inst)
1208+
<< "Used by :" << Context.print(UseInst)
1209+
<< "Outside cycle :" << Cycle->print(Context) << "\n\n";
1210+
}
1211+
}
1212+
11831213
for (auto &block : F) {
11841214
OS << "\nBLOCK " << Context.print(&block) << '\n';
11851215

@@ -1210,6 +1240,13 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
12101240
}
12111241
}
12121242

1243+
template <typename ContextT>
1244+
iterator_range<typename GenericUniformityInfo<ContextT>::UOCWDE *>
1245+
GenericUniformityInfo<ContextT>::getUsesOutsideCycleWithDivergentExit() const {
1246+
return make_range(DA->UsesOutsideCycleWithDivergentExit.begin(),
1247+
DA->UsesOutsideCycleWithDivergentExit.end());
1248+
}
1249+
12131250
template <typename ContextT>
12141251
bool GenericUniformityInfo<ContextT>::hasDivergence() const {
12151252
return DA->hasDivergence();

llvm/include/llvm/ADT/GenericUniformityInfo.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ template <typename ContextT> class GenericUniformityInfo {
4040
using CycleInfoT = GenericCycleInfo<ContextT>;
4141
using CycleT = typename CycleInfoT::CycleT;
4242

43+
// Use outside cycle with divergent exit
44+
using UOCWDE =
45+
std::tuple<const InstructionT *, const InstructionT *, const CycleT *>;
46+
4347
GenericUniformityInfo(const DominatorTreeT &DT, const CycleInfoT &CI,
4448
const TargetTransformInfo *TTI = nullptr);
4549
GenericUniformityInfo() = default;
@@ -78,6 +82,8 @@ template <typename ContextT> class GenericUniformityInfo {
7882

7983
void print(raw_ostream &Out) const;
8084

85+
iterator_range<UOCWDE *> getUsesOutsideCycleWithDivergentExit() const;
86+
8187
private:
8288
using ImplT = GenericUniformityAnalysisImpl<ContextT>;
8389

llvm/lib/Analysis/UniformityAnalysis.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,12 @@ template <>
7979
void llvm::GenericUniformityAnalysisImpl<
8080
SSAContext>::propagateTemporalDivergence(const Instruction &I,
8181
const Cycle &DefCycle) {
82-
if (isDivergent(I))
83-
return;
8482
for (auto *User : I.users()) {
8583
auto *UserInstr = cast<Instruction>(User);
8684
if (DefCycle.contains(UserInstr->getParent()))
8785
continue;
8886
markDivergent(*UserInstr);
87+
recordUseOutsideCycleWithDivergentExit(&I, UserInstr, &DefCycle);
8988
}
9089
}
9190

llvm/lib/CodeGen/MachineUniformityAnalysis.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,12 +117,12 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::
117117
if (!Op.getReg().isVirtual())
118118
continue;
119119
auto Reg = Op.getReg();
120-
if (isDivergent(Reg))
121-
continue;
122-
for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
120+
for (const MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
123121
if (DefCycle.contains(UserInstr.getParent()))
124122
continue;
125123
markDivergent(UserInstr);
124+
125+
recordUseOutsideCycleWithDivergentExit(&I, &UserInstr, &DefCycle);
126126
}
127127
}
128128
}
@@ -193,7 +193,7 @@ INITIALIZE_PASS_END(MachineUniformityAnalysisPass, "machine-uniformity",
193193

194194
void MachineUniformityAnalysisPass::getAnalysisUsage(AnalysisUsage &AU) const {
195195
AU.setPreservesAll();
196-
AU.addRequired<MachineCycleInfoWrapperPass>();
196+
AU.addRequiredTransitive<MachineCycleInfoWrapperPass>();
197197
AU.addRequired<MachineDominatorTreeWrapperPass>();
198198
MachineFunctionPass::getAnalysisUsage(AU);
199199
}

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
//===----------------------------------------------------------------------===//
1717

1818
#include "AMDGPU.h"
19+
#include "AMDGPUGlobalISelUtils.h"
1920
#include "SILowerI1Copies.h"
2021
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
2122
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -45,7 +46,6 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
4546
}
4647

4748
void getAnalysisUsage(AnalysisUsage &AU) const override {
48-
AU.setPreservesCFG();
4949
AU.addRequired<MachineDominatorTreeWrapperPass>();
5050
AU.addRequired<MachinePostDominatorTreeWrapperPass>();
5151
AU.addRequired<MachineUniformityAnalysisPass>();
@@ -78,6 +78,8 @@ class DivergenceLoweringHelper : public PhiLoweringHelper {
7878
Register DstReg, Register PrevReg,
7979
Register CurReg) override;
8080
void constrainAsLaneMask(Incoming &In) override;
81+
82+
bool lowerTempDivergence();
8183
};
8284

8385
DivergenceLoweringHelper::DivergenceLoweringHelper(
@@ -188,6 +190,37 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
188190
In.Reg = Copy.getReg(0);
189191
}
190192

193+
void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
194+
Register NewReg) {
195+
for (MachineOperand &Op : Inst->operands()) {
196+
if (Op.isReg() && Op.getReg() == Reg)
197+
Op.setReg(NewReg);
198+
}
199+
}
200+
201+
bool DivergenceLoweringHelper::lowerTempDivergence() {
202+
AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
203+
204+
for (auto [Inst, UseInst, _] : MUI->getUsesOutsideCycleWithDivergentExit()) {
205+
Register Reg = Inst->getOperand(0).getReg();
206+
if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
207+
ILMA.isS32S64LaneMask(Reg))
208+
continue;
209+
210+
MachineInstr *MI = const_cast<MachineInstr *>(Inst);
211+
MachineBasicBlock *MBB = MI->getParent();
212+
B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI->getIterator())));
213+
214+
Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
215+
B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
216+
.addUse(ExecReg, RegState::Implicit);
217+
218+
replaceUsesOfRegInInstWith(Reg, const_cast<MachineInstr *>(UseInst),
219+
VgprReg);
220+
}
221+
return false;
222+
}
223+
191224
} // End anonymous namespace.
192225

193226
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
@@ -218,5 +251,15 @@ bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
218251

219252
DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);
220253

221-
return Helper.lowerPhis();
254+
bool Changed = false;
255+
// Temporal divergence lowering needs to inspect list of instructions used
256+
// outside cycle with divergent exit provided by uniformity analysis. Uniform
257+
// instructions from the list require lowering, no instruction is deleted.
258+
// Thus it needs to be run before lowerPhis that deletes phis that require
259+
// lowering and replaces them with new instructions.
260+
261+
// Non-i1 temporal divergence lowering.
262+
Changed |= Helper.lowerTempDivergence();
263+
Changed |= Helper.lowerPhis();
264+
return Changed;
222265
}

llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ class RegBankSelectHelper {
8383
MachineRegisterInfo &MRI;
8484
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA;
8585
const MachineUniformityInfo &MUI;
86+
const SIRegisterInfo &TRI;
8687
const RegisterBank *SgprRB;
8788
const RegisterBank *VgprRB;
8889
const RegisterBank *VccRB;
@@ -91,14 +92,29 @@ class RegBankSelectHelper {
9192
RegBankSelectHelper(MachineIRBuilder &B,
9293
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA,
9394
const MachineUniformityInfo &MUI,
94-
const RegisterBankInfo &RBI)
95-
: B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI),
95+
const SIRegisterInfo &TRI, const RegisterBankInfo &RBI)
96+
: B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI),
9697
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
9798
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
9899
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
99100

101+
// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of
102+
// the cycle
103+
// Note: uniformity analysis does not consider that registers with vgpr def
104+
// are divergent (you can have uniform value in vgpr).
105+
// - TODO: implicit use of $exec could be implemented as indicator that
106+
// instruction is divergent
107+
bool isTemporalDivergenceCopy(Register Reg) {
108+
MachineInstr *MI = MRI.getVRegDef(Reg);
109+
if (!MI->isCopy() || MI->getNumImplicitOperands() != 1)
110+
return false;
111+
112+
return MI->implicit_operands().begin()->getReg() == TRI.getExec();
113+
}
114+
100115
const RegisterBank *getRegBankToAssign(Register Reg) {
101-
if (MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg))
116+
if (!isTemporalDivergenceCopy(Reg) &&
117+
(MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg)))
102118
return SgprRB;
103119
if (MRI.getType(Reg) == LLT::scalar(1))
104120
return VccRB;
@@ -209,7 +225,7 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
209225
getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
210226
MachineRegisterInfo &MRI = *B.getMRI();
211227
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
212-
RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegBankInfo());
228+
RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegisterInfo(), *ST.getRegBankInfo());
213229
// Virtual registers at this point don't have register banks.
214230
// Virtual registers in def and use operands of already inst-selected
215231
// instruction have register class.

llvm/lib/Target/AMDGPU/SILowerI1Copies.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "GCNSubtarget.h"
1616
#include "llvm/CodeGen/MachineBasicBlock.h"
1717
#include "llvm/CodeGen/MachinePostDominators.h"
18+
#include "llvm/CodeGen/MachineRegisterInfo.h"
1819
#include "llvm/CodeGen/MachineSSAUpdater.h"
1920

2021
namespace llvm {
@@ -72,6 +73,11 @@ class PhiLoweringHelper {
7273
LaneMaskRegAttrs = MRI->getVRegAttrs(LaneMask);
7374
}
7475

76+
void
77+
initializeLaneMaskRegisterAttributes(MachineRegisterInfo::VRegAttrs Attrs) {
78+
LaneMaskRegAttrs = Attrs;
79+
}
80+
7581
bool isLaneMaskReg(Register Reg) const {
7682
return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
7783
TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ body: |
471471
; GFX10-NEXT: bb.2:
472472
; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
473473
; GFX10-NEXT: {{ $}}
474-
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %56(s1), %bb.4
474+
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %57(s1), %bb.4
475475
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %29(s32), %bb.4, [[DEF]](s32), %bb.0
476476
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
477477
; GFX10-NEXT: G_BRCOND [[COPY4]](s1), %bb.5
@@ -486,6 +486,7 @@ body: |
486486
; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
487487
; GFX10-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:_(s32) = G_AMDGPU_BUFFER_LOAD [[UV]](<4 x s32>), [[C7]](s32), [[PHI2]], [[C7]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8)
488488
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AMDGPU_BUFFER_LOAD1]], [[PHI4]]
489+
; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD]](s32), implicit $exec_lo
489490
; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
490491
; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C8]]
491492
; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
@@ -497,11 +498,11 @@ body: |
497498
; GFX10-NEXT: bb.4:
498499
; GFX10-NEXT: successors: %bb.2(0x80000000)
499500
; GFX10-NEXT: {{ $}}
500-
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[AMDGPU_BUFFER_LOAD]]
501+
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY5]](s32), [[AMDGPU_BUFFER_LOAD]]
501502
; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s1) = G_OR [[ICMP]], [[ICMP2]]
502503
; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s1)
503504
; GFX10-NEXT: [[C10:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
504-
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
505+
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
505506
; GFX10-NEXT: G_BR %bb.2
506507
; GFX10-NEXT: {{ $}}
507508
; GFX10-NEXT: bb.5:

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,7 @@ body: |
544544
; GFX10-NEXT: {{ $}}
545545
; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
546546
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
547+
; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[PHI1]](s32), implicit $exec_lo
547548
; GFX10-NEXT: {{ $}}
548549
; GFX10-NEXT: bb.2:
549550
; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
@@ -567,8 +568,8 @@ body: |
567568
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
568569
; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI1]]
569570
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
570-
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
571-
; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[COPY6]](s1)
571+
; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
572+
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
572573
; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
573574
; GFX10-NEXT: G_BR %bb.5
574575
; GFX10-NEXT: {{ $}}
@@ -578,19 +579,19 @@ body: |
578579
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
579580
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
580581
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
581-
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
582-
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
583-
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
582+
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
583+
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
584+
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
584585
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
585586
; GFX10-NEXT: {{ $}}
586587
; GFX10-NEXT: bb.6:
587588
; GFX10-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
588589
; GFX10-NEXT: {{ $}}
589-
; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
590+
; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY7]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
590591
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
591-
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
592+
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
592593
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
593-
; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY9]](s1), [[PHI]](s32)
594+
; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI]](s32)
594595
; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
595596
; GFX10-NEXT: G_BR %bb.7
596597
; GFX10-NEXT: {{ $}}
@@ -604,7 +605,7 @@ body: |
604605
; GFX10-NEXT: bb.8:
605606
; GFX10-NEXT: successors: %bb.9(0x80000000)
606607
; GFX10-NEXT: {{ $}}
607-
; GFX10-NEXT: G_STORE [[PHI1]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
608+
; GFX10-NEXT: G_STORE [[COPY6]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
608609
; GFX10-NEXT: {{ $}}
609610
; GFX10-NEXT: bb.9:
610611
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,20 @@ define void @temporal_divergent_i32(float %val, ptr %addr) {
55
; GFX10-LABEL: temporal_divergent_i32:
66
; GFX10: ; %bb.0: ; %entry
77
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8-
; GFX10-NEXT: s_mov_b32 s4, -1
9-
; GFX10-NEXT: s_mov_b32 s5, 0
8+
; GFX10-NEXT: s_mov_b32 s5, -1
9+
; GFX10-NEXT: s_mov_b32 s4, 0
1010
; GFX10-NEXT: .LBB0_1: ; %loop
1111
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
12-
; GFX10-NEXT: s_add_i32 s4, s4, 1
13-
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s4
12+
; GFX10-NEXT: s_add_i32 s5, s5, 1
13+
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s5
1414
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0
15-
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
16-
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
15+
; GFX10-NEXT: v_mov_b32_e32 v3, s5
16+
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
17+
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
1718
; GFX10-NEXT: s_cbranch_execnz .LBB0_1
1819
; GFX10-NEXT: ; %bb.2: ; %exit
19-
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
20-
; GFX10-NEXT: v_mov_b32_e32 v0, s4
21-
; GFX10-NEXT: flat_store_dword v[1:2], v0
20+
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
21+
; GFX10-NEXT: flat_store_dword v[1:2], v3
2222
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2323
; GFX10-NEXT: s_setpc_b64 s[30:31]
2424
entry:

0 commit comments

Comments
 (0)