Skip to content

Commit a5c340d

Browse files
AMDGPU/GlobalISel: Temporal divergence lowering (non i1)
Record all uses outside a cycle with a divergent exit during propagateTemporalDivergence in the uniformity analysis. With this list of candidates for temporal divergence lowering — excluding known lane masks from control-flow intrinsics — find sources from inside the cycle that are uniform and not i1. Temporal divergence lowering (non i1): create a copy (v_mov) to a vgpr with an implicit use of exec (to stop other passes from moving this copy outside of the cycle), and use this vgpr outside of the cycle instead of the original uniform source.
1 parent cd3d069 commit a5c340d

12 files changed

+146
-41
lines changed

llvm/include/llvm/ADT/GenericUniformityImpl.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
342342
typename SyncDependenceAnalysisT::DivergenceDescriptor;
343343
using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap;
344344

345+
using TemporalDivergenceTuple =
346+
std::tuple<InstructionT *, InstructionT *, const CycleT *>;
347+
345348
GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI,
346349
const TargetTransformInfo *TTI)
347350
: Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI),
@@ -396,6 +399,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
396399

397400
void print(raw_ostream &out) const;
398401

402+
SmallVector<TemporalDivergenceTuple, 8> TemporalDivergenceList;
403+
404+
void recordTemporalDivergence(const InstructionT *, const InstructionT *,
405+
const CycleT *);
406+
399407
protected:
400408
/// \brief Value/block pair representing a single phi input.
401409
struct PhiInput {
@@ -1129,6 +1137,13 @@ void GenericUniformityAnalysisImpl<ContextT>::compute() {
11291137
}
11301138
}
11311139

1140+
template <typename ContextT>
1141+
void GenericUniformityAnalysisImpl<ContextT>::recordTemporalDivergence(
1142+
const InstructionT *Inst, const InstructionT *User, const CycleT *Cycle) {
1143+
TemporalDivergenceList.emplace_back(const_cast<InstructionT *>(Inst),
1144+
const_cast<InstructionT *>(User), Cycle);
1145+
}
1146+
11321147
template <typename ContextT>
11331148
bool GenericUniformityAnalysisImpl<ContextT>::isAlwaysUniform(
11341149
const InstructionT &Instr) const {
@@ -1180,6 +1195,16 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
11801195
}
11811196
}
11821197

1198+
if (!TemporalDivergenceList.empty()) {
1199+
OS << "\nTEMPORAL DIVERGENCE LIST:\n";
1200+
1201+
for (auto [Inst, UseInst, Cycle] : TemporalDivergenceList) {
1202+
OS << "Inst :" << Context.print(Inst)
1203+
<< "Used by :" << Context.print(UseInst)
1204+
<< "Outside cycle :" << Cycle->print(Context) << "\n\n";
1205+
}
1206+
}
1207+
11831208
for (auto &block : F) {
11841209
OS << "\nBLOCK " << Context.print(&block) << '\n';
11851210

@@ -1210,6 +1235,14 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
12101235
}
12111236
}
12121237

1238+
template <typename ContextT>
1239+
iterator_range<
1240+
typename GenericUniformityInfo<ContextT>::TemporalDivergenceTuple *>
1241+
GenericUniformityInfo<ContextT>::getTemporalDivergenceList() const {
1242+
return make_range(DA->TemporalDivergenceList.begin(),
1243+
DA->TemporalDivergenceList.end());
1244+
}
1245+
12131246
template <typename ContextT>
12141247
bool GenericUniformityInfo<ContextT>::hasDivergence() const {
12151248
return DA->hasDivergence();

llvm/include/llvm/ADT/GenericUniformityInfo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ template <typename ContextT> class GenericUniformityInfo {
4040
using CycleInfoT = GenericCycleInfo<ContextT>;
4141
using CycleT = typename CycleInfoT::CycleT;
4242

43+
using TemporalDivergenceTuple =
44+
std::tuple<InstructionT *, InstructionT *, const CycleT *>;
45+
4346
GenericUniformityInfo(const DominatorTreeT &DT, const CycleInfoT &CI,
4447
const TargetTransformInfo *TTI = nullptr);
4548
GenericUniformityInfo() = default;
@@ -78,6 +81,8 @@ template <typename ContextT> class GenericUniformityInfo {
7881

7982
void print(raw_ostream &Out) const;
8083

84+
iterator_range<TemporalDivergenceTuple *> getTemporalDivergenceList() const;
85+
8186
private:
8287
using ImplT = GenericUniformityAnalysisImpl<ContextT>;
8388

llvm/lib/Analysis/UniformityAnalysis.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,12 @@ template <>
7979
void llvm::GenericUniformityAnalysisImpl<
8080
SSAContext>::propagateTemporalDivergence(const Instruction &I,
8181
const Cycle &DefCycle) {
82-
if (isDivergent(I))
83-
return;
8482
for (auto *User : I.users()) {
8583
auto *UserInstr = cast<Instruction>(User);
8684
if (DefCycle.contains(UserInstr->getParent()))
8785
continue;
8886
markDivergent(*UserInstr);
87+
recordTemporalDivergence(&I, UserInstr, &DefCycle);
8988
}
9089
}
9190

llvm/lib/CodeGen/MachineUniformityAnalysis.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,12 +117,12 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::
117117
if (!Op.getReg().isVirtual())
118118
continue;
119119
auto Reg = Op.getReg();
120-
if (isDivergent(Reg))
121-
continue;
122120
for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
123121
if (DefCycle.contains(UserInstr.getParent()))
124122
continue;
125123
markDivergent(UserInstr);
124+
125+
recordTemporalDivergence(&I, &UserInstr, &DefCycle);
126126
}
127127
}
128128
}
@@ -193,7 +193,7 @@ INITIALIZE_PASS_END(MachineUniformityAnalysisPass, "machine-uniformity",
193193

194194
void MachineUniformityAnalysisPass::getAnalysisUsage(AnalysisUsage &AU) const {
195195
AU.setPreservesAll();
196-
AU.addRequired<MachineCycleInfoWrapperPass>();
196+
AU.addRequiredTransitive<MachineCycleInfoWrapperPass>();
197197
AU.addRequired<MachineDominatorTreeWrapperPass>();
198198
MachineFunctionPass::getAnalysisUsage(AU);
199199
}

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
//===----------------------------------------------------------------------===//
1717

1818
#include "AMDGPU.h"
19+
#include "AMDGPUGlobalISelUtils.h"
1920
#include "SILowerI1Copies.h"
2021
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
2122
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -45,7 +46,6 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
4546
}
4647

4748
void getAnalysisUsage(AnalysisUsage &AU) const override {
48-
AU.setPreservesCFG();
4949
AU.addRequired<MachineDominatorTreeWrapperPass>();
5050
AU.addRequired<MachinePostDominatorTreeWrapperPass>();
5151
AU.addRequired<MachineUniformityAnalysisPass>();
@@ -78,6 +78,8 @@ class DivergenceLoweringHelper : public PhiLoweringHelper {
7878
Register DstReg, Register PrevReg,
7979
Register CurReg) override;
8080
void constrainAsLaneMask(Incoming &In) override;
81+
82+
bool lowerTemporalDivergence();
8183
};
8284

8385
DivergenceLoweringHelper::DivergenceLoweringHelper(
@@ -188,6 +190,35 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
188190
In.Reg = Copy.getReg(0);
189191
}
190192

193+
void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
194+
Register NewReg) {
195+
for (MachineOperand &Op : Inst->operands()) {
196+
if (Op.isReg() && Op.getReg() == Reg)
197+
Op.setReg(NewReg);
198+
}
199+
}
200+
201+
bool DivergenceLoweringHelper::lowerTemporalDivergence() {
202+
AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
203+
204+
for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) {
205+
Register Reg = Inst->getOperand(0).getReg();
206+
if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
207+
ILMA.isS32S64LaneMask(Reg))
208+
continue;
209+
210+
MachineBasicBlock *MBB = Inst->getParent();
211+
B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));
212+
213+
Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
214+
B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
215+
.addUse(ExecReg, RegState::Implicit);
216+
217+
replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
218+
}
219+
return false;
220+
}
221+
191222
} // End anonymous namespace.
192223

193224
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
@@ -218,5 +249,15 @@ bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
218249

219250
DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);
220251

221-
return Helper.lowerPhis();
252+
bool Changed = false;
253+
// Temporal divergence lowering needs to inspect list of instructions used
254+
// outside cycle with divergent exit provided by uniformity analysis. Uniform
255+
// instructions from the list require lowering, no instruction is deleted.
256+
// Thus it needs to be run before lowerPhis that deletes phis that require
257+
// lowering and replaces them with new instructions.
258+
259+
// Non-i1 temporal divergence lowering.
260+
Changed |= Helper.lowerTemporalDivergence();
261+
Changed |= Helper.lowerPhis();
262+
return Changed;
222263
}

llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ class RegBankSelectHelper {
8383
MachineRegisterInfo &MRI;
8484
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA;
8585
const MachineUniformityInfo &MUI;
86+
const SIRegisterInfo &TRI;
8687
const RegisterBank *SgprRB;
8788
const RegisterBank *VgprRB;
8889
const RegisterBank *VccRB;
@@ -91,14 +92,29 @@ class RegBankSelectHelper {
9192
RegBankSelectHelper(MachineIRBuilder &B,
9293
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA,
9394
const MachineUniformityInfo &MUI,
94-
const RegisterBankInfo &RBI)
95-
: B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI),
95+
const SIRegisterInfo &TRI, const RegisterBankInfo &RBI)
96+
: B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI),
9697
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
9798
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
9899
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
99100

101+
// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of
102+
// the cycle
103+
// Note: uniformity analysis does not consider that registers with vgpr def
104+
// are divergent (you can have uniform value in vgpr).
105+
// - TODO: implicit use of $exec could be implemented as indicator that
106+
// instruction is divergent
107+
bool isTemporalDivergenceCopy(Register Reg) {
108+
MachineInstr *MI = MRI.getVRegDef(Reg);
109+
if (!MI->isCopy() || MI->getNumImplicitOperands() != 1)
110+
return false;
111+
112+
return MI->implicit_operands().begin()->getReg() == TRI.getExec();
113+
}
114+
100115
const RegisterBank *getRegBankToAssign(Register Reg) {
101-
if (MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg))
116+
if (!isTemporalDivergenceCopy(Reg) &&
117+
(MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg)))
102118
return SgprRB;
103119
if (MRI.getType(Reg) == LLT::scalar(1))
104120
return VccRB;
@@ -209,7 +225,8 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
209225
getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
210226
MachineRegisterInfo &MRI = *B.getMRI();
211227
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
212-
RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegBankInfo());
228+
RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegisterInfo(),
229+
*ST.getRegBankInfo());
213230
// Virtual registers at this point don't have register banks.
214231
// Virtual registers in def and use operands of already inst-selected
215232
// instruction have register class.

llvm/lib/Target/AMDGPU/SILowerI1Copies.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "GCNSubtarget.h"
1616
#include "llvm/CodeGen/MachineBasicBlock.h"
1717
#include "llvm/CodeGen/MachinePostDominators.h"
18+
#include "llvm/CodeGen/MachineRegisterInfo.h"
1819
#include "llvm/CodeGen/MachineSSAUpdater.h"
1920

2021
namespace llvm {
@@ -72,6 +73,11 @@ class PhiLoweringHelper {
7273
LaneMaskRegAttrs = MRI->getVRegAttrs(LaneMask);
7374
}
7475

76+
void
77+
initializeLaneMaskRegisterAttributes(MachineRegisterInfo::VRegAttrs Attrs) {
78+
LaneMaskRegAttrs = Attrs;
79+
}
80+
7581
bool isLaneMaskReg(Register Reg) const {
7682
return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
7783
TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ body: |
471471
; GFX10-NEXT: bb.2:
472472
; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
473473
; GFX10-NEXT: {{ $}}
474-
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %56(s1), %bb.4
474+
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %57(s1), %bb.4
475475
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %29(s32), %bb.4, [[DEF]](s32), %bb.0
476476
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
477477
; GFX10-NEXT: G_BRCOND [[COPY4]](s1), %bb.5
@@ -486,6 +486,7 @@ body: |
486486
; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
487487
; GFX10-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:_(s32) = G_AMDGPU_BUFFER_LOAD [[UV]](<4 x s32>), [[C7]](s32), [[PHI2]], [[C7]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8)
488488
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AMDGPU_BUFFER_LOAD1]], [[PHI4]]
489+
; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD]](s32), implicit $exec_lo
489490
; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
490491
; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C8]]
491492
; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
@@ -497,11 +498,11 @@ body: |
497498
; GFX10-NEXT: bb.4:
498499
; GFX10-NEXT: successors: %bb.2(0x80000000)
499500
; GFX10-NEXT: {{ $}}
500-
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[AMDGPU_BUFFER_LOAD]]
501+
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY5]](s32), [[AMDGPU_BUFFER_LOAD]]
501502
; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s1) = G_OR [[ICMP]], [[ICMP2]]
502503
; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s1)
503504
; GFX10-NEXT: [[C10:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
504-
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
505+
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
505506
; GFX10-NEXT: G_BR %bb.2
506507
; GFX10-NEXT: {{ $}}
507508
; GFX10-NEXT: bb.5:

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,7 @@ body: |
544544
; GFX10-NEXT: {{ $}}
545545
; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
546546
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
547+
; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[PHI1]](s32), implicit $exec_lo
547548
; GFX10-NEXT: {{ $}}
548549
; GFX10-NEXT: bb.2:
549550
; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
@@ -567,8 +568,8 @@ body: |
567568
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
568569
; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI1]]
569570
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
570-
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
571-
; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[COPY6]](s1)
571+
; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
572+
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
572573
; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
573574
; GFX10-NEXT: G_BR %bb.5
574575
; GFX10-NEXT: {{ $}}
@@ -578,19 +579,19 @@ body: |
578579
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
579580
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
580581
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
581-
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
582-
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
583-
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
582+
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
583+
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
584+
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
584585
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
585586
; GFX10-NEXT: {{ $}}
586587
; GFX10-NEXT: bb.6:
587588
; GFX10-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
588589
; GFX10-NEXT: {{ $}}
589-
; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
590+
; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY7]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
590591
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
591-
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
592+
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
592593
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
593-
; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY9]](s1), [[PHI]](s32)
594+
; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI]](s32)
594595
; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
595596
; GFX10-NEXT: G_BR %bb.7
596597
; GFX10-NEXT: {{ $}}
@@ -604,7 +605,7 @@ body: |
604605
; GFX10-NEXT: bb.8:
605606
; GFX10-NEXT: successors: %bb.9(0x80000000)
606607
; GFX10-NEXT: {{ $}}
607-
; GFX10-NEXT: G_STORE [[PHI1]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
608+
; GFX10-NEXT: G_STORE [[COPY6]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
608609
; GFX10-NEXT: {{ $}}
609610
; GFX10-NEXT: bb.9:
610611
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,20 @@ define void @temporal_divergent_i32(float %val, ptr %addr) {
55
; GFX10-LABEL: temporal_divergent_i32:
66
; GFX10: ; %bb.0: ; %entry
77
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8-
; GFX10-NEXT: s_mov_b32 s4, -1
9-
; GFX10-NEXT: s_mov_b32 s5, 0
8+
; GFX10-NEXT: s_mov_b32 s5, -1
9+
; GFX10-NEXT: s_mov_b32 s4, 0
1010
; GFX10-NEXT: .LBB0_1: ; %loop
1111
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
12-
; GFX10-NEXT: s_add_i32 s4, s4, 1
13-
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s4
12+
; GFX10-NEXT: s_add_i32 s5, s5, 1
13+
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s5
1414
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0
15-
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
16-
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
15+
; GFX10-NEXT: v_mov_b32_e32 v3, s5
16+
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
17+
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
1718
; GFX10-NEXT: s_cbranch_execnz .LBB0_1
1819
; GFX10-NEXT: ; %bb.2: ; %exit
19-
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
20-
; GFX10-NEXT: v_mov_b32_e32 v0, s4
21-
; GFX10-NEXT: flat_store_dword v[1:2], v0
20+
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
21+
; GFX10-NEXT: flat_store_dword v[1:2], v3
2222
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2323
; GFX10-NEXT: s_setpc_b64 s[30:31]
2424
entry:

0 commit comments

Comments (0)