Skip to content

AMDGPU/GlobalISel: Temporal divergence lowering (non i1) #124298

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 42 additions & 2 deletions llvm/include/llvm/ADT/GenericUniformityImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "uniformity"
Expand Down Expand Up @@ -342,6 +343,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
typename SyncDependenceAnalysisT::DivergenceDescriptor;
using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap;

using TemporalDivergenceTuple =
std::tuple<ConstValueRefT, InstructionT *, const CycleT *>;

GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI,
const TargetTransformInfo *TTI)
: Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI),
Expand Down Expand Up @@ -396,6 +400,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {

void print(raw_ostream &out) const;

SmallVector<TemporalDivergenceTuple, 8> TemporalDivergenceList;

void recordTemporalDivergence(ConstValueRefT, const InstructionT *,
const CycleT *);

protected:
/// \brief Value/block pair representing a single phi input.
struct PhiInput {
Expand Down Expand Up @@ -1129,6 +1138,13 @@ void GenericUniformityAnalysisImpl<ContextT>::compute() {
}
}

template <typename ContextT>
void GenericUniformityAnalysisImpl<ContextT>::recordTemporalDivergence(
    ConstValueRefT Val, const InstructionT *User, const CycleT *Cycle) {
  // Remember that \p Val, defined inside \p Cycle, has a use in \p User
  // outside of it. The const_cast is required because consumers of the list
  // (e.g. temporal divergence lowering) need to modify the user instruction.
  InstructionT *MutableUser = const_cast<InstructionT *>(User);
  TemporalDivergenceList.push_back({Val, MutableUser, Cycle});
}

template <typename ContextT>
bool GenericUniformityAnalysisImpl<ContextT>::isAlwaysUniform(
const InstructionT &Instr) const {
Expand All @@ -1146,6 +1162,12 @@ template <typename ContextT>
void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
bool haveDivergentArgs = false;

// When we print Value, LLVM IR instruction, we want to print extra new line.
// In LLVM IR print function for Value does not print new line at the end.
// In MIR print for MachineInstr prints new line at the end.
constexpr bool IsMIR = std::is_same<InstructionT, MachineInstr>::value;
std::string NewLine = IsMIR ? "" : "\n";

// Control flow instructions may be divergent even if their inputs are
// uniform. Thus, although exceedingly rare, it is possible to have a program
// with no divergent values but with divergent control structures.
Expand Down Expand Up @@ -1180,6 +1202,16 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
}
}

if (!TemporalDivergenceList.empty()) {
OS << "\nTEMPORAL DIVERGENCE LIST:\n";

for (auto [Val, UseInst, Cycle] : TemporalDivergenceList) {
OS << "Value :" << Context.print(Val) << NewLine
<< "Used by :" << Context.print(UseInst) << NewLine
<< "Outside cycle :" << Cycle->print(Context) << "\n\n";
}
}

for (auto &block : F) {
OS << "\nBLOCK " << Context.print(&block) << '\n';

Expand All @@ -1191,7 +1223,7 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
OS << " DIVERGENT: ";
else
OS << " ";
OS << Context.print(value) << '\n';
OS << Context.print(value) << NewLine;
}

OS << "TERMINATORS\n";
Expand All @@ -1203,13 +1235,21 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
OS << " DIVERGENT: ";
else
OS << " ";
OS << Context.print(T) << '\n';
OS << Context.print(T) << NewLine;
}

OS << "END BLOCK\n";
}
}

template <typename ContextT>
iterator_range<
    typename GenericUniformityInfo<ContextT>::TemporalDivergenceTuple *>
GenericUniformityInfo<ContextT>::getTemporalDivergenceList() const {
  // Expose the implementation's list of (value, user, cycle) tuples as a
  // simple iteration range without copying it.
  auto &List = DA->TemporalDivergenceList;
  return make_range(List.begin(), List.end());
}

template <typename ContextT>
bool GenericUniformityInfo<ContextT>::hasDivergence() const {
return DA->hasDivergence();
Expand Down
5 changes: 5 additions & 0 deletions llvm/include/llvm/ADT/GenericUniformityInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ template <typename ContextT> class GenericUniformityInfo {
using CycleInfoT = GenericCycleInfo<ContextT>;
using CycleT = typename CycleInfoT::CycleT;

using TemporalDivergenceTuple =
std::tuple<ConstValueRefT, InstructionT *, const CycleT *>;

GenericUniformityInfo(const DominatorTreeT &DT, const CycleInfoT &CI,
const TargetTransformInfo *TTI = nullptr);
GenericUniformityInfo() = default;
Expand Down Expand Up @@ -78,6 +81,8 @@ template <typename ContextT> class GenericUniformityInfo {

void print(raw_ostream &Out) const;

iterator_range<TemporalDivergenceTuple *> getTemporalDivergenceList() const;

private:
using ImplT = GenericUniformityAnalysisImpl<ContextT>;

Expand Down
3 changes: 1 addition & 2 deletions llvm/lib/Analysis/UniformityAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,13 +79,12 @@ template <>
void llvm::GenericUniformityAnalysisImpl<
SSAContext>::propagateTemporalDivergence(const Instruction &I,
const Cycle &DefCycle) {
if (isDivergent(I))
return;
for (auto *User : I.users()) {
auto *UserInstr = cast<Instruction>(User);
if (DefCycle.contains(UserInstr->getParent()))
continue;
markDivergent(*UserInstr);
recordTemporalDivergence(&I, UserInstr, &DefCycle);
}
}

Expand Down
6 changes: 3 additions & 3 deletions llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,12 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::
if (!Op.getReg().isVirtual())
continue;
auto Reg = Op.getReg();
if (isDivergent(Reg))
continue;
for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
if (DefCycle.contains(UserInstr.getParent()))
continue;
markDivergent(UserInstr);

recordTemporalDivergence(Reg, &UserInstr, &DefCycle);
}
}
}
Expand Down Expand Up @@ -193,7 +193,7 @@ INITIALIZE_PASS_END(MachineUniformityAnalysisPass, "machine-uniformity",

/// Declare the analyses this pass depends on.
///
/// Cycle info is a transitive requirement: passes that consume the uniformity
/// info (queried after this pass ran) still need the cycle analysis to be
/// alive, hence addRequiredTransitive rather than addRequired.
void MachineUniformityAnalysisPass::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesAll();
  AU.addRequiredTransitive<MachineCycleInfoWrapperPass>();
  AU.addRequired<MachineDominatorTreeWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}
Expand Down
52 changes: 51 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "SILowerI1Copies.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
Expand Down Expand Up @@ -78,6 +79,8 @@ class DivergenceLoweringHelper : public PhiLoweringHelper {
Register DstReg, Register PrevReg,
Register CurReg) override;
void constrainAsLaneMask(Incoming &In) override;

bool lowerTemporalDivergence();
};

DivergenceLoweringHelper::DivergenceLoweringHelper(
Expand Down Expand Up @@ -188,6 +191,43 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
In.Reg = Copy.getReg(0);
}

// Rewrite every register operand of \p Inst that refers to \p Reg so that it
// refers to \p NewReg instead. Note this walks all operands, defs included,
// matching the behavior callers rely on for use-instruction rewriting.
void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
                                Register NewReg) {
  for (MachineOperand &MO : Inst->operands()) {
    if (!MO.isReg())
      continue;
    if (MO.getReg() == Reg)
      MO.setReg(NewReg);
  }
}

bool DivergenceLoweringHelper::lowerTemporalDivergence() {
AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
DenseMap<Register, Register> TDCache;

for (auto [Reg, UseInst, _] : MUI->getTemporalDivergenceList()) {
if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
ILMA.isS32S64LaneMask(Reg))
continue;

Register CachedTDCopy = TDCache.lookup(Reg);
if (CachedTDCopy) {
replaceUsesOfRegInInstWith(Reg, UseInst, CachedTDCopy);
continue;
}

MachineInstr *Inst = MRI->getVRegDef(Reg);
MachineBasicBlock *MBB = Inst->getParent();
B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));

Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure how it works in global-isel, can we set the RegisterClass of VgprReg to vector register here to make it more obvious this is copy from sgpr to vgpr?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It unnecessarily complicates new Reg bank select, regbankselect will set vgpr there. Also copy has implicit exec, should be special enough to indicate what we are doing.

B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
.addUse(ExecReg, RegState::Implicit);

replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
TDCache[Reg] = VgprReg;
}
return false;
}

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
Expand Down Expand Up @@ -218,5 +258,15 @@ bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(

DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);

return Helper.lowerPhis();
bool Changed = false;
// Temporal divergence lowering needs to inspect list of instructions used
// outside cycle with divergent exit provided by uniformity analysis. Uniform
// instructions from the list require lowering, no instruction is deleted.
// Thus it needs to be run before lowerPhis that deletes phis that require
// lowering and replaces them with new instructions.

// Non-i1 temporal divergence lowering.
Changed |= Helper.lowerTemporalDivergence();
Changed |= Helper.lowerPhis();
return Changed;
}
25 changes: 21 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ class RegBankSelectHelper {
MachineRegisterInfo &MRI;
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA;
const MachineUniformityInfo &MUI;
const SIRegisterInfo &TRI;
const RegisterBank *SgprRB;
const RegisterBank *VgprRB;
const RegisterBank *VccRB;
Expand All @@ -91,14 +92,29 @@ class RegBankSelectHelper {
RegBankSelectHelper(MachineIRBuilder &B,
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA,
const MachineUniformityInfo &MUI,
const RegisterBankInfo &RBI)
: B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI),
const SIRegisterInfo &TRI, const RegisterBankInfo &RBI)
: B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI),
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}

// Temporal divergence copy: a COPY to a vgpr, carrying an implicit use of
// $exec, inserted inside the cycle.
// Note: uniformity analysis does not treat registers with a vgpr def as
// divergent (a vgpr can hold a uniform value).
// - TODO: the implicit use of $exec could serve as the indicator that an
// instruction is divergent.
// Return true if \p Reg is defined by a temporal-divergence copy: a plain
// COPY whose single implicit operand is $exec.
bool isTemporalDivergenceCopy(Register Reg) {
  MachineInstr *DefMI = MRI.getVRegDef(Reg);
  if (!DefMI->isCopy())
    return false;
  if (DefMI->getNumImplicitOperands() != 1)
    return false;
  return DefMI->implicit_operands().begin()->getReg() == TRI.getExec();
}

const RegisterBank *getRegBankToAssign(Register Reg) {
if (MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg))
if (!isTemporalDivergenceCopy(Reg) &&
(MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg)))
return SgprRB;
if (MRI.getType(Reg) == LLT::scalar(1))
return VccRB;
Expand Down Expand Up @@ -209,7 +225,8 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
MachineRegisterInfo &MRI = *B.getMRI();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegBankInfo());
RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegisterInfo(),
*ST.getRegBankInfo());
// Virtual registers at this point don't have register banks.
// Virtual registers in def and use operands of already inst-selected
// instruction have register class.
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SILowerI1Copies.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"

namespace llvm {
Expand Down Expand Up @@ -72,6 +73,11 @@ class PhiLoweringHelper {
LaneMaskRegAttrs = MRI->getVRegAttrs(LaneMask);
}

// Seed the lane-mask register attributes directly from \p Attrs, for callers
// that do not have an existing lane-mask register to copy them from.
void initializeLaneMaskRegisterAttributes(MachineRegisterInfo::VRegAttrs Attrs) {
  LaneMaskRegAttrs = Attrs;
}

bool isLaneMaskReg(Register Reg) const {
return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ body: |
; GFX10-NEXT: bb.2:
; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %56(s1), %bb.4
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %57(s1), %bb.4
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %29(s32), %bb.4, [[DEF]](s32), %bb.0
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: G_BRCOND [[COPY4]](s1), %bb.5
Expand All @@ -486,6 +486,7 @@ body: |
; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX10-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:_(s32) = G_AMDGPU_BUFFER_LOAD [[UV]](<4 x s32>), [[C7]](s32), [[PHI2]], [[C7]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8)
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AMDGPU_BUFFER_LOAD1]], [[PHI4]]
; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD]](s32), implicit $exec_lo
; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C8]]
; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
Expand All @@ -497,11 +498,11 @@ body: |
; GFX10-NEXT: bb.4:
; GFX10-NEXT: successors: %bb.2(0x80000000)
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[AMDGPU_BUFFER_LOAD]]
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY5]](s32), [[AMDGPU_BUFFER_LOAD]]
; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s1) = G_OR [[ICMP]], [[ICMP2]]
; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s1)
; GFX10-NEXT: [[C10:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
; GFX10-NEXT: G_BR %bb.2
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.5:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[PHI1]](s32), implicit $exec_lo
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.2:
; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
Expand All @@ -665,8 +666,8 @@ body: |
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI1]]
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[COPY6]](s1)
; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-NEXT: G_BR %bb.5
; GFX10-NEXT: {{ $}}
Expand All @@ -676,19 +677,19 @@ body: |
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.6:
; GFX10-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY7]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY9]](s1), [[PHI]](s32)
; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI]](s32)
; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-NEXT: G_BR %bb.7
; GFX10-NEXT: {{ $}}
Expand All @@ -702,7 +703,7 @@ body: |
; GFX10-NEXT: bb.8:
; GFX10-NEXT: successors: %bb.9(0x80000000)
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: G_STORE [[PHI1]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
; GFX10-NEXT: G_STORE [[COPY6]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.9:
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
Expand Down
Loading
Loading