
Commit 6567062

arsenm authored and easyonaadit committed
AMDGPU: Handle v_add* in eliminateFrameIndex (llvm#102346)
1 parent 19261ad commit 6567062

18 files changed: +4299 −756 lines
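This commit teaches `SIRegisterInfo::eliminateFrameIndex` to rewrite `v_add_u32`, `v_add_co_u32`, and `s_add_i32` instructions that consume a frame index in place: the frame object's offset is folded into an existing immediate operand where possible, the add degrades to a `v_mov_b32`/`COPY` when everything folds to zero, and a register is scavenged only when an operand really must be materialized. As a rough orientation before the diff, here is a minimal, self-contained C++ sketch of just that folding decision. The `OtherOperand` and `foldFrameOffset` names are invented for illustration; they stand in for the real `MachineOperand` logic and ignore clamp modifiers, VCC, and operand legalization.

```cpp
#include <cstdint>
#include <iostream>
#include <optional>

// Toy stand-in for the add's non-frame-index operand ("OtherOp" in the diff).
struct OtherOperand {
  std::optional<int64_t> imm; // engaged if the operand is an immediate
  bool aliasesDst = false;    // same register as the add's destination
};

// What the rewritten instruction degrades to.
enum class Lowering { KeepAdd, MovOrCopy, IdentityCopyErased };

// Fold FrameInfo.getObjectOffset(Index) into the add, preferring an in-place
// immediate update so no extra register has to be scavenged.
Lowering foldFrameOffset(OtherOperand &other, int64_t objectOffset,
                         bool haveFrameReg) {
  if (other.imm) {
    *other.imm += objectOffset; // OtherOp->setImm(OtherOp->getImm() + Offset)
    objectOffset = 0;
  }
  // A frame register plus a nonzero (or register) other operand: the add
  // stays, with the frame-index operand rewritten to an immediate.
  if ((!other.imm || *other.imm != 0) && haveFrameReg)
    return Lowering::KeepAdd;
  if (objectOffset != 0)
    return Lowering::KeepAdd; // the frame index itself becomes the immediate
  if (other.aliasesDst)
    return Lowering::IdentityCopyErased; // add of 0 into itself: erase the MI
  return Lowering::MovOrCopy; // degenerate add: mutate to V_MOV_B32 / COPY
}

int main() {
  // Kernel-style access: no frame register, object offset 48, existing imm 16.
  OtherOperand other{16, false};
  Lowering l = foldFrameOffset(other, 48, /*haveFrameReg=*/false);
  std::cout << "folded imm = " << *other.imm << ", mov? "
            << (l == Lowering::MovOrCopy) << '\n'; // folded imm = 64, mov? 1
}
```

The real implementation below additionally unswizzles the frame register and re-legalizes the resulting operands.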

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 307 additions & 15 deletions
```diff
@@ -2216,7 +2216,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
          "unreserved scratch RSRC register");
 
-  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
@@ -2445,7 +2445,299 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     MI->eraseFromParent();
     return true;
   }
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    // TODO: Handle sub, and, or.
+    unsigned NumDefs = MI->getNumExplicitDefs();
+    unsigned Src0Idx = NumDefs;
+
+    bool HasClamp = false;
+    MachineOperand *VCCOp = nullptr;
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::V_ADD_U32_e32:
+      break;
+    case AMDGPU::V_ADD_U32_e64:
+      HasClamp = MI->getOperand(3).getImm();
+      break;
+    case AMDGPU::V_ADD_CO_U32_e32:
+      VCCOp = &MI->getOperand(3);
+      break;
+    case AMDGPU::V_ADD_CO_U32_e64:
+      VCCOp = &MI->getOperand(1);
+      HasClamp = MI->getOperand(4).getImm();
+      break;
+    default:
+      break;
+    }
+    bool DeadVCC = !VCCOp || VCCOp->isDead();
+    MachineOperand &DstOp = MI->getOperand(0);
+    Register DstReg = DstOp.getReg();
+
+    unsigned OtherOpIdx =
+        FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
+    MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
+
+    unsigned Src1Idx = Src0Idx + 1;
+    Register MaterializedReg = FrameReg;
+    Register ScavengedVGPR;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // We should just do an in-place update of the result register. However,
+      // the value there may also be used by the add, in which case we need a
+      // temporary register.
+      //
+      // FIXME: The scavenger is not finding the result register in the
+      // common case where the add does not read the register.
+
+      ScavengedVGPR = RS->scavengeRegisterBackwards(
+          AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
+
+      // TODO: If we have a free SGPR, it's sometimes better to use a scalar
+      // shift.
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
+          .addDef(ScavengedVGPR, RegState::Renamable)
+          .addImm(ST.getWavefrontSizeLog2())
+          .addReg(FrameReg);
+      MaterializedReg = ScavengedVGPR;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp->isImm()) {
+      OtherOp->setImm(OtherOp->getImm() + Offset);
+      Offset = 0;
+    }
+
+    if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
+      if (ST.enableFlatScratch() &&
+          !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
+        // We didn't need the shift above, so we have an SGPR for the frame
+        // register, but may have a VGPR only operand.
+        //
+        // TODO: On gfx10+, we can easily change the opcode to the e64 version
+        // and use the higher constant bus restriction to avoid this copy.
+
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .addReg(MaterializedReg,
+                    MaterializedReg != FrameReg ? RegState::Kill : 0);
+        MaterializedReg = ScavengedVGPR;
+      }
+
+      // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
+      // is not live, we could use a scalar add + vector add instead of 2
+      // vector adds.
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
+                        .addDef(DstReg, RegState::Renamable);
+      if (NumDefs == 2)
+        AddI32.add(MI->getOperand(1));
+
+      unsigned MaterializedRegFlags =
+          MaterializedReg != FrameReg ? RegState::Kill : 0;
+
+      if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
+        // If we know we have a VGPR already, it's more likely the other
+        // operand is a legal vsrc0.
+        AddI32
+            .add(*OtherOp)
+            .addReg(MaterializedReg, MaterializedRegFlags);
+      } else {
+        // Commute operands to avoid violating VOP2 restrictions. This will
+        // typically happen when using scratch.
+        AddI32
+            .addReg(MaterializedReg, MaterializedRegFlags)
+            .add(*OtherOp);
+      }
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
+          MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
+        AddI32.addImm(0); // clamp
+
+      if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
+        AddI32.setOperandDead(3); // Dead vcc
+
+      MaterializedReg = DstReg;
+
+      OtherOp->ChangeToRegister(MaterializedReg, false);
+      OtherOp->setIsKill(true);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else if (Offset != 0) {
+      assert(!MaterializedReg);
+      FIOp->ChangeToImmediate(Offset);
+      Offset = 0;
+    } else {
+      if (DeadVCC && !HasClamp) {
+        assert(Offset == 0);
+
+        // TODO: Losing kills and implicit operands. Just mutate to copy and
+        // let lowerCopy deal with it?
+        if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
+          // Folded to an identity copy.
+          MI->eraseFromParent();
+          return true;
+        }
+
+        // The immediate value should be in OtherOp
+        MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        MI->removeOperand(FIOperandNum);
+
+        unsigned NumOps = MI->getNumOperands();
+        for (unsigned I = NumOps - 2; I >= 2; --I)
+          MI->removeOperand(I);
+
+        if (NumDefs == 2)
+          MI->removeOperand(1);
+
+        // The code below can't deal with a mov.
+        return true;
+      }
+
+      // This folded to a constant, but we have to keep the add around for
+      // pointless implicit defs or clamp modifier.
+      FIOp->ChangeToImmediate(0);
+    }
+
+    // Try to improve legality by commuting.
+    if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
+      std::swap(FIOp, OtherOp);
+      std::swap(FIOperandNum, OtherOpIdx);
+    }
 
+    for (unsigned SrcIdx : {Src1Idx, Src0Idx}) {
+      // Depending on operand constraints we may need to insert another copy.
+      if (!TII->isOperandLegal(*MI, SrcIdx)) {
+        // If commuting didn't make the operands legal, we need to materialize
+        // in a register.
+        // TODO: Can use SGPR on gfx10+ in some cases.
+        if (!ScavengedVGPR) {
+          ScavengedVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
+              /*SPAdj=*/0);
+        }
+
+        assert(ScavengedVGPR != DstReg);
+
+        MachineOperand &Src = MI->getOperand(SrcIdx);
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
+            .add(Src);
+
+        Src.ChangeToRegister(ScavengedVGPR, false);
+        Src.setIsKill(true);
+      }
+    }
+
+    // Fold out add of 0 case that can appear in kernels.
+    if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
+      if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
+        BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
+      }
+
+      MI->eraseFromParent();
+    }
+
+    return true;
+  }
+  case AMDGPU::S_ADD_I32: {
+    // TODO: Handle s_or_b32, s_and_b32.
+    unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
+    MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
+
+    assert(FrameReg || MFI->isBottomOfStack());
+
+    MachineOperand &DstOp = MI->getOperand(0);
+    const DebugLoc &DL = MI->getDebugLoc();
+    Register MaterializedReg = FrameReg;
+
+    // Defend against live scc, which should never happen in practice.
+    bool DeadSCC = MI->getOperand(3).isDead();
+
+    Register TmpReg;
+
+    if (FrameReg && !ST.enableFlatScratch()) {
+      // FIXME: In the common case where the add does not also read its result
+      // (i.e. this isn't a reg += fi), it's not finding the dest reg as
+      // available.
+      TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, MI,
+                                             false, 0);
+      BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
+          .addDef(TmpReg, RegState::Renamable)
+          .addReg(FrameReg)
+          .addImm(ST.getWavefrontSizeLog2())
+          .setOperandDead(3); // Set SCC dead
+      MaterializedReg = TmpReg;
+    }
+
+    int64_t Offset = FrameInfo.getObjectOffset(Index);
+
+    // For the non-immediate case, we could fall through to the default
+    // handling, but we do an in-place update of the result register here to
+    // avoid scavenging another register.
+    if (OtherOp.isImm()) {
+      OtherOp.setImm(OtherOp.getImm() + Offset);
+      Offset = 0;
+
+      if (MaterializedReg)
+        FIOp->ChangeToRegister(MaterializedReg, false);
+      else
+        FIOp->ChangeToImmediate(0);
+    } else if (MaterializedReg) {
+      // If we can't fold the other operand, do another increment.
+      Register DstReg = DstOp.getReg();
+
+      if (!TmpReg && MaterializedReg == FrameReg) {
+        TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
+                                               MI, false, 0);
+        DstReg = TmpReg;
+      }
+
+      auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
+                        .addDef(DstReg, RegState::Renamable)
+                        .addReg(MaterializedReg, RegState::Kill)
+                        .add(OtherOp);
+      if (DeadSCC)
+        AddI32.setOperandDead(3);
+
+      MaterializedReg = DstReg;
+
+      OtherOp.ChangeToRegister(MaterializedReg, false);
+      OtherOp.setIsKill(true);
+      OtherOp.setIsRenamable(true);
+      FIOp->ChangeToImmediate(Offset);
+    } else {
+      // If we don't have any other offset to apply, we can just directly
+      // interpret the frame index as the offset.
+      FIOp->ChangeToImmediate(Offset);
+    }
+
+    if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
+      assert(Offset == 0);
+      MI->removeOperand(3);
+      MI->removeOperand(OtherOpIdx);
+      MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
+      assert(Offset == 0);
+      MI->removeOperand(3);
+      MI->removeOperand(FIOperandNum);
+      MI->setDesc(
+          TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
+    }
+
+    assert(!FIOp->isFI());
+    return true;
+  }
   default: {
     // Other access to frame index
     const DebugLoc &DL = MI->getDebugLoc();
```
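The commute-then-copy cleanup at the end of the `v_add*` case above deals with VOP2 encoding limits: src1 must be a VGPR, while src0 also accepts an SGPR or an immediate. Here is a deliberately simplified model of that legalization order; the `Kind` and `ToyVop2` names are invented, and the real checks go through `TII->isOperandLegal()` and `TII->commuteInstruction()`:

```cpp
#include <iostream>
#include <utility>

// Toy operand kinds for a VOP2 add. The only rule modeled here is
// "src1 must be a VGPR"; real legality is more involved.
enum class Kind { VGPR, SGPR, Imm };

struct ToyVop2 {
  Kind src0, src1;
};

bool isLegal(const ToyVop2 &add) { return add.src1 == Kind::VGPR; }

// Mirrors the cleanup order in the patch: try commuting first (free when it
// works), then fall back to a V_MOV_B32 into a scavenged VGPR.
int legalize(ToyVop2 &add) {
  int copies = 0;
  if (!isLegal(add))
    std::swap(add.src0, add.src1); // TII->commuteInstruction(*MI)
  if (!isLegal(add)) {
    add.src1 = Kind::VGPR; // copy the offender into ScavengedVGPR
    ++copies;
  }
  return copies;
}

int main() {
  ToyVop2 a{Kind::VGPR, Kind::SGPR}; // frame register landed in src1
  ToyVop2 b{Kind::SGPR, Kind::Imm};  // e.g. SGPR base + folded literal offset
  std::cout << legalize(a) << ' ' << legalize(b) << '\n'; // prints: 0 1
}
```

Commuting is free when it succeeds; only when an operand is still illegal afterwards does the code pay for a `V_MOV_B32` into a scavenged VGPR, which is what the `{Src1Idx, Src0Idx}` loop above implements.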
```diff
@@ -2459,7 +2751,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     // The offset is always swizzled, just replace it
     if (FrameReg)
-      FIOp.ChangeToRegister(FrameReg, false);
+      FIOp->ChangeToRegister(FrameReg, false);
 
     MachineOperand *OffsetOp =
         TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
@@ -2512,18 +2804,18 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
     }
 
     if (!FrameReg) {
-      FIOp.ChangeToImmediate(Offset);
-      if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
+      FIOp->ChangeToImmediate(Offset);
+      if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
         return false;
     }
 
     // We need to use register here. Check if we can use an SGPR or need
     // a VGPR.
-    FIOp.ChangeToRegister(AMDGPU::M0, false);
-    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
+    FIOp->ChangeToRegister(AMDGPU::M0, false);
+    bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
 
     if (!Offset && FrameReg && UseSGPR) {
-      FIOp.setReg(FrameReg);
+      FIOp->setReg(FrameReg);
       return false;
     }
 
@@ -2532,8 +2824,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
     Register TmpReg =
        RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
-    FIOp.setReg(TmpReg);
-    FIOp.setIsKill();
+    FIOp->setReg(TmpReg);
+    FIOp->setIsKill();
 
     if ((!FrameReg || !Offset) && TmpReg) {
       unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2562,8 +2854,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       if (!TmpSReg) {
         // Use frame register and restore it after.
         TmpSReg = FrameReg;
-        FIOp.setReg(FrameReg);
-        FIOp.setIsKill(false);
+        FIOp->setReg(FrameReg);
+        FIOp->setIsKill(false);
       }
 
       if (NeedSaveSCC) {
@@ -2802,7 +3094,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
         MI->eraseFromParent();
         return true;
       }
-      FIOp.ChangeToRegister(ResultReg, false, false, true);
+      FIOp->ChangeToRegister(ResultReg, false, false, true);
       return false;
     }
 
@@ -2833,13 +3125,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       // If the offset is simply too big, don't convert to a scratch wave offset
       // relative index.
 
-      FIOp.ChangeToImmediate(Offset);
-      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
+      FIOp->ChangeToImmediate(Offset);
+      if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
         Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
                                                         MI, false, 0);
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
             .addImm(Offset);
-        FIOp.ChangeToRegister(TmpReg, false, false, true);
+        FIOp->ChangeToRegister(TmpReg, false, false, true);
       }
     }
   }
```
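Both new cases start by unswizzling the frame register when flat scratch is disabled: the SGPR stack pointer holds a wave-scaled scratch offset, so shifting right by `ST.getWavefrontSizeLog2()` (the new `V_LSHRREV_B32_e64`/`S_LSHR_B32` instructions) recovers the per-lane byte offset the add operates on. A minimal sketch of that scaling, with illustrative numbers for wave64 (a shift of 6):

```cpp
#include <cstdint>
#include <iostream>

// Without flat scratch, SGPR stack values are "swizzled": scaled up by the
// wavefront size. Shifting right by the wavefront-size log2 recovers the
// per-lane byte offset.
constexpr uint32_t perLaneOffset(uint32_t swizzledFrameReg,
                                 uint32_t wavefrontSizeLog2) {
  return swizzledFrameReg >> wavefrontSizeLog2;
}

// Hypothetical wave64 example: a swizzled SP of 0x4000 is lane offset 0x100.
static_assert(perLaneOffset(0x4000, /*wave64*/ 6) == 0x100);

int main() {
  std::cout << perLaneOffset(0x4000, 6) << '\n'; // prints 256
}
```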
