[AMDGPU] Allow rematerialization of instructions with virtual register uses #124327
@llvm/pr-subscribers-backend-amdgpu

Author: Jeffrey Byrnes (jrbyrnes)

Changes

Remove the restriction that scheduling rematerialization candidates cannot have virtual register uses.

Currently, this only allows virtual register uses that are already live at the rematerialization point, so bring in allUsesAvailableAt to check for this condition. Because of this condition, the uses of the remats will already be live-in to the region, so the remat won't increase live-in pressure.

Add an expensive check to verify this condition.

Patch is 89.95 KiB, truncated to 20.00 KiB below (4 files affected); full version: https://github.com/llvm/llvm-project/pull/124327.diff
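For intuition, a hypothetical MIR sketch of the newly allowed case (register names invented for illustration, not taken from the patch): a candidate whose virtual register use is live throughout the region can be sunk to its use without adding a live-in.

```mir
  bb.0:
    %src:vgpr_32 = IMPLICIT_DEF
    ; Remat candidate with a virtual register use (%src).
    %v:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 %src, implicit $exec, implicit $mode

  bb.1:
    S_NOP 0, implicit %src          ; %src is live through here anyway

  bb.2:
    ; Sole use of %v: sinking the V_CVT here does not increase live-in
    ; pressure, because %src is already live-in to every region between
    ; the def and the use.
    S_NOP 0, implicit %v, implicit %src
```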
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index b00105ae9bd528..891f1dbed5d4b2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1615,6 +1615,59 @@ void GCNSchedStage::revertScheduling() {
DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}
+/// allUsesAvailableAt - Return true if all registers used by InstToRemat at
+/// OriginalIdx are also available with the same value at RematIdx.
+bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
+ SlotIndex OriginalIdx,
+ SlotIndex RematIdx) const {
+
+ LiveIntervals *LIS = DAG.LIS;
+ MachineRegisterInfo &MRI = DAG.MRI;
+ OriginalIdx = OriginalIdx.getRegSlot(true);
+ RematIdx = std::max(RematIdx, RematIdx.getRegSlot(true));
+ for (const MachineOperand &MO : InstToRemat->operands()) {
+ if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
+ continue;
+
+ if (!MO.getReg().isVirtual())
+ continue;
+
+ LiveInterval &LI = LIS->getInterval(MO.getReg());
+ const VNInfo *OVNI = LI.getVNInfoAt(OriginalIdx);
+ assert(OVNI);
+
+ // Don't allow rematerialization immediately after the original def.
+ // It would be incorrect if InstToRemat redefines the register.
+ // See PR14098.
+ if (SlotIndex::isSameInstr(OriginalIdx, RematIdx))
+ return false;
+
+ if (OVNI != LI.getVNInfoAt(RematIdx))
+ return false;
+
+ // Check that subrange is live at RematIdx.
+ if (LI.hasSubRanges()) {
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ unsigned SubReg = MO.getSubReg();
+ LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
+ : MRI.getMaxLaneMaskForVReg(MO.getReg());
+ for (LiveInterval::SubRange &SR : LI.subranges()) {
+ if ((SR.LaneMask & LM).none())
+ continue;
+ if (!SR.liveAt(RematIdx))
+ return false;
+
+ // Early exit if all used lanes are checked. No need to continue.
+ LM &= ~SR.LaneMask;
+ if (LM.none())
+ break;
+ }
+ assert(LM.none());
+ }
+ }
+ return true;
+}
+
void PreRARematStage::collectRematerializableInstructions() {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
@@ -1629,7 +1682,7 @@ void PreRARematStage::collectRematerializableInstructions() {
MachineOperand *Op = DAG.MRI.getOneDef(Reg);
MachineInstr *Def = Op->getParent();
- if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
+ if (Op->getSubReg() != 0 || !DAG.TII->isTriviallyReMaterializable(*Def))
continue;
MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
@@ -1644,8 +1697,13 @@ void PreRARematStage::collectRematerializableInstructions() {
auto It = DAG.LiveIns[I].find(Reg);
if (It != DAG.LiveIns[I].end() && !It->second.none()) {
if (DAG.RegionsWithMinOcc[I]) {
- RematerializableInsts[I][Def] = UseI;
- AddedToRematList = true;
+ SlotIndex DefIdx = DAG.LIS->getInstructionIndex(*Def);
+ SlotIndex UseIdx =
+ DAG.LIS->getInstructionIndex(*UseI).getRegSlot(true);
+ if (allUsesAvailableAt(Def, DefIdx, UseIdx)) {
+ RematerializableInsts[I][Def] = UseI;
+ AddedToRematList = true;
+ }
}
// Collect regions with rematerializable reg as live-in to avoid
@@ -1719,6 +1777,27 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
Register DefReg = Def->getOperand(0).getReg();
TotalSinkableRegs +=
SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
+#ifdef EXPENSIVE_CHECKS
+ // All uses are known to be available / live at the remat point. Thus, the
+ // uses should already be live in to the region.
+ for (MachineOperand &MO : Def->operands()) {
+ if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
+ continue;
+
+ Register UseReg = MO.getReg();
+ if (!UseReg.isVirtual())
+ continue;
+
+ LiveInterval &LI = LIS->getInterval(UseReg);
+ LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
+ if (LI.hasSubRanges() && MO.getSubReg())
+ LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
+
+ assert(NewLiveIns[I].contains(UseReg));
+ LaneBitmask LiveInMask = NewLiveIns[I][UseReg];
+ assert((LiveInMask & LM) == LM);
+ }
+#endif
}
int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
@@ -1842,18 +1921,6 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
return true;
}
-// Copied from MachineLICM
-bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
- if (!DAG.TII->isTriviallyReMaterializable(MI))
- return false;
-
- for (const MachineOperand &MO : MI.all_uses())
- if (MO.getReg().isVirtual())
- return false;
-
- return true;
-}
-
// When removing, we will have to check both beginning and ending of the region.
// When inserting, we will only have to check if we are inserting NewMI in front
// of a scheduling region and do not need to check the ending since we will only
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 44db834a41f828..38bb16d5b9b056 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -448,14 +448,15 @@ class PreRARematStage : public GCNSchedStage {
// and single use outside the defining block into RematerializableInsts.
void collectRematerializableInstructions();
- bool isTriviallyReMaterializable(const MachineInstr &MI);
-
// TODO: Should also attempt to reduce RP of SGPRs and AGPRs
// Attempt to reduce RP of VGPR by sinking trivially rematerializable
// instructions. Returns true if we were able to sink instruction(s).
bool sinkTriviallyRematInsts(const GCNSubtarget &ST,
const TargetInstrInfo *TII);
+ bool allUsesAvailableAt(const MachineInstr *InstToRemat,
+ SlotIndex OriginalIdx, SlotIndex RematIdx) const;
+
public:
bool initGCNSchedStage() override;
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index 9f264de531950b..7252ccfb836de0 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -84,13 +84,11 @@ body: |
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
S_NOP 0
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
@@ -191,14 +189,12 @@ body: |
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
S_NOP 0, implicit %24
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %23
S_NOP 0, implicit %0, implicit %1
@@ -300,7 +296,6 @@ body: |
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -308,7 +303,6 @@ body: |
S_NOP 0, implicit %23
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
@@ -408,7 +402,6 @@ body: |
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -416,7 +409,6 @@ body: |
S_NOP 0, implicit %22, implicit %23
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
@@ -529,7 +521,6 @@ body: |
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -537,14 +528,12 @@ body: |
S_NOP 0, implicit %23
bb.2:
- ; predcessors: %bb.1
successors: %bb.3
%25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
S_NOP 0
bb.3:
- ; predecessors: %bb.2
successors: %bb.4
%26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
@@ -552,7 +541,6 @@ body: |
S_NOP 0, implicit %25
bb.4:
- ; predcessors: %bb.3
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
@@ -666,7 +654,6 @@ body: |
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -674,7 +661,6 @@ body: |
S_NOP 0, implicit %23, implicit %22
bb.2:
- ; predcessors: %bb.1
successors: %bb.3
%25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode
@@ -682,7 +668,6 @@ body: |
S_NOP 0
bb.3:
- ; predecessors: %bb.2
successors: %bb.4
%27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
@@ -690,7 +675,6 @@ body: |
S_NOP 0, implicit %25, implicit %26
bb.4:
- ; predcessors: %bb.3
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
@@ -949,14 +933,12 @@ body: |
undef %23.sub0:vreg_64 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%23.sub1:vreg_64 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
S_NOP 0, implicit %23
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
@@ -1053,7 +1035,6 @@ body: |
undef %21.sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
@@ -1062,7 +1043,6 @@ body: |
S_NOP 0, implicit %21
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
@@ -1581,7 +1561,6 @@ body: |
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -1589,7 +1568,6 @@ body: |
S_NOP 0, implicit %24, implicit %25
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %23
S_NOP 0, implicit %0, implicit %1
@@ -2528,14 +2506,12 @@ body: |
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
S_NOP 0, implicit %24
bb.2:
- ; predcessors: %bb.1
successors: %bb.3
S_NOP 0, implicit %23
@@ -2543,7 +2519,6 @@ body: |
S_NOP 0
bb.3:
- ; predecessors: %bb.2
successors: %bb.4
%26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
@@ -2551,7 +2526,6 @@ body: |
S_NOP 0, implicit %26, implicit %27
bb.4:
- ; predcessors: %bb.3
S_NOP 0, implicit %25
S_NOP 0, implicit %0, implicit %1
@@ -2650,7 +2624,6 @@ body: |
%21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%21.sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
@@ -2658,7 +2631,6 @@ body: |
S_NOP 0, implicit %21
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
@@ -2759,7 +2731,6 @@ body: |
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -2767,7 +2738,6 @@ body: |
S_NOP 0, implicit %23
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %23
S_NOP 0, implicit %0, implicit %1
@@ -5030,7 +5000,6 @@ body: |
%21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%21.sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
@@ -5038,7 +5007,6 @@ body: |
S_NOP 0, implicit %21
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
@@ -5137,14 +5105,12 @@ body: |
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
S_NOP 0, implicit %23, implicit %24
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
@@ -5242,7 +5208,6 @@ body: |
%22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
@@ -5250,7 +5215,6 @@ body: |
S_NOP 0, implicit %22
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
@@ -5348,7 +5312,6 @@ body: |
%22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
@@ -5357,7 +5320,6 @@ body: |
S_NOP 0, implicit %22
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
@@ -5456,7 +5418,6 @@ body: |
%22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
@@ -5466,7 +5427,6 @@ body: |
S_NOP 0, implicit %22
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
@@ -5562,14 +5522,12 @@ body: |
%22:vreg_64 = nofpexcept V_CVT_F64_I32_e32 22, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
S_NOP 0, implicit %22, implicit %23
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %0, implicit %1
S_NOP 0, implicit %2, implicit %3
@@ -5669,14 +5627,12 @@ body: |
undef %23.sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
S_NOP 0, implicit %24
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %23.sub1
S_NOP 0, implicit %0, implicit %1
@@ -5779,14 +5735,12 @@ body: |
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
S_NOP 0, implicit %24
bb.2:
- ; predcessors: %bb.1
DBG_VALUE %23, 0, 0
S_NOP 0, implicit %23
@@ -5889,14 +5843,12 @@ body: |
%23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode
bb.1:
- ; predecessors: %bb.0
successors: %bb.2
%24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
S_NOP 0, implicit %24
bb.2:
- ; predcessors: %bb.1
S_NOP 0, implicit %23
S_NOP 0, implicit %0, implicit %1
@@ -5914,3 +5866,1011 @@ body: |
S_ENDPGM 0
...
+---
+name: remat_virtual_vgpr_occ_6
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX908-LABEL: name: remat_virtual_vgpr_occ_6
+ ; GFX908: bb.0:
+ ; GFX908-NEXT: successors: %bb.1(0x80000000)
+ ; GFX908-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
+ ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: dead [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF18]], implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GFX908-NEXT: [[V_CVT_I32_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF]], implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F32_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF1]], implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F32_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF2]], implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF3]], implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF4]], implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F32_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[DEF5]], implicit $exec, implicit $mode
+ ; GFX908-NEXT: [[V_CVT_I32_F32_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[D...
[truncated]
Bring in #124366 to handle PhysReg uses in potential Remat candidates
In the latest revision I disallowed rematerializing an instruction if it depends on another planned remat. The main challenge with such remats is performing the actual rematerializations in the correct order: since we walk over the region to recalculate RP after each remat, we may temporarily produce illegal code. After #118722, we can remat in batches before doing the RP calculations, so this will no longer be a problem.
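A hypothetical MIR sketch of that ordering hazard (register names and opcodes invented for illustration, not taken from the patch):

```mir
  ; Both %a and %b are remat candidates, and %b uses %a.
  %a:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
  %b:vgpr_32 = V_ADD_U32_e32 %a, %a, implicit $exec
  ...
  ; If %b is sunk to this point before %a is, the intermediate MIR has
  ; %b reading %a above %a's new def -- illegal until %a is moved too.
  S_NOP 0, implicit %b
```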
@@ -1664,7 +1668,6 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
       if (LM.none())
         break;
     }
     assert(LM.none());
Testing had issues with this assert -- would like to understand why -- looking into it.
We may have a non-empty LM if our register does not have a subrange for every subregister. I've attached a reproducer of this condition (admittedly, it looks a bit funky, but it represents a case in hipBlender).
If a superregister has undef subregisters, LIS won't produce a subrange for those subregisters. Thus, when we walk over the subranges for a full-register use, the sum of the subrange masks will not be the full register mask.
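A minimal hypothetical reproduction of that shape (mirroring the undef-subregister pattern already in the test above; names invented):

```mir
  ; Only sub1 is ever written, so LIS creates a subrange for sub1 alone.
  undef %v.sub1:vreg_64 = V_MOV_B32_e32 23, implicit $exec
  ; Full-register read: walking %v's subranges clears sub1's lanes from
  ; LM, but sub0's lanes are never cleared, so LM ends up non-empty.
  S_NOP 0, implicit %v
```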
PSDB was fine except for the assert. I also modified the patch to allow for more rematerializations (e.g. not only for high-pressure regions, always doing the remats even if they don't help, etc.), and there were no issues with PSDB.
LGTM
Okay -- I had to fix the expensive check because not every subreg will have a subrange. Because of this, we may run into a situation where the LiveInMask != the register mask even though we aren't adding any live regs. I slightly modified the lit test to trigger this condition. If there isn't any objection, I will land.
4ce1f9079d4d3 [AMDGPU] Allow rematerialization of instructions with virtual register uses (#124327) made changes that require an ordered traversal of a DenseMap. Changing it to MapVector, which respects insertion order.
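A minimal runnable sketch of the difference (illustrative keys and values, not the scheduler's actual data): DenseMap's iteration order is unspecified, while llvm::MapVector iterates in insertion order.

```cpp
#include "llvm/ADT/MapVector.h"
#include <cstdio>

int main() {
  // MapVector keeps a vector of keys alongside the map, so iteration
  // follows insertion order -- the remat traversal stays deterministic.
  llvm::MapVector<int, const char *> M;
  M[42] = "first";
  M[7] = "second";
  M[19] = "third";
  for (const auto &KV : M)
    std::printf("%d -> %s\n", KV.first, KV.second); // 42, 7, 19 order
  return 0;
}
```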