[AMDGPU] Add target hook to isGlobalMemoryObject #112781
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Austin Kerbow (kerbowa)

Changes: We want special handling for IGLP instructions in the scheduler, but they should still be treated as having side effects by other passes. Add a target hook to the ScheduleDAGInstrs DAG builder so that we have more control over this.

Patch is 20.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112781.diff

8 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 822b06f080fa64..d9737ff007a85b 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -374,6 +374,10 @@ namespace llvm {
void addVRegDefDeps(SUnit *SU, unsigned OperIdx);
void addVRegUseDeps(SUnit *SU, unsigned OperIdx);
+ /// Returns true if MI is an instruction we are unable to reason about
+ /// (like a call or something with unmodeled side effects).
+ virtual bool isGlobalMemoryObject(MachineInstr *MI);
+
/// Returns a mask for which lanes get read/written by the given (register)
/// machine operand.
LaneBitmask getLaneMaskForMO(const MachineOperand &MO) const;
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index a0632eb17e65e6..a86e120d689bd3 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -549,7 +549,7 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
/// Returns true if MI is an instruction we are unable to reason about
/// (like a call or something with unmodeled side effects).
-static inline bool isGlobalMemoryObject(MachineInstr *MI) {
+bool ScheduleDAGInstrs::isGlobalMemoryObject(MachineInstr *MI) {
return MI->isCall() || MI->hasUnmodeledSideEffects() ||
(MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad());
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index a367db70627748..f58cad58e33edc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -240,23 +240,6 @@ class SchedGroup {
}
};
-// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
-static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
- assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
- SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
- SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
-
- while (!SU.Preds.empty())
- for (auto &P : SU.Preds)
- SU.removePred(P);
-
- while (!SU.Succs.empty())
- for (auto &S : SU.Succs)
- for (auto &SP : S.getSUnit()->Preds)
- if (SP.getSUnit() == &SU)
- S.getSUnit()->removePred(SP);
-}
-
using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;
@@ -460,7 +443,6 @@ void PipelineSolver::makePipeline() {
// Command line requested IGroupLP doesn't have SGBarr
if (!SGBarr)
continue;
- resetEdges(*SGBarr, DAG);
SG.link(*SGBarr, false);
}
}
@@ -2567,7 +2549,6 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
initSchedGroupBarrierPipelineStage(R);
FoundSB = true;
} else if (Opc == AMDGPU::IGLP_OPT) {
- resetEdges(*R, DAG);
if (!FoundSB && !FoundIGLP) {
FoundIGLP = true;
ShouldApplyIGLP = initIGLPOpt(*R);
@@ -2589,7 +2570,6 @@ void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
// Remove all existing edges from the SCHED_BARRIER that were added due to the
// instruction having side effects.
- resetEdges(SchedBarrier, DAG);
LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
<< MI.getOperand(0).getImm() << "\n");
auto InvertedMask =
@@ -2647,7 +2627,6 @@ void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
std::vector<SUnit>::reverse_iterator RIter) {
// Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
// to the instruction having side effects.
- resetEdges(*RIter, DAG);
MachineInstr &SGB = *RIter->getInstr();
assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
int32_t SGMask = SGB.getOperand(0).getImm();
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 57f517bfba0ebb..f7c4eb56c92af3 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -27,6 +27,7 @@
#include "AMDGPUIGroupLP.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#define DEBUG_TYPE "machine-scheduler"
@@ -1748,6 +1749,17 @@ void GCNScheduleDAGMILive::updateRegionBoundaries(
}
}
+static bool isIGLPInstr(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case AMDGPU::IGLP_OPT:
+ case AMDGPU::SCHED_BARRIER:
+ case AMDGPU::SCHED_GROUP_BARRIER:
+ return true;
+ default:
+ return false;
+ }
+}
+
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
return any_of(*DAG, [](MachineBasicBlock::iterator MI) {
unsigned Opc = MI->getOpcode();
@@ -1755,11 +1767,25 @@ static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
});
}
+bool GCNScheduleDAGMILive::isGlobalMemoryObject(MachineInstr *MI) {
+ if (isIGLPInstr(MI))
+ return false;
+
+ return ScheduleDAGInstrs::isGlobalMemoryObject(MI);
+}
+
GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
bool RemoveKillFlags)
: ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
+bool GCNPostScheduleDAGMILive::isGlobalMemoryObject(MachineInstr *MI) {
+ if (isIGLPInstr(MI))
+ return false;
+
+ return ScheduleDAGInstrs::isGlobalMemoryObject(MI);
+}
+
void GCNPostScheduleDAGMILive::schedule() {
HasIGLPInstrs = hasIGLPInstrs(this);
if (HasIGLPInstrs) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 64d517038f90e0..98d7f46ee79ef6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -285,6 +285,8 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID);
+ bool isGlobalMemoryObject(MachineInstr *MI) override;
+
public:
GCNScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S);
@@ -469,6 +471,8 @@ class GCNPostScheduleDAGMILive final : public ScheduleDAGMI {
bool HasIGLPInstrs = false;
+ bool isGlobalMemoryObject(MachineInstr *MI) override;
+
public:
void schedule() override;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0473e017f193cb..2c5773f5a34809 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -24,9 +24,6 @@
; GCN-NEXT: ; implicit-def: $vgpr79
; GCN-NEXT: ; implicit-def: $vgpr80
; GCN-NEXT: ; implicit-def: $vgpr91
- ; GCN-NEXT: ;;#ASMSTART
- ; GCN-NEXT: s_waitcnt vmcnt(8)
- ; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
; GCN-NEXT: ; iglp_opt mask(0x00000002)
; GCN-NEXT: s_nop 1
@@ -476,6 +473,9 @@
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 3168e05b816bee..1eb918cd198bf3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -285,6 +285,41 @@ entry:
ret void
}
+define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+; GCN-LABEL: test_iglp_opt_asm_sideeffect:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffc, v0
+; GCN-NEXT: ; iglp_opt mask(0x00000000)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_u32_e32 v1, s0, v0
+; GCN-NEXT: ds_read_b32 v1, v1
+; GCN-NEXT: v_add_u32_e32 v0, s1, v0
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ds_write_b32 v0, v1
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ds_read_b32 v0, v2 offset:256
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ds_write_b32 v1, v0 offset:256
+; GCN-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %load.0.addr = getelementptr float, ptr addrspace(3) %in, i32 %idx
+ %load.0 = load float, ptr addrspace(3) %load.0.addr
+ %store.0.addr = getelementptr float, ptr addrspace(3) %out, i32 %idx
+ store float %load.0, ptr addrspace(3) %store.0.addr
+ call void asm sideeffect "", ""() #1
+ call void @llvm.amdgcn.iglp.opt(i32 0) #1
+ %load.1.addr = getelementptr float, ptr addrspace(3) %in, i32 64
+ %load.1 = load float, ptr addrspace(3) %load.1.addr
+ %store.1.addr = getelementptr float, ptr addrspace(3) %out, i32 64
+ store float %load.1, ptr addrspace(3) %store.1.addr
+ ret void
+}
declare void @llvm.amdgcn.iglp.opt(i32) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir
index bdfc8227fdccb1..7295506213c4b2 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir
@@ -96,10 +96,10 @@ body: |
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: S_NOP 0
; CHECK-NEXT: SCHED_BARRIER 1
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
- ; CHECK-NEXT: S_NOP 0
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:sreg_64_xexec_xnull = IMPLICIT_DEF
@@ -163,19 +163,19 @@ body: |
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
- ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
- ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+ ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_NOP 0
; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: SCHED_BARRIER 4
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
- ; CHECK-NEXT: S_NOP 0
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]]
%0:sreg_64_xexec_xnull = IMPLICIT_DEF
@@ -258,10 +258,10 @@ body: |
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
; CHECK-NEXT: SCHED_BARRIER 16
; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
@@ -290,10 +290,10 @@ body: |
; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
; CHECK-NEXT: SCHED_BARRIER 32
; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
@@ -354,9 +354,9 @@ body: |
; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec
- ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
; CHECK-NEXT: SCHED_BARRIER 128
; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec
; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
@@ -386,9 +386,9 @@ body: |
; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec
- ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
; CHECK-NEXT: S_NOP 0
+ ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
; CHECK-NEXT: SCHED_BARRIER 256
; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec
; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
@@ -453,7 +453,6 @@ body: |
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
- ; CHECK-NEXT: S_NOP 0
; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec...
[truncated]
A previous version of this patch was posted a while ago on Phabricator. We need to address this bug now since there are confirmed cases where it is impacting correctness. I've experimented with about four different patches to solve this. After doing that work, I've come full circle back to this patch. It is the best and most correct way to solve the problem. I'll explain why below.

For background, we want to have custom DAG building with IGLP instructions, so preferably the generic DAG builder would not add any edges for these instructions, and we can then add custom edges in our IGLP mutation. The IGLP instructions do have unmodeled side effects, but our backend can reason about them and relax the edges that would normally be added. To do that, this patch introduces a target hook to the DAG builder to decide whether or not to add edges for side-effect-producing instructions.

In that old review Matt suggested an alternative approach: add a pseudo memory operand to these instructions that doesn't alias anything. This approach doesn't work since it leads to the exact same problem as before. Even if an instruction doesn't alias anything, there will still be valid edges added between other instructions with unmodeled side effects. We end up with the same bug, but it will be even more of an edge case.

Another approach is to remove all of these instructions from the block before invoking the scheduler and DAG builder, record information about them, apply the DAG mutations accordingly, and finally re-add the instructions after scheduling. This is needlessly expensive and hacky.

A final approach is to modify the instruction description of these instructions to not have side effects in our backend before scheduling. An obvious hack.

The best approach is instead to admit that these instructions do have difficult-to-model side effects, but that our backend can reason about them, so we want to modify the DAG builder accordingly. In my opinion, this hook is the best way to do that.
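For context, the behavior this hook gates can be paraphrased from the DAG builder roughly as follows. This is a simplified sketch of the logic in ScheduleDAGInstrs::buildSchedGraph, not the verbatim upstream code, with the load/store bookkeeping elided:

// Simplified paraphrase of the global-memory-object handling in
// ScheduleDAGInstrs::buildSchedGraph (the region is walked bottom-up).
// Anything classified as a global memory object is chained against all
// tracked memory accesses, even provably non-aliasing ones, and then
// becomes the barrier that everything scheduled above it orders against.
if (isGlobalMemoryObject(&MI)) {
  addBarrierChain(Stores); // conservative edges to every tracked store
  addBarrierChain(Loads);  // ...and every tracked load
  if (BarrierChain)
    BarrierChain->addPredBarrier(SU);
  BarrierChain = SU;
  continue;
}

This blanket chaining is exactly what the IGLP pseudos need to escape, while every other pass keeps seeing their unmodeled side effects.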
@@ -549,7 +549,7 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {

/// Returns true if MI is an instruction we are unable to reason about
/// (like a call or something with unmodeled side effects).
static inline bool isGlobalMemoryObject(MachineInstr *MI) {
What's the difference between this and TII::isSchedulingBoundary?
Also wondering if we could treat these like isPosition
> What's the difference between this and TII::isSchedulingBoundary?

isSchedulingBoundary creates divisions between scheduling regions; it separates and defines independent ScheduleDAGs. isGlobalMemoryObject controls dependency edges within a single ScheduleDAG.

> Also wondering if we could treat these like isPosition

In what way? You mean treat these as debug info instead of instructions?
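To make that distinction concrete, the two hooks compare roughly like this (signatures trimmed from the LLVM headers and approximate):

// TargetInstrInfo: carves a block into independent scheduling regions; each
// region gets its own ScheduleDAG, and nothing is reordered across the cut.
virtual bool isSchedulingBoundary(const MachineInstr &MI,
                                  const MachineBasicBlock *MBB,
                                  const MachineFunction &MF) const;

// ScheduleDAGInstrs (this patch): consulted while building one region's DAG;
// decides only whether MI is chained against the region's other memory ops.
virtual bool isGlobalMemoryObject(MachineInstr *MI);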
I mean make it an instruction property, rather than introducing a very specific override for this one case. Are you allowed to put other isLabel or isPosition instructions anywhere in the block?
I don't know if an existing instruction property would work; isPosition makes an instruction a scheduling boundary and would split the block as far as the scheduler is concerned.
The changes to IGLP / GCNSched look fine. FWIW, I'm okay with the approach (though I think Matt has an outstanding concern). I guess another alternative is to manually add the true edges between the preds and succs of the iglpInst during resetEdges. Perhaps this could be optimized a bit, but it seems needlessly expensive. However, maybe this could suffice until we have a better way to express the side effects these should and shouldn't have?
I don't know how we would manually add the correct edges in resetEdges.
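For concreteness, the re-linking being discussed might look something like the hypothetical sketch below (not code from this PR or from LLVM). The pred-by-succ loop is the needlessly expensive part, and deciding which bridged edges are actually true dependencies is the open question:

// Hypothetical: when stripping an IGLP node's edges, re-link each memory or
// barrier predecessor to each such successor so ordering that previously
// flowed through the node survives. O(|Preds| * |Succs|) per node.
static void bridgeEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
  for (const SDep &P : SU.Preds)
    for (const SDep &S : SU.Succs)
      if (P.isNormalMemoryOrBarrier() && S.isNormalMemoryOrBarrier())
        DAG->addEdge(S.getSUnit(), SDep(P.getSUnit(), SDep::Artificial));
}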
ping |
@arsenm Do you have any major objection to this going in? This is needed to fix a few outstanding issues and some people are already using it downstream. |
ping |
Since this fixes a serious issue with IGLP correctness, I think it should be merged, at least until we come up with an appropriate property to describe these instructions (one that doesn't require the isIGLPInstr function). That's just my opinion.
    return false;
  }
}

static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
  return any_of(*DAG, [](MachineBasicBlock::iterator MI) {
    unsigned Opc = MI->getOpcode();
    return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
Why is SCHED_BARRIER left out of hasIGLPInstrs(), but is included in isIGLPInstr?
Force-pushed 92ae8f9 to 37416aa
You can test this locally with the following command:

git-clang-format --diff 986f2ac48f369bc025a3f1830e2d5bba235be0fd f218674c20373b111334d8692813bcc1d5cc9ee7 --extensions h,cpp -- llvm/include/llvm/CodeGen/TargetInstrInfo.h llvm/lib/CodeGen/ScheduleDAGInstrs.cpp llvm/lib/CodeGen/TargetInstrInfo.cpp llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp llvm/lib/Target/AMDGPU/SIInstrInfo.cpp llvm/lib/Target/AMDGPU/SIInstrInfo.h

View the diff from clang-format here:

diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 8e3e06bf57..308b22087b 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -548,7 +548,6 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
}
}
-
void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
unsigned Latency) {
if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) {
We want special handling for IGLP instructions in the scheduler, but they should still be treated as having side effects by other passes. Add a target hook to the ScheduleDAGInstrs DAG builder so that we have more control over this.
…GLP instr utility functions.
The latest changes to IGLP / GCNSched still look fine. I prefer the TII hook over the DAG hook. Maybe wait a little while to see if there are any serious objections to the latest approach.
Formatting.
Force-pushed 37416aa to f218674
We want special handling for IGLP instructions in the scheduler, but they should still be treated as having side effects by other passes. Add a target hook to the ScheduleDAGInstrs DAG builder so that we have more control over this.
@@ -136,6 +136,10 @@ class TargetInstrInfo : public MCInstrInfo {
                                     const TargetRegisterInfo *TRI,
                                     const MachineFunction &MF) const;

  /// Returns true if MI is an instruction we are unable to reason about
  /// (like a call or something with unmodeled side effects).
  virtual bool isGlobalMemoryObject(const MachineInstr *MI) const;
This should follow the pattern that isCopyInstr uses. Have the virtual overridable Impl function, with the generic cases handled directly.
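For illustration, transplanting the isCopyInstr layering onto this hook could look like the sketch below; the Impl name and its exemption semantics are assumptions, not code from this PR. One wrinkle: unlike isCopyInstr, the target hook here has to run before the generic checks, because the backend wants to opt an instruction out of barrier treatment that those checks would otherwise impose.

// Hypothetical isCopyInstr-style split: the non-virtual entry point keeps
// the generic classification, and only the target-specific exemption is a
// virtual Impl hook (default: exempt nothing).
bool TargetInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const {
  if (isExemptMemoryObjectImpl(MI)) // e.g. AMDGPU would exempt IGLP pseudos
    return false;
  return MI->isCall() || MI->hasUnmodeledSideEffects() ||
         (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad());
}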
We want special handling for IGLP instructions in the scheduler, but they should still be treated as having side effects by other passes. Add a target hook to the ScheduleDAGInstrs DAG builder so that we have more control over this.

Fixes: SWDEV-447200
(cherry picked from commit 657fb44)