
[AMDGPU] Add target hook to isGlobalMemoryObject #112781


Merged: 2 commits into llvm:main on Jan 11, 2025

Conversation


@kerbowa kerbowa commented Oct 17, 2024

We want special handling for IGLP instructions in the scheduler, but they should still be treated as having side effects by other passes. Add a target hook to the ScheduleDAGInstrs DAG builder so that we have more control over this.


llvmbot commented Oct 17, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Austin Kerbow (kerbowa)

Changes

We want special handling for IGLP instructions in the scheduler, but they should still be treated as having side effects by other passes. Add a target hook to the ScheduleDAGInstrs DAG builder so that we have more control over this.


Patch is 20.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112781.diff

8 Files Affected:

  • (modified) llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h (+4)
  • (modified) llvm/lib/CodeGen/ScheduleDAGInstrs.cpp (+1-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp (-21)
  • (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (+26)
  • (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.h (+4)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir (+3-3)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll (+35)
  • (modified) llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir (+11-11)
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 822b06f080fa64..d9737ff007a85b 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -374,6 +374,10 @@ namespace llvm {
     void addVRegDefDeps(SUnit *SU, unsigned OperIdx);
     void addVRegUseDeps(SUnit *SU, unsigned OperIdx);
 
+    /// Returns true if MI is an instruction we are unable to reason about
+    /// (like a call or something with unmodeled side effects).
+    virtual bool isGlobalMemoryObject(MachineInstr *MI);
+
     /// Returns a mask for which lanes get read/written by the given (register)
     /// machine operand.
     LaneBitmask getLaneMaskForMO(const MachineOperand &MO) const;
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index a0632eb17e65e6..a86e120d689bd3 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -549,7 +549,7 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
 
 /// Returns true if MI is an instruction we are unable to reason about
 /// (like a call or something with unmodeled side effects).
-static inline bool isGlobalMemoryObject(MachineInstr *MI) {
+bool ScheduleDAGInstrs::isGlobalMemoryObject(MachineInstr *MI) {
   return MI->isCall() || MI->hasUnmodeledSideEffects() ||
          (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad());
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index a367db70627748..f58cad58e33edc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -240,23 +240,6 @@ class SchedGroup {
   }
 };
 
-// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
-static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
-  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
-         SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
-         SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
-
-  while (!SU.Preds.empty())
-    for (auto &P : SU.Preds)
-      SU.removePred(P);
-
-  while (!SU.Succs.empty())
-    for (auto &S : SU.Succs)
-      for (auto &SP : S.getSUnit()->Preds)
-        if (SP.getSUnit() == &SU)
-          S.getSUnit()->removePred(SP);
-}
-
 using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
 using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;
 
@@ -460,7 +443,6 @@ void PipelineSolver::makePipeline() {
       // Command line requested IGroupLP doesn't have SGBarr
       if (!SGBarr)
         continue;
-      resetEdges(*SGBarr, DAG);
       SG.link(*SGBarr, false);
     }
   }
@@ -2567,7 +2549,6 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
       initSchedGroupBarrierPipelineStage(R);
       FoundSB = true;
     } else if (Opc == AMDGPU::IGLP_OPT) {
-      resetEdges(*R, DAG);
       if (!FoundSB && !FoundIGLP) {
         FoundIGLP = true;
         ShouldApplyIGLP = initIGLPOpt(*R);
@@ -2589,7 +2570,6 @@ void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
   assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
   // Remove all existing edges from the SCHED_BARRIER that were added due to the
   // instruction having side effects.
-  resetEdges(SchedBarrier, DAG);
   LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
                     << MI.getOperand(0).getImm() << "\n");
   auto InvertedMask =
@@ -2647,7 +2627,6 @@ void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
     std::vector<SUnit>::reverse_iterator RIter) {
   // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
   // to the instruction having side effects.
-  resetEdges(*RIter, DAG);
   MachineInstr &SGB = *RIter->getInstr();
   assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
   int32_t SGMask = SGB.getOperand(0).getImm();
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 57f517bfba0ebb..f7c4eb56c92af3 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -27,6 +27,7 @@
 #include "AMDGPUIGroupLP.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
 
 #define DEBUG_TYPE "machine-scheduler"
 
@@ -1748,6 +1749,17 @@ void GCNScheduleDAGMILive::updateRegionBoundaries(
   }
 }
 
+static bool isIGLPInstr(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AMDGPU::IGLP_OPT:
+  case AMDGPU::SCHED_BARRIER:
+  case AMDGPU::SCHED_GROUP_BARRIER:
+    return true;
+  default:
+    return false;
+  }
+}
+
 static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
   return any_of(*DAG, [](MachineBasicBlock::iterator MI) {
     unsigned Opc = MI->getOpcode();
@@ -1755,11 +1767,25 @@ static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
   });
 }
 
+bool GCNScheduleDAGMILive::isGlobalMemoryObject(MachineInstr *MI) {
+  if (isIGLPInstr(MI))
+    return false;
+
+  return ScheduleDAGInstrs::isGlobalMemoryObject(MI);
+}
+
 GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
     bool RemoveKillFlags)
     : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
 
+bool GCNPostScheduleDAGMILive::isGlobalMemoryObject(MachineInstr *MI) {
+  if (isIGLPInstr(MI))
+    return false;
+
+  return ScheduleDAGInstrs::isGlobalMemoryObject(MI);
+}
+
 void GCNPostScheduleDAGMILive::schedule() {
   HasIGLPInstrs = hasIGLPInstrs(this);
   if (HasIGLPInstrs) {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 64d517038f90e0..98d7f46ee79ef6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -285,6 +285,8 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
 
   std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID);
 
+  bool isGlobalMemoryObject(MachineInstr *MI) override;
+
 public:
   GCNScheduleDAGMILive(MachineSchedContext *C,
                        std::unique_ptr<MachineSchedStrategy> S);
@@ -469,6 +471,8 @@ class GCNPostScheduleDAGMILive final : public ScheduleDAGMI {
 
   bool HasIGLPInstrs = false;
 
+  bool isGlobalMemoryObject(MachineInstr *MI) override;
+
 public:
   void schedule() override;
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0473e017f193cb..2c5773f5a34809 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -24,9 +24,6 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr79
   ; GCN-NEXT:    ; implicit-def: $vgpr80
   ; GCN-NEXT:    ; implicit-def: $vgpr91
-  ; GCN-NEXT:    ;;#ASMSTART
-  ; GCN-NEXT:    s_waitcnt vmcnt(8)
-  ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    s_nop 1
@@ -476,6 +473,9 @@
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
+  ; GCN-NEXT:    ;;#ASMSTART
+  ; GCN-NEXT:    s_waitcnt vmcnt(8)
+  ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    v_mov_b32_e32 v4, 0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 3168e05b816bee..1eb918cd198bf3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -285,6 +285,41 @@ entry:
   ret void
 }
 
+define amdgpu_kernel void @test_iglp_opt_asm_sideeffect(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+; GCN-LABEL: test_iglp_opt_asm_sideeffect:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffc, v0
+; GCN-NEXT:    ; iglp_opt mask(0x00000000)
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_u32_e32 v1, s0, v0
+; GCN-NEXT:    ds_read_b32 v1, v1
+; GCN-NEXT:    v_add_u32_e32 v0, s1, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ds_write_b32 v0, v1
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    ds_read_b32 v0, v2 offset:256
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ds_write_b32 v1, v0 offset:256
+; GCN-NEXT:    s_endpgm
+entry:
+  %idx = call i32 @llvm.amdgcn.workitem.id.x()
+  %load.0.addr = getelementptr float, ptr addrspace(3) %in, i32 %idx
+  %load.0 = load float, ptr addrspace(3) %load.0.addr
+  %store.0.addr = getelementptr float, ptr addrspace(3) %out, i32 %idx
+  store float %load.0, ptr addrspace(3) %store.0.addr
+  call void asm sideeffect "", ""() #1
+  call void @llvm.amdgcn.iglp.opt(i32 0) #1
+  %load.1.addr = getelementptr float, ptr addrspace(3) %in, i32 64
+  %load.1 = load float, ptr addrspace(3) %load.1.addr
+  %store.1.addr = getelementptr float, ptr addrspace(3) %out, i32 64
+  store float %load.1, ptr addrspace(3) %store.1.addr
+  ret void
+}
 
 declare void @llvm.amdgcn.iglp.opt(i32) #1
 declare i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir
index bdfc8227fdccb1..7295506213c4b2 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-pre-RA.mir
@@ -96,10 +96,10 @@ body: |
     ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
     ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: S_NOP 0
     ; CHECK-NEXT: SCHED_BARRIER 1
     ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
-    ; CHECK-NEXT: S_NOP 0
     ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
     ; CHECK-NEXT: S_ENDPGM 0
     %0:sreg_64_xexec_xnull = IMPLICIT_DEF
@@ -163,19 +163,19 @@ body: |
     ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
     ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
-    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
-    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_1:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_2:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
-    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_2:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_3:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: S_NOP 0
     ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_4:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[V_MFMA_F32_4X4X1F32_e64_3]], 0, 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: SCHED_BARRIER 4
     ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_3:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
-    ; CHECK-NEXT: S_NOP 0
     ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_3]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MUL_LO_U32_e64_1]], implicit [[V_MUL_LO_U32_e64_2]], implicit [[V_MFMA_F32_4X4X1F32_e64_4]]
     %0:sreg_64_xexec_xnull = IMPLICIT_DEF
@@ -258,10 +258,10 @@ body: |
     ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
     ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
     ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
     ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
     ; CHECK-NEXT: SCHED_BARRIER 16
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
     ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
@@ -290,10 +290,10 @@ body: |
     ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
     ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
-    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
     ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
     ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
     ; CHECK-NEXT: SCHED_BARRIER 32
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR1]], [[GLOBAL_LOAD_DWORD_SADDR1]], implicit $exec
     ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF1]], [[V_MUL_LO_U32_e64_1]], [[DEF]], 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
@@ -354,9 +354,9 @@ body: |
     ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec
-    ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
     ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
     ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
     ; CHECK-NEXT: SCHED_BARRIER 128
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec
     ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
@@ -386,9 +386,9 @@ body: |
     ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; CHECK-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_]], [[DS_READ_U16_gfx9_]], implicit $exec
-    ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
     ; CHECK-NEXT: DS_WRITE_B32 [[V_MUL_LO_U32_e64_]], [[DEF]], 0, 16, implicit $m0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 3)
     ; CHECK-NEXT: S_NOP 0
+    ; CHECK-NEXT: [[DS_READ_U16_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[DEF]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 3)
     ; CHECK-NEXT: SCHED_BARRIER 256
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[DS_READ_U16_gfx9_1]], [[DS_READ_U16_gfx9_1]], implicit $exec
     ; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
@@ -453,7 +453,6 @@ body: |
     ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
     ; CHECK-NEXT: [[DEF2:%[0-9]+]]:areg_128 = IMPLICIT_DEF
-    ; CHECK-NEXT: S_NOP 0
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
     ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = nsw V_MUL_LO_U32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF1]], implicit $exec
     ; CHECK-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[DEF1]], [[GLOBAL_LOAD_DWORD_SADDR]], [[DEF2]], 0, 0, 0, implicit $mode, implicit $exec...
[truncated]


kerbowa commented Oct 17, 2024

A previous version of this patch was posted a while ago on Phabricator. We need to address this bug now, since there are confirmed cases where it impacts correctness.

I've experimented with about four different patches to solve this. After doing that work, I've come full circle back to this patch. It is the best and most correct way to solve the problem. I'll explain why below.

For background: we want custom DAG building around IGLP instructions, so ideally the generic DAG builder would not add any edges for these instructions, and we can then add custom edges in our IGLP mutation.

The IGLP instructions do have unmodeled side effects, but our backend can reason about them and relax the edges that would normally be added. To do that, this patch introduces a target hook in the DAG builder to decide whether or not to add edges for side-effect-producing instructions.

In that old review, Matt suggested an alternative approach: add a pseudo memory operand to these instructions that doesn't alias anything. This doesn't work, since it leads to the exact same problem as before. Even if an instruction doesn't alias anything, valid edges will still be added between it and other instructions with unmodeled side effects. We end up with the same bug, just as an even rarer edge case.

Another approach is to remove all of these instructions from the block before invoking the scheduler and DAG builder, record information about them, and apply the DAG mutations accordingly, then finally re-add the instructions after scheduling. This is needlessly expensive and hacky.

A final approach is to modify the instruction description of these instructions to not have side effects in our backend before scheduling. An obvious hack.

The best approach is instead to admit that these instructions have difficult-to-model side effects; however, our backend can reason about them, so we want to modify the DAG builder accordingly. In my opinion, this hook is the best way to do that.

@@ -549,7 +549,7 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {

/// Returns true if MI is an instruction we are unable to reason about
/// (like a call or something with unmodeled side effects).
static inline bool isGlobalMemoryObject(MachineInstr *MI) {

What's the difference between this and TII::isSchedulingBoundary?

Also wondering if we could treat these like isPosition

@kerbowa kerbowa Nov 10, 2024

> What's the difference between this and TII::isSchedulingBoundary?

isSchedulingBoundary creates divisions between scheduling regions; it separates and defines independent ScheduleDAGs. isGlobalMemoryObject controls dependency edges within a single ScheduleDAG.
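
To make the distinction concrete, here is a self-contained toy model (plain C++, not LLVM code; every name in it is invented for illustration): a boundary partitions the block into separate DAGs, while a global memory object stays inside one DAG and is merely chained to everything around it.

#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// Invented toy types -- not LLVM's.
struct Instr {
  std::string Name;
  bool IsBoundary;        // isSchedulingBoundary()-style answer
  bool IsGlobalMemObject; // isGlobalMemoryObject()-style answer
};

// Boundary semantics: cut the block into independent regions. Each region is
// scheduled as its own DAG; nothing ever moves across a cut.
std::vector<std::vector<Instr>> splitIntoRegions(const std::vector<Instr> &Block) {
  std::vector<std::vector<Instr>> Regions(1);
  for (const Instr &I : Block) {
    if (I.IsBoundary)
      Regions.emplace_back(); // end the current region; the boundary itself is not scheduled
    else
      Regions.back().push_back(I);
  }
  return Regions;
}

// Global-memory-object semantics: stay inside one region's DAG, but pin the
// instruction with ordering edges against everything on either side of it.
std::vector<std::pair<std::size_t, std::size_t>>
barrierEdges(const std::vector<Instr> &Region) {
  std::vector<std::pair<std::size_t, std::size_t>> Edges;
  for (std::size_t I = 0; I < Region.size(); ++I)
    for (std::size_t J = I + 1; J < Region.size(); ++J)
      if (Region[I].IsGlobalMemObject || Region[J].IsGlobalMemObject)
        Edges.push_back({I, J}); // I must be scheduled before J
  return Edges;
}

int main() {
  std::vector<Instr> Block = {{"load", false, false},
                              {"call", false, true},
                              {"label", true, false},
                              {"store", false, false}};
  auto Regions = splitIntoRegions(Block); // "label" cuts the block in two
  auto Edges = barrierEdges(Regions[0]);  // "call" is chained to "load"
  return (Regions.size() == 2 && Edges.size() == 1) ? 0 : 1;
}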

> Also wondering if we could treat these like isPosition

In what way? You mean treat these as debug info instead of instructions?

@arsenm arsenm Dec 3, 2024

I mean make it an instruction property, rather than introducing a very specific override for this one case. Are you allowed to put other isLabel or isPosition instructions anywhere in the block?

@kerbowa (Member Author)

I don't know if an existing instruction property would work; isPosition makes an instruction a scheduling boundary and would split the block as far as the scheduler is concerned.
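
For reference, a condensed paraphrase of the generic TargetInstrInfo::isSchedulingBoundary (see TargetInstrInfo.cpp for the exact set of checks; this is not the verbatim source) shows why an isPosition instruction splits the region outright rather than merely relaxing edges:

// Condensed paraphrase, not the verbatim source.
bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                           const MachineBasicBlock *MBB,
                                           const MachineFunction &MF) const {
  // Terminators and labels can't be scheduled around.
  if (MI.isTerminator() || MI.isPosition())
    return true;
  // ... remaining generic checks (inline-asm branches, writes to the stack
  // pointer) elided ...
  return false;
}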

@kerbowa kerbowa requested a review from arsenm December 2, 2024 03:30

jrbyrnes commented Dec 3, 2024

The changes to IGLP / GCNSched look fine.

FWIW, I'm okay with the approach (though I think Matt has an outstanding concern).

I guess another alternative is to manually add the true edges between the preds and succs of the iglpInst during resetEdges. Perhaps this could be optimized a bit, but it seems needlessly expensive. However, maybe this could suffice until we have a better way to express the side effects these should and shouldn't have?
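
As a rough sketch of that alternative (the helper name is hypothetical; this uses the SUnit/SDep API from llvm/include/llvm/CodeGen/ScheduleDAG.h and is not code from this patch), the bridging could look something like this:

#include "llvm/CodeGen/ScheduleDAG.h"
using namespace llvm;

// Hypothetical sketch, not part of this patch: before resetEdges strips the
// IGLP instruction's SUnit, bridge each memory/barrier predecessor directly
// to each memory/barrier successor so the real ordering constraints survive.
static void bridgePredsToSuccs(SUnit &SU) {
  for (const SDep &P : SU.Preds) {
    if (!P.isNormalMemoryOrBarrier())
      continue;
    for (const SDep &S : SU.Succs) {
      if (!S.isNormalMemoryOrBarrier())
        continue;
      // One artificial order edge per (pred, succ) pair. This is quadratic
      // in the node's degree -- the "needlessly expensive" concern -- and
      // conservative, since some pairs never needed a direct edge.
      S.getSUnit()->addPred(SDep(P.getSUnit(), SDep::Artificial));
    }
  }
}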


kerbowa commented Dec 3, 2024

> The changes to IGLP / GCNSched look fine.
>
> FWIW, I'm okay with the approach (though I think Matt has an outstanding concern).
>
> I guess another alternative is to manually add the true edges between the preds and succs of the iglpInst during resetEdges. Perhaps this could be optimized a bit, but it seems needlessly expensive. However, maybe this could suffice until we have a better way to express the side effects these should and shouldn't have?

I don't know how we would manually add the correct edges in resetEdges without just redoing a bunch of the logic in the DAG builder, so I'm not sure that would work.


kerbowa commented Dec 12, 2024

ping


kerbowa commented Dec 18, 2024

@arsenm Do you have any major objection to this going in? This is needed to fix a few outstanding issues and some people are already using it downstream.


kerbowa commented Dec 30, 2024

ping

@kerbowa kerbowa requested a review from bcahoon January 2, 2025 18:10
@bcahoon bcahoon left a comment

Since this fixes a serious issue with IGLP correctness, I think it should be merged until we come up with an appropriate property to describe these instructions (that doesn't require the isIGLPInstr function). That's just my opinion.

return false;
}
}

static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
return any_of(*DAG, [](MachineBasicBlock::iterator MI) {
unsigned Opc = MI->getOpcode();
return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;

Why is SCHED_BARRIER left out of hasIGLPInstrs() but included in isIGLPInstr()?

@kerbowa kerbowa force-pushed the add-target-hook-to-isglobalmemoryobject branch from 92ae8f9 to 37416aa on January 7, 2025 05:19

github-actions bot commented Jan 7, 2025

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff 986f2ac48f369bc025a3f1830e2d5bba235be0fd f218674c20373b111334d8692813bcc1d5cc9ee7 --extensions h,cpp -- llvm/include/llvm/CodeGen/TargetInstrInfo.h llvm/lib/CodeGen/ScheduleDAGInstrs.cpp llvm/lib/CodeGen/TargetInstrInfo.cpp llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp llvm/lib/Target/AMDGPU/SIInstrInfo.cpp llvm/lib/Target/AMDGPU/SIInstrInfo.h
View the diff from clang-format here.
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 8e3e06bf57..308b22087b 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -548,7 +548,6 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
   }
 }
 
-
 void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
                                             unsigned Latency) {
   if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) {

We want special handling for IGLP instructions in the scheduler but they
should still be treated like they have side effects by other passes. Add
a target hook to the ScheduleDAGInstrs DAG builder so that we have more
control over this.
@jrbyrnes jrbyrnes left a comment

The latest changes to IGLP / GCNSched still look fine. I prefer the TII hook over the DAG hook. Maybe wait a little while to see if there are any serious objections to the latest approach.
Formatting.

@kerbowa kerbowa force-pushed the add-target-hook-to-isglobalmemoryobject branch from 37416aa to f218674 on January 10, 2025 19:06
@kerbowa kerbowa merged commit 657fb44 into llvm:main Jan 11, 2025
7 of 8 checks passed
BaiXilin pushed a commit to BaiXilin/llvm-fix-vnni-instr-types that referenced this pull request Jan 12, 2025
We want special handling for IGLP instructions in the scheduler but they
should still be treated like they have side effects by other passes. Add
a target hook to the ScheduleDAGInstrs DAG builder so that we have more
control over this.
@@ -136,6 +136,10 @@ class TargetInstrInfo : public MCInstrInfo {
const TargetRegisterInfo *TRI,
const MachineFunction &MF) const;

/// Returns true if MI is an instruction we are unable to reason about
/// (like a call or something with unmodeled side effects).
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const;

This should follow the pattern that isCopyInstr uses. Have the virtual overridable Impl function, with the generic cases handled directly
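
One hypothetical reading of that suggestion, mirroring the isCopyInstr/isCopyInstrImpl split (the class and Impl names below are invented for illustration; the generic predicate itself is the one from the patch above):

#include "llvm/CodeGen/MachineInstr.h"

// Hypothetical illustration of the suggested pattern, not the committed code.
class SketchTII {
public:
  // The public entry point is non-virtual and handles the generic cases itself.
  bool isGlobalMemoryObject(const llvm::MachineInstr *MI) const {
    if (MI->isCall() ||
        (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad()))
      return true;
    // Only the unmodeled-side-effect case is deferred to the target, which
    // may relax it (e.g. for AMDGPU's IGLP pseudo instructions).
    return MI->hasUnmodeledSideEffects() && isGlobalMemoryObjectImpl(MI);
  }

protected:
  // Targets override only this hook; the default stays conservative.
  virtual bool isGlobalMemoryObjectImpl(const llvm::MachineInstr *) const {
    return true;
  }
};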

searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Feb 3, 2025
We want special handling for IGLP instructions in the scheduler but they
should still be treated like they have side effects by other passes. Add
a target hook to the ScheduleDAGInstrs DAG builder so that we have more
control over this.

Fixes: SWDEV-447200
(cherry picked from commit 657fb44)