Skip to content

[AMDGPU] Always emit SI_KILL_I1_PSEUDO for uniform floating point branches. #124028

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2498,6 +2498,9 @@ def HasNotMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">;
def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">,
AssemblerPredicate<(all_of FeatureSALUFloatInsts)>;

def NotHasSALUFloatInsts : Predicate<"!Subtarget->hasSALUFloatInsts()">,
AssemblerPredicate<(all_of (not FeatureSALUFloatInsts))>;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we really not have none_of?

Copy link
Contributor

@jayfoad jayfoad Jan 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We really don't. Looks like there are plenty of places where it could be used in existing .td files.


def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>;

Expand Down
15 changes: 8 additions & 7 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1053,39 +1053,40 @@ def : GCNPat<
(SI_ELSE $src, $target)
>;

def : Pat <
def : GCNPat <
(int_amdgcn_kill i1:$src),
(SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0)
>;

def : Pat <
def : GCNPat <
(int_amdgcn_kill (i1 (not i1:$src))),
(SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1)
>;

def : Pat <
let SubtargetPredicate = NotHasSALUFloatInsts in
def : GCNPat <
(int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
(SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;

def : Pat <
def : GCNPat <
(int_amdgcn_wqm_demote i1:$src),
(SI_DEMOTE_I1 SCSrc_i1:$src, 0)
>;

def : Pat <
def : GCNPat <
(int_amdgcn_wqm_demote (i1 (not i1:$src))),
(SI_DEMOTE_I1 SCSrc_i1:$src, -1)
>;

// TODO: we could add more variants for other types of conditionals

def : Pat <
def : GCNPat <
(i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
(COPY $src) // Return the SGPRs representing i1 src
>;

def : Pat <
def : GCNPat <
(i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
(COPY $src) // Return the SGPRs representing i1 src
>;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel < %s 2>&1 | FileCheck %s

define amdgpu_ps void @_amdgpu_ps_main() {
; CHECK-LABEL: name: _amdgpu_ps_main
; CHECK: bb.0.entry:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM killed [[REG_SEQUENCE]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
; CHECK-NEXT: nofpexcept S_CMP_NLT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], implicit-def $scc, implicit $mode
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
; CHECK-NEXT: SI_KILL_I1_PSEUDO killed [[COPY]], 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; CHECK-NEXT: nofpexcept S_CMP_LT_F32 [[S_BUFFER_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], implicit-def $scc, implicit $mode
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1.bb1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.bb2:
; CHECK-NEXT: S_ENDPGM 0
entry:
%i = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0)
%i1 = bitcast i32 %i to float
%i2 = fcmp uge float %i1, 0.000000e+00
call void @llvm.amdgcn.kill(i1 %i2)
br i1 %i2, label %bb1, label %bb2

bb1: ; preds = %entry
%i3 = call i64 @llvm.amdgcn.s.getpc()
%i4 = and i64 %i3, 1
%i5 = inttoptr i64 %i4 to ptr addrspace(4)
%i6 = getelementptr i8, ptr addrspace(4) %i5, i64 32
br label %bb2

bb2: ; preds = %bb, %entry
ret void
}