Skip to content

Commit b8cc7f4

Browse files
dstutt authored and yuxuanchen1997 committed
[AMDGPU] Fix indirect dst bug for non-sgpr index (#98907)
Summary: When emitting an indirect dst whose index is not an SGPR, the generated instruction hard-coded AMDGPU::sub0 as the subregister and did not take into account that the subregister returned by computeIndirectRegAndOffset might be different. Use the computed subregister instead. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251642
1 parent fe01097 commit b8cc7f4

File tree

2 files changed

+20
-2
lines changed

2 files changed

+20
-2
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4814,14 +4814,14 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
48144814
.addReg(PhiReg)
48154815
.add(*Val)
48164816
.addReg(SGPRIdxReg)
4817-
.addImm(AMDGPU::sub0);
4817+
.addImm(SubReg);
48184818
} else {
48194819
const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
48204820
TRI.getRegSizeInBits(*VecRC), 32, false);
48214821
BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
48224822
.addReg(PhiReg)
48234823
.add(*Val)
4824-
.addImm(AMDGPU::sub0);
4824+
.addImm(SubReg);
48254825
}
48264826

48274827
MI.eraseFromParent();

llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,24 @@ bb8: ; preds = %bb2
543543
ret void
544544
}
545545

546+
; GCN-LABEL: {{^}}insert_or_disj_index:
547+
; GCN: v_mov_b32_e32 v[[#VIDX:]], 0
548+
549+
; MOVREL: s_mov_b32 m0, s{{[0-9]+}}
550+
; MOVREL: v_movreld_b32_e32 v[[#VIDX + 1]], v{{[0-9]+}}
551+
552+
; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(DST)
553+
; IDXMODE: v_mov_b32_e32 v[[#VIDX + 1]], v{{[0-9]+}}
554+
; IDXMODE: s_set_gpr_idx_off
555+
define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %val, <4 x i32> inreg %desc, i32 inreg %A) {
556+
entry:
557+
%idx = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %A, i32 0, i32 0)
558+
%off = or disjoint i32 %idx, 1
559+
%v = insertelement <16 x i32> zeroinitializer, i32 %val, i32 %off
560+
store <16 x i32> %v, ptr addrspace(1) %out
561+
ret void
562+
}
563+
546564
declare i32 @llvm.amdgcn.workitem.id.x() #1
547565
declare void @llvm.amdgcn.s.barrier() #2
548566

0 commit comments

Comments
 (0)