Skip to content

Commit 09d01be

Browse files
authored
[AMDGPU][True16][CodeGen] replace subreg_to_reg with reg_sequence (#138746)
Since subreg_to_reg is considered broken in LLVM, replace subreg_to_reg with reg_sequence.
1 parent a0260a9 commit 09d01be

9 files changed

+1072
-1050
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7777,8 +7777,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
77777777
return;
77787778
}
77797779

7780-
// If this is a v2s copy src from vgpr16 to sgpr32,
7781-
// replace vgpr copy to subreg_to_reg
7780+
// If this is a v2s copy src from 16bit to 32bit,
7781+
// replace vgpr copy to reg_sequence
77827782
// This can be removed after we have sgpr16 in place
77837783
if (ST.useRealTrue16Insts() && Inst.isCopy() &&
77847784
Inst.getOperand(1).getReg().isVirtual() &&
@@ -7787,11 +7787,15 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
77877787
if (16 == RI.getRegSizeInBits(*SrcRegRC) &&
77887788
32 == RI.getRegSizeInBits(*NewDstRC)) {
77897789
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7790+
Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
77907791
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7791-
get(TargetOpcode::SUBREG_TO_REG), NewDstReg)
7792-
.add(MachineOperand::CreateImm(0))
7793-
.add(Inst.getOperand(1))
7794-
.add(MachineOperand::CreateImm(AMDGPU::lo16));
7792+
get(AMDGPU::IMPLICIT_DEF), Undef);
7793+
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7794+
get(AMDGPU::REG_SEQUENCE), NewDstReg)
7795+
.addReg(Inst.getOperand(1).getReg())
7796+
.addImm(AMDGPU::lo16)
7797+
.addReg(Undef)
7798+
.addImm(AMDGPU::hi16);
77957799
Inst.eraseFromParent();
77967800

77977801
MRI.replaceRegWith(DstReg, NewDstReg);

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll

Lines changed: 519 additions & 524 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll

Lines changed: 62 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -9418,78 +9418,80 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) {
94189418
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
94199419
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
94209420
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9421-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
9421+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
94229422
; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v4
9423-
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
9424-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
94259423
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
9424+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
94269425
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
9427-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
9428-
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
9429-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
9430-
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9431-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
9432-
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
9433-
; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
9434-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9426+
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
9427+
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
94359428
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
94369429
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
9437-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
9438-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
9439-
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
9430+
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
9431+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9432+
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_and_b32 v1, 0xffff0000, v1
9433+
; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v6, 16, v2
9434+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9435+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
9436+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
9437+
; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
9438+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
9439+
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v1, 16, 1
9440+
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
9441+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
9442+
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v1, 0x7fff
94409443
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
94419444
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9442-
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
9443-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
9444-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
9445+
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
9446+
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v2
9447+
; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v3, 0x40c00000, v3
9448+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9449+
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
9450+
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
9451+
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9452+
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1
9453+
; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v3, 16, 1
9454+
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
9455+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
9456+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo
94459457
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
9446-
; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v6, 16, v2
9447-
; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v1, 16, 1
9448-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
9449-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
9450-
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
9451-
; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v3, 16, 1
9452-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v3
9453-
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v1, 0x7fff
9454-
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9455-
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
9456-
; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v3, 0x7fff
9457-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9458-
; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_cndmask_b32 v1, v7, v9
9459-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
9460-
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v10, v6, 0x7fff
9458+
; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
9459+
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
9460+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo
9461+
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
94619462
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6
9462-
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
9463-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
9464-
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1
9465-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
9466-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
9463+
; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1
9464+
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
94679465
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
9468-
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
9469-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v2
9470-
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
9471-
; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
9472-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
9473-
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
9474-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v5
9475-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
9476-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v4
9477-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v13, vcc_lo
9466+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
9467+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
9468+
; GFX11-TRUE16-NEXT: v_add3_u32 v9, v12, v2, 0x7fff
9469+
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v13, v8, 0x7fff
9470+
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
94789471
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
9479-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
9480-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
9481-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v10, v14, vcc_lo
9472+
; GFX11-TRUE16-NEXT: v_add3_u32 v13, v14, v3, 0x7fff
9473+
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
9474+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
9475+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v11, v12, vcc_lo
94829476
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
9483-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
9484-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v12, v15, vcc_lo
9485-
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
9486-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9487-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
9488-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
9489-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v3, 16, v7
9490-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
9491-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
9492-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v6
9477+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc_lo
9478+
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
9479+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
9480+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
9481+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v15, vcc_lo
9482+
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9483+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v8
9484+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v10, vcc_lo
9485+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
9486+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6
9487+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.h
9488+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v0, 16, v3
9489+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.h
9490+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
9491+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v1, 16, v2
9492+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v4
9493+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
9494+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v6, 16, v5
94939495
; GFX11-TRUE16-NEXT: .LBB47_2: ; %end
94949496
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
94959497
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,11 +121,11 @@ define i16 @bitcast_f16_to_i16(half %a, i32 %b) {
121121
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
122122
; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
123123
; GCN-NEXT: s_cbranch_execz .LBB1_2
124-
; GCN-NEXT: ; %bb.1:
124+
; GCN-NEXT: ; %bb.1: ; %cmp.true
125125
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
126126
; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0
127127
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
128-
; GCN-NEXT: .LBB1_2:
128+
; GCN-NEXT: .LBB1_2: ; %end
129129
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
130130
; GCN-NEXT: s_setpc_b64 s[30:31]
131131
;

0 commit comments

Comments
 (0)