Skip to content

Commit 211db54

Browse files
committed
update patch
1 parent e8ae187 commit 211db54

File tree

2 files changed

+19
-9
lines changed

2 files changed

+19
-9
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -6899,9 +6899,20 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
68996899
if (Op.getOpcode() != ISD::FP_ROUND)
69006900
return Op;
69016901

6902-
SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6903-
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6904-
return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6902+
if (Subtarget->has16BitInsts()) {
6903+
if (getTargetMachine().Options.UnsafeFPMath) {
6904+
SDValue Flags = Op.getOperand(1);
6905+
SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
6906+
return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
6907+
} else {
6908+
SDValue FpToFp16 = LowerF64ToF16(Src, MVT::i16, DL, DAG);
6909+
return DAG.getNode(ISD::BITCAST, DL, MVT::f16, FpToFp16);
6910+
}
6911+
} else {
6912+
SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6913+
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6914+
return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6915+
}
69056916
}
69066917

69076918
assert(DstVT.getScalarType() == MVT::bf16 &&

llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -720,7 +720,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
720720
; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
721721
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
722722
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
723-
; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
723+
; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
724724
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
725725
; GFX9-SDAG-NEXT: s_endpgm
726726
;
@@ -814,14 +814,13 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
814814
; GFX11-SDAG-FAKE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
815815
; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1
816816
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
817+
; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
817818
; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
818-
; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v1, v[2:3]
819819
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
820+
; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v2
820821
; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
821-
; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
822-
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
823-
; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
824-
; GFX11-SDAG-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
822+
; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
823+
; GFX11-SDAG-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
825824
; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
826825
; GFX11-SDAG-FAKE16-NEXT: s_endpgm
827826
;

0 commit comments

Comments (0)