Skip to content

Commit bbc7178

Browse files
pravinjagtap, arsenm
authored and committed
AMDGPU: Allocate different registers for vdst & src in v_cvt_scalef32*
For multipass instructions, an overlap between the VDST and SRC registers would result in a hardware race and undefined results.

Co-authored-by: Pravin Jagtap <[email protected]>
1 parent b26cbd2 commit bbc7178

5 files changed

+353
-257
lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1088,9 +1088,11 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
10881088
defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
10891089
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
10901090
defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
1091-
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
1092-
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
1093-
defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
1091+
let Constraints = "@earlyclobber $vdst" in {
1092+
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
1093+
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
1094+
defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
1095+
}
10941096
}
10951097
defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
10961098
defm V_CVT_SCALEF32_PK_BF16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_bf16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2bf16>>;
@@ -1103,7 +1105,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
11031105
}
11041106
}
11051107

1106-
let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
1108+
let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in {
11071109
defm V_CVT_SCALEF32_PK32_F32_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_fp6>;
11081110
defm V_CVT_SCALEF32_PK32_F32_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_bf6>;
11091111
defm V_CVT_SCALEF32_PK32_F16_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f16_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f16_fp6>;
@@ -1112,7 +1114,7 @@ let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0
11121114
defm V_CVT_SCALEF32_PK32_BF16_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_bf16_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32BF16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_bf16_bf6>;
11131115
}
11141116

1115-
let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
1117+
let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in {
11161118
defm V_CVT_SCALEF32_PK32_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_f16>;
11171119
defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_f16>;
11181120
defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_bf16>;

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll

Lines changed: 154 additions & 60 deletions
Original file line number | Diff line number | Diff line change
@@ -864,31 +864,91 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte3(i32 %src, float %scale) {
864864
}
865865

866866
define <32 x float> @test_cvt_scale_pk32_f32_fp6(<6 x i32> %src, float %scale) {
867-
; GCN-LABEL: test_cvt_scale_pk32_f32_fp6:
868-
; GCN: ; %bb.0:
869-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870-
; GCN-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[0:5], v6
871-
; GCN-NEXT: s_setpc_b64 s[30:31]
867+
; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_fp6:
868+
; GFX950-SDAG: ; %bb.0:
869+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, v6
871+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, v5
872+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, v4
873+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, v3
874+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, v2
875+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, v1
876+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0
877+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38
878+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
879+
;
880+
; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_fp6:
881+
; GFX950-GISEL: ; %bb.0:
882+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
883+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0
884+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v33, v1
885+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, v2
886+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v35, v3
887+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v36, v4
888+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v37, v5
889+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v38, v6
890+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38
891+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
872892
%ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> %src, float %scale)
873893
ret <32 x float> %ret
874894
}
875895

876896
define <32 x float> @test_cvt_scale_pk32_f32_bf6(<6 x i32> %src, float %scale) {
877-
; GCN-LABEL: test_cvt_scale_pk32_f32_bf6:
878-
; GCN: ; %bb.0:
879-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
880-
; GCN-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[0:5], v6
881-
; GCN-NEXT: s_setpc_b64 s[30:31]
897+
; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_bf6:
898+
; GFX950-SDAG: ; %bb.0:
899+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
900+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, v6
901+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, v5
902+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, v4
903+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, v3
904+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, v2
905+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, v1
906+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0
907+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38
908+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
909+
;
910+
; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_bf6:
911+
; GFX950-GISEL: ; %bb.0:
912+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
913+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0
914+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v33, v1
915+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, v2
916+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v35, v3
917+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v36, v4
918+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v37, v5
919+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v38, v6
920+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38
921+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
882922
%ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> %src, float %scale)
883923
ret <32 x float> %ret
884924
}
885925

886926
define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_vv(<6 x i32> %src, float %scale) {
887-
; GCN-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
888-
; GCN: ; %bb.0:
889-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
890-
; GCN-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6
891-
; GCN-NEXT: s_setpc_b64 s[30:31]
927+
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
928+
; GFX950-SDAG: ; %bb.0:
929+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
930+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
931+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
932+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
933+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
934+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
935+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
936+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
937+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
938+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
939+
;
940+
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
941+
; GFX950-GISEL: ; %bb.0:
942+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
943+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0
944+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, v1
945+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v2
946+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v3
947+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v4
948+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v5
949+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, v6
950+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
951+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
892952
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float %scale)
893953
ret <32 x half> %ret
894954
}
@@ -897,26 +957,26 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) {
897957
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl:
898958
; GFX950-SDAG: ; %bb.0:
899959
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
900-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
901-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
902-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
903-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
904-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16
905-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17
960+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
961+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
962+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
963+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
964+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
965+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
906966
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
907-
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], s0
967+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], s0
908968
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
909969
;
910970
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl:
911971
; GFX950-GISEL: ; %bb.0:
912972
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
913973
; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
914974
; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
915-
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
916-
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
917-
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
918-
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000
919-
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6
975+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5]
976+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
977+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
978+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000
979+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
920980
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
921981
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float 100.0)
922982
ret <32 x half> %ret
@@ -926,7 +986,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv(<6 x i32> %src, float %
926986
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
927987
; GFX950-SDAG: ; %bb.0:
928988
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
929-
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], v6
989+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
990+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
991+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
992+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
993+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
994+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
995+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
996+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22
930997
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
931998
;
932999
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
@@ -958,14 +1025,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
9581025
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
9591026
; GFX950-SDAG: ; %bb.0:
9601027
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
961-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
962-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
963-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
964-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
965-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16
966-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17
1028+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
1029+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
1030+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
1031+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
1032+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
1033+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
9671034
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
968-
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], s0
1035+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0
9691036
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
9701037
;
9711038
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
@@ -1000,11 +1067,31 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
10001067
}
10011068

10021069
define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_vv(<6 x i32> %src, float %scale) {
1003-
; GCN-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
1004-
; GCN: ; %bb.0:
1005-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1006-
; GCN-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6
1007-
; GCN-NEXT: s_setpc_b64 s[30:31]
1070+
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
1071+
; GFX950-SDAG: ; %bb.0:
1072+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1073+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
1074+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
1075+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
1076+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
1077+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
1078+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
1079+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
1080+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
1081+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
1082+
;
1083+
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
1084+
; GFX950-GISEL: ; %bb.0:
1085+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1086+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0
1087+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, v1
1088+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v2
1089+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v3
1090+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v4
1091+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v5
1092+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, v6
1093+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
1094+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
10081095
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float %scale)
10091096
ret <32 x half> %ret
10101097
}
@@ -1013,26 +1100,26 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) {
10131100
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl:
10141101
; GFX950-SDAG: ; %bb.0:
10151102
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1016-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
1017-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
1018-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
1019-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
1020-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16
1021-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17
1103+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
1104+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
1105+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
1106+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
1107+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
1108+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
10221109
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
1023-
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], s0
1110+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], s0
10241111
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
10251112
;
10261113
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl:
10271114
; GFX950-GISEL: ; %bb.0:
10281115
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10291116
; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
10301117
; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
1031-
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
1032-
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
1033-
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
1034-
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000
1035-
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6
1118+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5]
1119+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
1120+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
1121+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000
1122+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
10361123
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
10371124
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float 100.0)
10381125
ret <32 x half> %ret
@@ -1042,7 +1129,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv(<6 x i32> %src, float %
10421129
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
10431130
; GFX950-SDAG: ; %bb.0:
10441131
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1045-
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], v6
1132+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
1133+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
1134+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
1135+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
1136+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
1137+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
1138+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
1139+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22
10461140
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
10471141
;
10481142
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
@@ -1074,14 +1168,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl(<6 x i32> inreg %src) {
10741168
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
10751169
; GFX950-SDAG: ; %bb.0:
10761170
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1077-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
1078-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
1079-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
1080-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
1081-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16
1082-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17
1171+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
1172+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
1173+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
1174+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
1175+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
1176+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
10831177
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
1084-
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], s0
1178+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0
10851179
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
10861180
;
10871181
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:

0 commit comments

Comments
 (0)