Skip to content

Commit e86b4a6

Browse files
arsenm and pravinjagtap committed
AMDGPU: Allocate different registers for vdst & src in v_cvt_scalef32* (llvm#117822)
For multipass instructions, an overlap between the VDST and SRC registers would result in a hardware race and undefined results. Co-authored-by: Pravin Jagtap <[email protected]>
1 parent 45b75ca commit e86b4a6

5 files changed

+353
-257
lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1091,9 +1091,11 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
10911091
defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
10921092
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
10931093
defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
1094-
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
1095-
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
1096-
defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
1094+
let Constraints = "@earlyclobber $vdst" in {
1095+
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
1096+
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
1097+
defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
1098+
}
10971099
}
10981100
defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
10991101
defm V_CVT_SCALEF32_PK_BF16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_bf16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2bf16>>;
@@ -1106,7 +1108,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
11061108
}
11071109
}
11081110

1109-
let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
1111+
let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in {
11101112
defm V_CVT_SCALEF32_PK32_F32_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_fp6>;
11111113
defm V_CVT_SCALEF32_PK32_F32_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_bf6>;
11121114
defm V_CVT_SCALEF32_PK32_F16_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f16_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f16_fp6>;
@@ -1115,7 +1117,7 @@ let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0
11151117
defm V_CVT_SCALEF32_PK32_BF16_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_bf16_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32BF16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_bf16_bf6>;
11161118
}
11171119

1118-
let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
1120+
let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in {
11191121
defm V_CVT_SCALEF32_PK32_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_f16>;
11201122
defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_f16>;
11211123
defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_bf16>;

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll

Lines changed: 154 additions & 60 deletions
Original file line number | Diff line number | Diff line change
@@ -864,31 +864,91 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte3(i32 %src, float %scale) {
864864
}
865865

866866
define <32 x float> @test_cvt_scale_pk32_f32_fp6(<6 x i32> %src, float %scale) {
867-
; GCN-LABEL: test_cvt_scale_pk32_f32_fp6:
868-
; GCN: ; %bb.0:
869-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870-
; GCN-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[0:5], v6
871-
; GCN-NEXT: s_setpc_b64 s[30:31]
867+
; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_fp6:
868+
; GFX950-SDAG: ; %bb.0:
869+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, v6
871+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, v5
872+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, v4
873+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, v3
874+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, v2
875+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, v1
876+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0
877+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38
878+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
879+
;
880+
; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_fp6:
881+
; GFX950-GISEL: ; %bb.0:
882+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
883+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0
884+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v33, v1
885+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, v2
886+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v35, v3
887+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v36, v4
888+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v37, v5
889+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v38, v6
890+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38
891+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
872892
%ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> %src, float %scale)
873893
ret <32 x float> %ret
874894
}
875895

876896
define <32 x float> @test_cvt_scale_pk32_f32_bf6(<6 x i32> %src, float %scale) {
877-
; GCN-LABEL: test_cvt_scale_pk32_f32_bf6:
878-
; GCN: ; %bb.0:
879-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
880-
; GCN-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[0:5], v6
881-
; GCN-NEXT: s_setpc_b64 s[30:31]
897+
; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_bf6:
898+
; GFX950-SDAG: ; %bb.0:
899+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
900+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, v6
901+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, v5
902+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, v4
903+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, v3
904+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, v2
905+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, v1
906+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0
907+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38
908+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
909+
;
910+
; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_bf6:
911+
; GFX950-GISEL: ; %bb.0:
912+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
913+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0
914+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v33, v1
915+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, v2
916+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v35, v3
917+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v36, v4
918+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v37, v5
919+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v38, v6
920+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38
921+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
882922
%ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> %src, float %scale)
883923
ret <32 x float> %ret
884924
}
885925

886926
define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_vv(<6 x i32> %src, float %scale) {
887-
; GCN-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
888-
; GCN: ; %bb.0:
889-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
890-
; GCN-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6
891-
; GCN-NEXT: s_setpc_b64 s[30:31]
927+
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
928+
; GFX950-SDAG: ; %bb.0:
929+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
930+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
931+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
932+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
933+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
934+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
935+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
936+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
937+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
938+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
939+
;
940+
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
941+
; GFX950-GISEL: ; %bb.0:
942+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
943+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0
944+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, v1
945+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v2
946+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v3
947+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v4
948+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v5
949+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, v6
950+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
951+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
892952
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float %scale)
893953
ret <32 x half> %ret
894954
}
@@ -897,24 +957,24 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) {
897957
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl:
898958
; GFX950-SDAG: ; %bb.0:
899959
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
900-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
901-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
902-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
903-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
904-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s4
905-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s5
960+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
961+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
962+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
963+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
964+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s4
965+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s5
906966
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
907-
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], s0
967+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], s0
908968
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
909969
;
910970
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl:
911971
; GFX950-GISEL: ; %bb.0:
912972
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
913-
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
914-
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
915-
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
916-
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000
917-
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6
973+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5]
974+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
975+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
976+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000
977+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
918978
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
919979
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float 100.0)
920980
ret <32 x half> %ret
@@ -924,7 +984,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv(<6 x i32> %src, float %
924984
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
925985
; GFX950-SDAG: ; %bb.0:
926986
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
927-
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], v6
987+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
988+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
989+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
990+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
991+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
992+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
993+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
994+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22
928995
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
929996
;
930997
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
@@ -956,14 +1023,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
9561023
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
9571024
; GFX950-SDAG: ; %bb.0:
9581025
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
959-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
960-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
961-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
962-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
963-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s4
964-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s5
1026+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
1027+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
1028+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
1029+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
1030+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s4
1031+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s5
9651032
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
966-
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], s0
1033+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0
9671034
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
9681035
;
9691036
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
@@ -996,11 +1063,31 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
9961063
}
9971064

9981065
define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_vv(<6 x i32> %src, float %scale) {
999-
; GCN-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
1000-
; GCN: ; %bb.0:
1001-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1002-
; GCN-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6
1003-
; GCN-NEXT: s_setpc_b64 s[30:31]
1066+
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
1067+
; GFX950-SDAG: ; %bb.0:
1068+
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1069+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
1070+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
1071+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
1072+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
1073+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
1074+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
1075+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
1076+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
1077+
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
1078+
;
1079+
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
1080+
; GFX950-GISEL: ; %bb.0:
1081+
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1082+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0
1083+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, v1
1084+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v2
1085+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v3
1086+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v4
1087+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v5
1088+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, v6
1089+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
1090+
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
10041091
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float %scale)
10051092
ret <32 x half> %ret
10061093
}
@@ -1009,24 +1096,24 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) {
10091096
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl:
10101097
; GFX950-SDAG: ; %bb.0:
10111098
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1012-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
1013-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
1014-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
1015-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
1016-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s4
1017-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s5
1099+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
1100+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
1101+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
1102+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
1103+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s4
1104+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s5
10181105
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
1019-
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], s0
1106+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], s0
10201107
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
10211108
;
10221109
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl:
10231110
; GFX950-GISEL: ; %bb.0:
10241111
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1025-
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
1026-
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
1027-
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
1028-
; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000
1029-
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6
1112+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5]
1113+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
1114+
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
1115+
; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000
1116+
; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
10301117
; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
10311118
%ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float 100.0)
10321119
ret <32 x half> %ret
@@ -1036,7 +1123,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv(<6 x i32> %src, float %
10361123
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
10371124
; GFX950-SDAG: ; %bb.0:
10381125
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1039-
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], v6
1126+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
1127+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
1128+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
1129+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
1130+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
1131+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
1132+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
1133+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22
10401134
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
10411135
;
10421136
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
@@ -1068,14 +1162,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl(<6 x i32> inreg %src) {
10681162
; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
10691163
; GFX950-SDAG: ; %bb.0:
10701164
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1071-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0
1072-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1
1073-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2
1074-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3
1075-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s4
1076-
; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s5
1165+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
1166+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
1167+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
1168+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
1169+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s4
1170+
; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s5
10771171
; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
1078-
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], s0
1172+
; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0
10791173
; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
10801174
;
10811175
; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:

0 commit comments

Comments
 (0)