Skip to content

Commit 14e1a4a

Browse files
committed
[AArch64][SVE] Workaround incorrect types when lowering fixed length gather/scatter
When lowering a fixed length gather/scatter the index type is assumed to be the same as the memory type; this is incorrect in cases where the extension of the index has been folded into the addressing mode. For now, add a temporary workaround to fix the codegen faults caused by this by preventing the removal of this extension. At a later date the lowering for SVE gather/scatters will be redesigned to improve the way addressing modes are handled. As a short term side effect of this change, the addressing modes generated for fixed length gather/scatters will not be optimal. Differential Revision: https://reviews.llvm.org/D109145
1 parent 3fd27ec commit 14e1a4a

File tree

3 files changed

+115
-77
lines changed

3 files changed

+115
-77
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4222,7 +4222,8 @@ bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
42224222

42234223
bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
42244224
if (VT.getVectorElementType() == MVT::i32 &&
4225-
VT.getVectorElementCount().getKnownMinValue() >= 4)
4225+
VT.getVectorElementCount().getKnownMinValue() >= 4 &&
4226+
!VT.isFixedLengthVector())
42264227
return true;
42274228

42284229
return false;

llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll

Lines changed: 59 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -917,19 +917,22 @@ define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
917917
; The above tests test the types, the below tests check that the addressing
918918
; modes still function
919919

920+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
920921
define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
921922
; CHECK-LABEL: masked_gather_32b_scaled_sext_f16:
922923
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
923924
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
924-
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
925-
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
925+
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
926+
; VBITS_GE_2048-NEXT: ld1sw { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
926927
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
927928
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
928-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
929-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
930-
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw #1]
931-
; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h
932-
; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0]
929+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
930+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
931+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
932+
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, lsl #1]
933+
; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
934+
; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
935+
; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0]
933936
; VBITS_GE_2048-NEXT: ret
934937
%cvals = load <32 x half>, <32 x half>* %a
935938
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -941,14 +944,20 @@ define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b,
941944
ret void
942945
}
943946

947+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
944948
define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 {
945949
; CHECK-LABEL: masked_gather_32b_scaled_sext_f32:
946-
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl32
947-
; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG]]/z, [x0]
948-
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG]]/z, [x1]
949-
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG]]/z, [[VALS]].s, #0.0
950-
; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw #2]
951-
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
950+
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32
951+
; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
952+
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
953+
; VBITS_GE_2048-NEXT: ld1sw { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
954+
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
955+
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1
956+
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
957+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
958+
; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, lsl #2]
959+
; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s
960+
; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0]
952961
; VBITS_GE_2048-NEXT: ret
953962
%cvals = load <32 x float>, <32 x float>* %a
954963
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -960,15 +969,15 @@ define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b,
960969
ret void
961970
}
962971

972+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
963973
define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 {
964974
; CHECK-LABEL: masked_gather_32b_scaled_sext_f64:
965-
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].d, vl32
966-
; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
967-
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
968-
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
969-
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0
970-
; VBITS_GE_2048-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, sxtw #3]
971-
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG0]], [x0]
975+
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
976+
; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG]]/z, [x0]
977+
; VBITS_GE_2048-NEXT: ld1sw { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1]
978+
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG]]/z, [[VALS]].d, #0.0
979+
; VBITS_GE_2048-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, lsl #3]
980+
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
972981
; VBITS_GE_2048-NEXT: ret
973982
%cvals = load <32 x double>, <32 x double>* %a
974983
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -980,19 +989,22 @@ define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b
980989
ret void
981990
}
982991

992+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
983993
define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
984994
; CHECK-LABEL: masked_gather_32b_scaled_zext:
985995
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
986996
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
987-
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
988-
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
997+
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
998+
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
989999
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
9901000
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
991-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
992-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
993-
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, uxtw #1]
994-
; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h
995-
; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0]
1001+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
1002+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
1003+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
1004+
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, lsl #1]
1005+
; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
1006+
; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
1007+
; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0]
9961008
; VBITS_GE_2048-NEXT: ret
9971009
%cvals = load <32 x half>, <32 x half>* %a
9981010
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -1004,19 +1016,22 @@ define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half
10041016
ret void
10051017
}
10061018

1019+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
10071020
define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
10081021
; CHECK-LABEL: masked_gather_32b_unscaled_sext:
10091022
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
10101023
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
1011-
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
1012-
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
1024+
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
1025+
; VBITS_GE_2048-NEXT: ld1sw { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
10131026
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
10141027
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
1015-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
1016-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
1017-
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw]
1018-
; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h
1019-
; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0]
1028+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
1029+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
1030+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
1031+
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d]
1032+
; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
1033+
; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
1034+
; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0]
10201035
; VBITS_GE_2048-NEXT: ret
10211036
%cvals = load <32 x half>, <32 x half>* %a
10221037
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -1029,19 +1044,22 @@ define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8
10291044
ret void
10301045
}
10311046

1047+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
10321048
define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
10331049
; CHECK-LABEL: masked_gather_32b_unscaled_zext:
10341050
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
10351051
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
1036-
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
1037-
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
1052+
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
1053+
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
10381054
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
10391055
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
1040-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
1041-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
1042-
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, uxtw]
1043-
; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h
1044-
; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0]
1056+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
1057+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
1058+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
1059+
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d]
1060+
; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
1061+
; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
1062+
; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0]
10451063
; VBITS_GE_2048-NEXT: ret
10461064
%cvals = load <32 x half>, <32 x half>* %a
10471065
%idxs = load <32 x i32>, <32 x i32>* %b

llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll

Lines changed: 54 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -839,18 +839,22 @@ define void @masked_scatter_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
839839

840840
; The above tests test the types, the below tests check that the addressing
841841
; modes still function
842+
843+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
842844
define void @masked_scatter_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
843845
; CHECK-LABEL: masked_scatter_32b_scaled_sext_f16:
844846
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
845847
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
846-
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
847-
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
848+
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
849+
; VBITS_GE_2048-NEXT: ld1sw { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
848850
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
849851
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1
850-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
851-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
852-
; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h
853-
; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw #1]
852+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
853+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h
854+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
855+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
856+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
857+
; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], [x2, [[PTRS]].d, lsl #1]
854858
; VBITS_GE_2048-NEXT: ret
855859
%vals = load <32 x half>, <32 x half>* %a
856860
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -861,13 +865,19 @@ define void @masked_scatter_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b,
861865
ret void
862866
}
863867

868+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
864869
define void @masked_scatter_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 {
865870
; CHECK-LABEL: masked_scatter_32b_scaled_sext_f32:
866-
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl32
867-
; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG]]/z, [x0]
868-
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG]]/z, [x1]
869-
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG]]/z, [[VALS]].s, #0.0
870-
; VBITS_GE_2048-NEXT: st1w { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw #2]
871+
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32
872+
; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
873+
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
874+
; VBITS_GE_2048-NEXT: ld1sw { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
875+
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
876+
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[PG0]]/z, #-1
877+
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
878+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
879+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s
880+
; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], [x2, [[PTRS]].d, lsl #2]
871881
; VBITS_GE_2048-NEXT: ret
872882
%vals = load <32 x float>, <32 x float>* %a
873883
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -878,14 +888,14 @@ define void @masked_scatter_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b
878888
ret void
879889
}
880890

891+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
881892
define void @masked_scatter_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 {
882893
; CHECK-LABEL: masked_scatter_32b_scaled_sext_f64:
883-
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].d, vl32
884-
; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
885-
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
886-
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
887-
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0
888-
; VBITS_GE_2048-NEXT: st1d { [[VALS]].d }, [[MASK]], [x2, [[PTRS]].d, sxtw #3]
894+
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
895+
; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG]]/z, [x0]
896+
; VBITS_GE_2048-NEXT: ld1sw { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1]
897+
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG]]/z, [[VALS]].d, #0.0
898+
; VBITS_GE_2048-NEXT: st1d { [[VALS]].d }, [[MASK]], [x2, [[PTRS]].d, lsl #3]
889899
; VBITS_GE_2048-NEXT: ret
890900
%vals = load <32 x double>, <32 x double>* %a
891901
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -896,18 +906,21 @@ define void @masked_scatter_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %
896906
ret void
897907
}
898908

909+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
899910
define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
900911
; CHECK-LABEL: masked_scatter_32b_scaled_zext:
901912
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
902913
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
903-
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
904-
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
914+
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
915+
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
905916
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
906917
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1
907-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
908-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
909-
; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h
910-
; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw #1]
918+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
919+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h
920+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
921+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
922+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
923+
; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], [x2, [[PTRS]].d, lsl #1]
911924
; VBITS_GE_2048-NEXT: ret
912925
%vals = load <32 x half>, <32 x half>* %a
913926
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -918,18 +931,21 @@ define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, hal
918931
ret void
919932
}
920933

934+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
921935
define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
922936
; CHECK-LABEL: masked_scatter_32b_unscaled_sext:
923937
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
924938
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
925-
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
926-
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
939+
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
940+
; VBITS_GE_2048-NEXT: ld1sw { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
927941
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
928942
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1
929-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
930-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
931-
; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h
932-
; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw]
943+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
944+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h
945+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
946+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
947+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
948+
; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], [x2, [[PTRS]].d]
933949
; VBITS_GE_2048-NEXT: ret
934950
%vals = load <32 x half>, <32 x half>* %a
935951
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -941,18 +957,21 @@ define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i
941957
ret void
942958
}
943959

960+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
944961
define void @masked_scatter_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
945962
; CHECK-LABEL: masked_scatter_32b_unscaled_zext:
946963
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
947964
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
948-
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
949-
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
965+
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
966+
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
950967
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
951968
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1
952-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
953-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
954-
; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h
955-
; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw]
969+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
970+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h
971+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
972+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
973+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
974+
; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], [x2, [[PTRS]].d]
956975
; VBITS_GE_2048-NEXT: ret
957976
%vals = load <32 x half>, <32 x half>* %a
958977
%idxs = load <32 x i32>, <32 x i32>* %b

0 commit comments

Comments
 (0)