@@ -917,19 +917,22 @@ define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
917
917
; The above tests test the types, the below tests check that the addressing
918
918
; modes still function
919
919
920
+ ; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
920
921
define void @masked_gather_32b_scaled_sext_f16 (<32 x half >* %a , <32 x i32 >* %b , half * %base ) #0 {
921
922
; CHECK-LABEL: masked_gather_32b_scaled_sext_f16:
922
923
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
923
924
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
924
- ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s , vl32
925
- ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
925
+ ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d , vl32
926
+ ; VBITS_GE_2048-NEXT: ld1sw { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
926
927
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
927
928
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
928
- ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
929
- ; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
930
- ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw #1]
931
- ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h
932
- ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0]
929
+ ; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
930
+ ; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
931
+ ; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
932
+ ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, lsl #1]
933
+ ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
934
+ ; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
935
+ ; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0]
933
936
; VBITS_GE_2048-NEXT: ret
934
937
%cvals = load <32 x half >, <32 x half >* %a
935
938
%idxs = load <32 x i32 >, <32 x i32 >* %b
@@ -941,14 +944,20 @@ define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b,
941
944
ret void
942
945
}
943
946
947
+ ; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
944
948
define void @masked_gather_32b_scaled_sext_f32 (<32 x float >* %a , <32 x i32 >* %b , float * %base ) #0 {
945
949
; CHECK-LABEL: masked_gather_32b_scaled_sext_f32:
946
- ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl32
947
- ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG]]/z, [x0]
948
- ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG]]/z, [x1]
949
- ; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG]]/z, [[VALS]].s, #0.0
950
- ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw #2]
951
- ; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
950
+ ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32
951
+ ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
952
+ ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
953
+ ; VBITS_GE_2048-NEXT: ld1sw { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
954
+ ; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
955
+ ; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1
956
+ ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
957
+ ; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
958
+ ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, lsl #2]
959
+ ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s
960
+ ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0]
952
961
; VBITS_GE_2048-NEXT: ret
953
962
%cvals = load <32 x float >, <32 x float >* %a
954
963
%idxs = load <32 x i32 >, <32 x i32 >* %b
@@ -960,15 +969,15 @@ define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b,
960
969
ret void
961
970
}
962
971
972
+ ; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
963
973
define void @masked_gather_32b_scaled_sext_f64 (<32 x double >* %a , <32 x i32 >* %b , double * %base ) #0 {
964
974
; CHECK-LABEL: masked_gather_32b_scaled_sext_f64:
965
- ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].d, vl32
966
- ; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
967
- ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
968
- ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
969
- ; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0
970
- ; VBITS_GE_2048-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, sxtw #3]
971
- ; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG0]], [x0]
975
+ ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
976
+ ; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG]]/z, [x0]
977
+ ; VBITS_GE_2048-NEXT: ld1sw { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1]
978
+ ; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG]]/z, [[VALS]].d, #0.0
979
+ ; VBITS_GE_2048-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, lsl #3]
980
+ ; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
972
981
; VBITS_GE_2048-NEXT: ret
973
982
%cvals = load <32 x double >, <32 x double >* %a
974
983
%idxs = load <32 x i32 >, <32 x i32 >* %b
@@ -980,19 +989,22 @@ define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b
980
989
ret void
981
990
}
982
991
992
+ ; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
983
993
define void @masked_gather_32b_scaled_zext (<32 x half >* %a , <32 x i32 >* %b , half * %base ) #0 {
984
994
; CHECK-LABEL: masked_gather_32b_scaled_zext:
985
995
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
986
996
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
987
- ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s , vl32
988
- ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
997
+ ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d , vl32
998
+ ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
989
999
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
990
1000
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
991
- ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
992
- ; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
993
- ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, uxtw #1]
994
- ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h
995
- ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0]
1001
+ ; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
1002
+ ; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
1003
+ ; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
1004
+ ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, lsl #1]
1005
+ ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
1006
+ ; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
1007
+ ; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0]
996
1008
; VBITS_GE_2048-NEXT: ret
997
1009
%cvals = load <32 x half >, <32 x half >* %a
998
1010
%idxs = load <32 x i32 >, <32 x i32 >* %b
@@ -1004,19 +1016,22 @@ define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half
1004
1016
ret void
1005
1017
}
1006
1018
1019
+ ; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
1007
1020
define void @masked_gather_32b_unscaled_sext (<32 x half >* %a , <32 x i32 >* %b , i8* %base ) #0 {
1008
1021
; CHECK-LABEL: masked_gather_32b_unscaled_sext:
1009
1022
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
1010
1023
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
1011
- ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s , vl32
1012
- ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
1024
+ ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d , vl32
1025
+ ; VBITS_GE_2048-NEXT: ld1sw { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
1013
1026
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
1014
1027
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
1015
- ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
1016
- ; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
1017
- ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw]
1018
- ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h
1019
- ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0]
1028
+ ; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
1029
+ ; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
1030
+ ; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
1031
+ ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d]
1032
+ ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
1033
+ ; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
1034
+ ; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0]
1020
1035
; VBITS_GE_2048-NEXT: ret
1021
1036
%cvals = load <32 x half >, <32 x half >* %a
1022
1037
%idxs = load <32 x i32 >, <32 x i32 >* %b
@@ -1029,19 +1044,22 @@ define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8
1029
1044
ret void
1030
1045
}
1031
1046
1047
+ ; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
1032
1048
define void @masked_gather_32b_unscaled_zext (<32 x half >* %a , <32 x i32 >* %b , i8* %base ) #0 {
1033
1049
; CHECK-LABEL: masked_gather_32b_unscaled_zext:
1034
1050
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
1035
1051
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
1036
- ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s , vl32
1037
- ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
1052
+ ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d , vl32
1053
+ ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
1038
1054
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
1039
1055
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
1040
- ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
1041
- ; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
1042
- ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, uxtw]
1043
- ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h
1044
- ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0]
1056
+ ; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
1057
+ ; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
1058
+ ; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
1059
+ ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d]
1060
+ ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
1061
+ ; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
1062
+ ; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0]
1045
1063
; VBITS_GE_2048-NEXT: ret
1046
1064
%cvals = load <32 x half >, <32 x half >* %a
1047
1065
%idxs = load <32 x i32 >, <32 x i32 >* %b
0 commit comments