Commit 6e51ceb

[AArch64][SVE] Add intrinsics for gather loads with 64-bit offsets
This patch adds the following intrinsics for gather loads with 64-bit offsets:

  * @llvm.aarch64.sve.ld1.gather (unscaled offset)
  * @llvm.aarch64.sve.ld1.gather.index (scaled offset)

These intrinsics map 1-1 to the following AArch64 instructions, respectively (examples for half-words):

  * ld1h { z0.d }, p0/z, [x0, z0.d]
  * ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]

Committing on behalf of Andrzej Warzynski (andwar)

Reviewers: sdesmalen, huntergr, rovka, mgudim, dancgr, rengolin, efriedma

Reviewed By: efriedma

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70542
Parent commit: 409350d

9 files changed: +310, -29 lines
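To make the commit message concrete, here is a minimal IR-level sketch of the unscaled-offset intrinsic (the scaled .index variant is exercised by the new test file at the end of this diff). The function name and values are illustrative, not from the patch; the signature follows the TableGen definition below: a predicate, a pointer to the element type, and a vector of 64-bit offsets.

; Illustrative sketch (not part of the commit). Gathers i64 elements from
; %base plus the per-lane byte offsets in %offsets, under predicate %pg.
; Expected selection: ld1d { z0.d }, p0/z, [x0, z0.d]
define <vscale x 2 x i64> @sketch_gather(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %offsets) {
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %offsets)
  ret <vscale x 2 x i64> %load
}
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)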

llvm/include/llvm/IR/IntrinsicsAArch64.td (+19)
@@ -942,6 +942,15 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                 llvm_i32_ty],
                [IntrNoMem]>;
 
+  class AdvSIMD_GatherLoad_64bitOffset_Intrinsic
+      : Intrinsic<[llvm_anyvector_ty],
+                  [
+                    LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                    LLVMPointerToElt<0>,
+                    LLVMScalarOrSameVectorWidth<0, llvm_i64_ty>
+                  ],
+                  [IntrReadMem, IntrArgMemOnly]>;
+
   // This class of intrinsics are not intended to be useful within LLVM IR but
   // are instead here to support some of the more regid parts of the ACLE.
   class Builtin_SVCVT<string name, LLVMType OUT, LLVMType IN>
@@ -1172,4 +1181,14 @@ def int_aarch64_sve_ucvtf_f64i32 : Builtin_SVCVT<"svcvt_f64_u32_m", llvm_nxv2
 
 def int_aarch64_sve_punpkhi : AdvSIMD_SVE_PUNPKHI_Intrinsic;
 def int_aarch64_sve_punpklo : AdvSIMD_SVE_PUNPKHI_Intrinsic;
+
+//
+// Gather loads:
+//
+
+// scalar + vector, 64 bit unscaled offsets
+def int_aarch64_sve_ld1_gather : AdvSIMD_GatherLoad_64bitOffset_Intrinsic;
+
+// scalar + vector, 64 bit scaled offsets
+def int_aarch64_sve_ld1_gather_index : AdvSIMD_GatherLoad_64bitOffset_Intrinsic;
 }
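Note on the class definition above (editorial, inferred from the TableGen constraints rather than stated in the commit): the intrinsic is overloaded on its result vector type (llvm_anyvector_ty). The governing predicate and the 64-bit offset vector take their element count from the result via LLVMScalarOrSameVectorWidth, and LLVMPointerToElt<0> makes the base operand a pointer to the result's scalar element type, so a single class covers every element width.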

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+85)
@@ -1336,6 +1336,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::UUNPKHI:           return "AArch64ISD::UUNPKHI";
   case AArch64ISD::UUNPKLO:           return "AArch64ISD::UUNPKLO";
   case AArch64ISD::INSR:              return "AArch64ISD::INSR";
+  case AArch64ISD::GLD1:              return "AArch64ISD::GLD1";
+  case AArch64ISD::GLD1_SCALED:       return "AArch64ISD::GLD1_SCALED";
   }
   return nullptr;
 }
@@ -11760,6 +11762,85 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
                      DAG.getConstant(MinOffset, DL, MVT::i64));
 }
 
+// Returns an SVE type that ContentTy can be trivially sign or zero extended
+// into.
+static MVT getSVEContainerType(EVT ContentTy) {
+  assert(ContentTy.isSimple() && "No SVE containers for extended types");
+
+  switch (ContentTy.getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("No known SVE container for this MVT type");
+  case MVT::nxv2i8:
+  case MVT::nxv2i16:
+  case MVT::nxv2i32:
+  case MVT::nxv2i64:
+  case MVT::nxv2f32:
+  case MVT::nxv2f64:
+    return MVT::nxv2i64;
+  case MVT::nxv4i8:
+  case MVT::nxv4i16:
+  case MVT::nxv4i32:
+  case MVT::nxv4f32:
+    return MVT::nxv4i32;
+  }
+}
+
+static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
+                                       unsigned Opcode) {
+  EVT RetVT = N->getValueType(0);
+  assert(RetVT.isScalableVector() &&
+         "Gather loads are only possible for SVE vectors");
+
+  SDLoc DL(N);
+  MVT RetElVT = RetVT.getVectorElementType().getSimpleVT();
+  unsigned NumElements = AArch64::SVEBitsPerBlock / RetElVT.getSizeInBits();
+
+  EVT MaxVT = llvm::MVT::getScalableVectorVT(RetElVT, NumElements);
+  if (RetVT.getSizeInBits().getKnownMinSize() >
+      MaxVT.getSizeInBits().getKnownMinSize())
+    return SDValue();
+
+  // Depending on the addressing mode, this is either a pointer or a vector of
+  // pointers (that fits into one register)
+  const SDValue Base = N->getOperand(3);
+  // Depending on the addressing mode, this is either a single offset or a
+  // vector of offsets (that fits into one register)
+  const SDValue Offset = N->getOperand(4);
+
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(Base.getValueType()) ||
+      !DAG.getTargetLoweringInfo().isTypeLegal(Offset.getValueType()))
+    return SDValue();
+
+  // Return value type that is representable in hardware
+  EVT HwRetVt = getSVEContainerType(RetVT);
+
+  // Keep the original output value type around - this will better inform
+  // optimisations (e.g. instruction folding when load is followed by
+  // zext/sext). This will only be used for ints, so the value for FPs
+  // doesn't matter.
+  SDValue OutVT = DAG.getValueType(RetVT);
+  if (RetVT.isFloatingPoint())
+    OutVT = DAG.getValueType(HwRetVt);
+
+  SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
+  SDValue Ops[] = {N->getOperand(0), // Chain
+                   N->getOperand(2), // Pg
+                   Base, Offset, OutVT};
+
+  SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
+  SDValue LoadChain = SDValue(Load.getNode(), 1);
+
+  if (RetVT.isInteger() && (RetVT != HwRetVt))
+    Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
+
+  // If the original return value was FP, bitcast accordingly. Doing it here
+  // means that we can avoid adding TableGen patterns for FPs.
+  if (RetVT.isFloatingPoint())
+    Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
+
+  return DAG.getMergeValues({Load, LoadChain}, DL);
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -11846,6 +11927,10 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case Intrinsic::aarch64_neon_st3lane:
   case Intrinsic::aarch64_neon_st4lane:
     return performNEONPostLDSTCombine(N, DCI, DAG);
+  case Intrinsic::aarch64_sve_ld1_gather:
+    return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1);
+  case Intrinsic::aarch64_sve_ld1_gather_index:
+    return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED);
   default:
     break;
   }
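The practical effect of the container-type logic above: a gather whose result is narrower than the hardware container (e.g. nxv2i16) is emitted as an nxv2i64 GLD1 and then truncated, and recording the original element type in OutVT is what should later allow the zext/sext to be folded into ld1h/ld1sh. A hedged IR sketch, with illustrative names, mirroring the shape of the new tests:

; Illustrative sketch (not part of the commit). The i16 gather is performed
; in an nxv2i64 container and truncated; the zext below is currently
; materialised as an explicit mask rather than folded into the load.
define <vscale x 2 x i64> @sketch_gather_zext(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %offsets) {
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %offsets)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)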

llvm/lib/Target/AArch64/AArch64ISelLowering.h (+4)
@@ -198,6 +198,10 @@ enum NodeType : unsigned {
 
   INSR,
 
+  // Unsigned gather loads.
+  GLD1,
+  GLD1_SCALED,
+
   // NEON Load/Store with post-increment base updates
   LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
   LD3post,

llvm/lib/Target/AArch64/AArch64InstrFormats.td (+16, -3)
@@ -358,24 +358,37 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
 def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
 def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;
 
+def UImmS2XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
+}]>;
+def UImmS4XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() / 4, SDLoc(N), MVT::i64);
+}]>;
+def UImmS8XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() / 8, SDLoc(N), MVT::i64);
+}]>;
+
 // uimm5sN predicate - True if the immediate is a multiple of N in the range
 // [0 * N, 32 * N].
 def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>;
 def UImm5s4Operand : UImmScaledMemoryIndexed<5, 4>;
 def UImm5s8Operand : UImmScaledMemoryIndexed<5, 8>;
 
 def uimm5s2 : Operand<i64>, ImmLeaf<i64,
-                [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }]> {
+                [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }],
+                UImmS2XForm> {
   let ParserMatchClass = UImm5s2Operand;
   let PrintMethod = "printImmScale<2>";
 }
 def uimm5s4 : Operand<i64>, ImmLeaf<i64,
-                [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }]> {
+                [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }],
+                UImmS4XForm> {
   let ParserMatchClass = UImm5s4Operand;
   let PrintMethod = "printImmScale<4>";
 }
 def uimm5s8 : Operand<i64>, ImmLeaf<i64,
-                [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }]> {
+                [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }],
+                UImmS8XForm> {
   let ParserMatchClass = UImm5s8Operand;
   let PrintMethod = "printImmScale<8>";
 }
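A note on the new SDNodeXForms (my reading of the code, not stated in the commit message): when a pattern matches a byte-valued immediate against one of these scaled operands, the XForm divides it back down to the index that the instruction encodes. For example, with uimm5s8 a matched immediate of 24 would be emitted as the target constant 3.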

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+32, -24)
@@ -10,6 +10,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [
+  SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
+  SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
+def AArch64ld1_gather        : SDNode<"AArch64ISD::GLD1",        SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+
 let Predicates = [HasSVE] in {
 
   def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">;
@@ -454,33 +462,33 @@ let Predicates = [HasSVE] in {
 
   // Gathers using unscaled 64-bit offsets, e.g.
   //    ld1h z0.d, p0/z, [x0, z0.d]
-  defm GLD1SB_D   : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb">;
-  defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb">;
-  defm GLD1B_D    : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b">;
-  defm GLDFF1B_D  : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b">;
-  defm GLD1SH_D   : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh">;
-  defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh">;
-  defm GLD1H_D    : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h">;
-  defm GLDFF1H_D  : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h">;
-  defm GLD1SW_D   : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw">;
-  defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw">;
-  defm GLD1W_D    : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w">;
-  defm GLDFF1W_D  : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w">;
-  defm GLD1D      : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d">;
-  defm GLDFF1D    : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d">;
+  defm GLD1SB_D   : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb",   null_frag,         nxv2i8>;
+  defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", null_frag,         nxv2i8>;
+  defm GLD1B_D    : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b",    AArch64ld1_gather, nxv2i8>;
+  defm GLDFF1B_D  : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b",  null_frag,         nxv2i8>;
+  defm GLD1SH_D   : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh",   null_frag,         nxv2i16>;
+  defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", null_frag,         nxv2i16>;
+  defm GLD1H_D    : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h",    AArch64ld1_gather, nxv2i16>;
+  defm GLDFF1H_D  : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h",  null_frag,         nxv2i16>;
+  defm GLD1SW_D   : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw",   null_frag,         nxv2i32>;
+  defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", null_frag,         nxv2i32>;
+  defm GLD1W_D    : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w",    AArch64ld1_gather, nxv2i32>;
+  defm GLDFF1W_D  : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w",  null_frag,         nxv2i32>;
+  defm GLD1D      : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d",    AArch64ld1_gather, nxv2i64>;
+  defm GLDFF1D    : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d",  null_frag,         nxv2i64>;
 
   // Gathers using scaled 64-bit offsets, e.g.
   //    ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
-  defm GLD1SH_D   : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh",   ZPR64ExtLSL16>;
-  defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", ZPR64ExtLSL16>;
-  defm GLD1H_D    : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h",    ZPR64ExtLSL16>;
-  defm GLDFF1H_D  : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h",  ZPR64ExtLSL16>;
-  defm GLD1SW_D   : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw",   ZPR64ExtLSL32>;
-  defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", ZPR64ExtLSL32>;
-  defm GLD1W_D    : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w",    ZPR64ExtLSL32>;
-  defm GLDFF1W_D  : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w",  ZPR64ExtLSL32>;
-  defm GLD1D      : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d",    ZPR64ExtLSL64>;
-  defm GLDFF1D    : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d",  ZPR64ExtLSL64>;
+  defm GLD1SH_D   : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh",   null_frag,                ZPR64ExtLSL16, nxv2i16>;
+  defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", null_frag,                ZPR64ExtLSL16, nxv2i16>;
+  defm GLD1H_D    : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h",    AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>;
+  defm GLDFF1H_D  : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h",  null_frag,                ZPR64ExtLSL16, nxv2i16>;
+  defm GLD1SW_D   : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw",   null_frag,                ZPR64ExtLSL32, nxv2i32>;
+  defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", null_frag,                ZPR64ExtLSL32, nxv2i32>;
+  defm GLD1W_D    : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w",    AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>;
+  defm GLDFF1W_D  : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w",  null_frag,                ZPR64ExtLSL32, nxv2i32>;
+  defm GLD1D      : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d",    AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>;
+  defm GLDFF1D    : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d",  null_frag,                ZPR64ExtLSL64, nxv2i64>;
 
   // Gathers using unscaled 32-bit offsets unpacked in 64-bits elements, e.g.
   //    ld1h z0.d, p0/z, [x0, z0.d, uxtw]
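Worth noting from the table above: only the unsigned, non-faulting forms (ld1b, ld1h, ld1w, ld1d) are wired up to the new AArch64ld1_gather / AArch64ld1_gather_scaled nodes in this patch. The sign-extending (ld1s*) and first-faulting (ldff1*) variants keep null_frag, so they remain assembly-only for now.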

llvm/lib/Target/AArch64/SVEInstrFormats.td (+10, -2)
@@ -5584,18 +5584,26 @@ multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm,
 }
 
 multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm,
-                                         RegisterOperand zprext> {
+                                         SDPatternOperator op,
+                                         RegisterOperand zprext, ValueType vt> {
   def _SCALED_REAL : sve_mem_64b_gld_sv<opc, 1, 1, 1, asm, zprext>;
 
   def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
                   (!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
+
+  def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
+            (!cast<Instruction>(NAME # _SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
 }
 
-multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm> {
+multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm,
+                                           SDPatternOperator op, ValueType vt> {
   def _REAL : sve_mem_64b_gld_sv<opc, 1, 0, 1, asm, ZPR64ExtLSL8>;
 
   def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
                   (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
+
+  def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
+            (!cast<Instruction>(NAME # _REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
 }
 
 class sve_mem_64b_gld_vi<bits<4> opc, string asm, Operand imm_ty>

llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h (+11)
@@ -643,6 +643,17 @@ namespace AArch64II {
 };
 } // end namespace AArch64II
 
+namespace AArch64 {
+// The number of bits in a SVE register is architecturally defined
+// to be a multiple of this value. If <M x t> has this number of bits,
+// a <n x M x t> vector can be stored in a SVE register without any
+// redundant bits. If <M x t> has this number of bits divided by P,
+// a <n x M x t> vector is stored in a SVE register by placing index i
+// in index i*P of a <n x (M*P) x t> vector. The other elements of the
+// <n x (M*P) x t> vector (such as index 1) are undefined.
+static constexpr unsigned SVEBitsPerBlock = 128;
+} // end namespace AArch64
+
 } // end namespace llvm
 
 #endif
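To unpack the comment above with a concrete case (an editorial example, not from the commit): with 128-bit blocks, <vscale x 2 x i64> fills a register exactly (M*t = 128, so P = 1), whereas for <vscale x 2 x i16> M*t = 32 = 128/4, so P = 4 and element i of the value lives at index 4*i of a <vscale x 8 x i16> register, with the lanes in between undefined. This is also why performLD1GatherCombine computes NumElements as SVEBitsPerBlock divided by the element size: it gives the widest legal scalable vector for a given element type.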
New test file (+59):
@@ -0,0 +1,59 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; LD1H, LD1W, LD1D: base + 64-bit scaled offset
+;   e.g. ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
+;
+
+define <vscale x 2 x i64> @gld1h_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_index
+; CHECK:      ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
+; CHECK-NEXT: mov w8, #65535
+; CHECK-NEXT: mov z1.d, x8
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                             i16* %base,
+                                                                             <vscale x 2 x i64> %b)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1w_index
+; CHECK:      ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
+; CHECK-NEXT: mov w8, #-1
+; CHECK-NEXT: mov z1.d, x8
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                             i32* %base,
+                                                                             <vscale x 2 x i64> %b)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index
+; CHECK:      ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                             i64* %base,
+                                                                             <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_double
+; CHECK:      ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                                double* %base,
+                                                                                <vscale x 2 x i64> %b)
+  ret <vscale x 2 x double> %load
+}
+
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
