Skip to content

Commit 08decd2

Browse files
authored
[WebAssembly] load_zero to initialise build_vector (#100610)
Instead of splatting a single lane to initialise a build_vector, lower to scalar_to_vector, which can be selected to load_zero. Also add load_zero and load_lane patterns for f32x4 and f64x2.
1 parent 3c3851f commit 08decd2

File tree

4 files changed

+111
-30
lines changed

4 files changed

+111
-30
lines changed

llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
177177
WASM_LOAD_STORE(ATOMIC_RMW8_U_CMPXCHG_I32)
178178
WASM_LOAD_STORE(ATOMIC_RMW8_U_CMPXCHG_I64)
179179
WASM_LOAD_STORE(LOAD8_SPLAT)
180-
WASM_LOAD_STORE(LOAD_LANE_I8x16)
180+
WASM_LOAD_STORE(LOAD_LANE_8)
181181
WASM_LOAD_STORE(STORE_LANE_I8x16)
182182
return 0;
183183
WASM_LOAD_STORE(LOAD16_S_I32)
@@ -205,7 +205,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
205205
WASM_LOAD_STORE(ATOMIC_RMW16_U_CMPXCHG_I32)
206206
WASM_LOAD_STORE(ATOMIC_RMW16_U_CMPXCHG_I64)
207207
WASM_LOAD_STORE(LOAD16_SPLAT)
208-
WASM_LOAD_STORE(LOAD_LANE_I16x8)
208+
WASM_LOAD_STORE(LOAD_LANE_16)
209209
WASM_LOAD_STORE(STORE_LANE_I16x8)
210210
WASM_LOAD_STORE(LOAD_F16_F32)
211211
WASM_LOAD_STORE(STORE_F16_F32)
@@ -238,8 +238,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
238238
WASM_LOAD_STORE(MEMORY_ATOMIC_NOTIFY)
239239
WASM_LOAD_STORE(MEMORY_ATOMIC_WAIT32)
240240
WASM_LOAD_STORE(LOAD32_SPLAT)
241-
WASM_LOAD_STORE(LOAD_ZERO_I32x4)
242-
WASM_LOAD_STORE(LOAD_LANE_I32x4)
241+
WASM_LOAD_STORE(LOAD_ZERO_32)
242+
WASM_LOAD_STORE(LOAD_LANE_32)
243243
WASM_LOAD_STORE(STORE_LANE_I32x4)
244244
return 2;
245245
WASM_LOAD_STORE(LOAD_I64)
@@ -263,8 +263,8 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
263263
WASM_LOAD_STORE(LOAD_EXTEND_U_I32x4)
264264
WASM_LOAD_STORE(LOAD_EXTEND_S_I64x2)
265265
WASM_LOAD_STORE(LOAD_EXTEND_U_I64x2)
266-
WASM_LOAD_STORE(LOAD_ZERO_I64x2)
267-
WASM_LOAD_STORE(LOAD_LANE_I64x2)
266+
WASM_LOAD_STORE(LOAD_ZERO_64)
267+
WASM_LOAD_STORE(LOAD_LANE_64)
268268
WASM_LOAD_STORE(STORE_LANE_I64x2)
269269
return 3;
270270
WASM_LOAD_STORE(LOAD_V128)

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2275,8 +2275,15 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
22752275
return IsConstant(Lane);
22762276
};
22772277
} else {
2278-
// Use a splat (which might be selected as a load splat)
2279-
Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
2278+
size_t DestLaneSize = VecT.getVectorElementType().getFixedSizeInBits();
2279+
if (NumSplatLanes == 1 && (DestLaneSize == 32 || DestLaneSize == 64)) {
2280+
// Could be selected to load_zero.
2281+
assert(SplatValue == Op->getOperand(0));
2282+
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecT, SplatValue);
2283+
} else {
2284+
// Use a splat (which might be selected as a load splat)
2285+
Result = DAG.getSplatBuildVector(VecT, DL, SplatValue);
2286+
}
22802287
IsLaneConstructed = [&SplatValue](size_t _, const SDValue &Lane) {
22812288
return Lane == SplatValue;
22822289
};

llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td

Lines changed: 27 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -273,13 +273,13 @@ defm : LoadPat<vec.vt, loadpat, inst>;
273273
multiclass SIMDLoadZero<Vec vec, bits<32> simdop> {
274274
defvar name = "v128.load"#vec.lane_bits#"_zero";
275275
let mayLoad = 1, UseNamedOperandTable = 1 in {
276-
defm LOAD_ZERO_#vec#_A32 :
276+
defm LOAD_ZERO_#vec.lane_bits#_A32 :
277277
SIMD_I<(outs V128:$dst),
278278
(ins P2Align:$p2align, offset32_op:$off, I32:$addr),
279279
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
280280
name#"\t$dst, ${off}(${addr})$p2align",
281281
name#"\t$off$p2align", simdop>;
282-
defm LOAD_ZERO_#vec#_A64 :
282+
defm LOAD_ZERO_#vec.lane_bits#_A64 :
283283
SIMD_I<(outs V128:$dst),
284284
(ins P2Align:$p2align, offset64_op:$off, I64:$addr),
285285
(outs), (ins P2Align:$p2align, offset64_op:$off), [],
@@ -293,32 +293,32 @@ defm "" : SIMDLoadZero<I64x2, 0x5d>;
293293

294294
// Use load_zero to load scalars into vectors as well where possible.
295295
// TODO: i16, and i8 scalars
296-
foreach vec = [I32x4, I64x2] in {
297-
defvar inst = "LOAD_ZERO_"#vec;
296+
foreach vec = [I32x4, I64x2, F32x4, F64x2] in {
297+
defvar inst = "LOAD_ZERO_"#vec.lane_bits;
298298
defvar pat = PatFrag<(ops node:$addr), (scalar_to_vector (vec.lane_vt (load $addr)))>;
299299
defm : LoadPat<vec.vt, pat, inst>;
300300
}
301301

302302
// TODO: f32x4 and f64x2 as well
303303
foreach vec = [I32x4, I64x2] in {
304-
defvar inst = "LOAD_ZERO_"#vec;
304+
defvar inst = "LOAD_ZERO_"#vec.lane_bits;
305305
defvar pat = PatFrag<(ops node:$ptr),
306306
(vector_insert (vec.splat (vec.lane_vt 0)), (vec.lane_vt (load $ptr)), 0)>;
307307
defm : LoadPat<vec.vt, pat, inst>;
308308
}
309309

310310
// Load lane
311-
multiclass SIMDLoadLane<Vec vec, bits<32> simdop> {
312-
defvar name = "v128.load"#vec.lane_bits#"_lane";
311+
multiclass SIMDLoadLane<bits<32> lane_bits, bits<32> simdop> {
312+
defvar name = "v128.load"#lane_bits#"_lane";
313313
let mayLoad = 1, UseNamedOperandTable = 1 in {
314-
defm LOAD_LANE_#vec#_A32 :
314+
defm LOAD_LANE_#lane_bits#_A32 :
315315
SIMD_I<(outs V128:$dst),
316316
(ins P2Align:$p2align, offset32_op:$off, vec_i8imm_op:$idx,
317317
I32:$addr, V128:$vec),
318318
(outs), (ins P2Align:$p2align, offset32_op:$off, vec_i8imm_op:$idx),
319319
[], name#"\t$dst, ${off}(${addr})$p2align, $vec, $idx",
320320
name#"\t$off$p2align, $idx", simdop>;
321-
defm LOAD_LANE_#vec#_A64 :
321+
defm LOAD_LANE_#lane_bits#_A64 :
322322
SIMD_I<(outs V128:$dst),
323323
(ins P2Align:$p2align, offset64_op:$off, vec_i8imm_op:$idx,
324324
I64:$addr, V128:$vec),
@@ -328,15 +328,15 @@ multiclass SIMDLoadLane<Vec vec, bits<32> simdop> {
328328
} // mayLoad = 1, UseNamedOperandTable = 1
329329
}
330330

331-
defm "" : SIMDLoadLane<I8x16, 0x54>;
332-
defm "" : SIMDLoadLane<I16x8, 0x55>;
333-
defm "" : SIMDLoadLane<I32x4, 0x56>;
334-
defm "" : SIMDLoadLane<I64x2, 0x57>;
331+
defm "" : SIMDLoadLane<8, 0x54>;
332+
defm "" : SIMDLoadLane<16, 0x55>;
333+
defm "" : SIMDLoadLane<32, 0x56>;
334+
defm "" : SIMDLoadLane<64, 0x57>;
335335

336336
// Select loads with no constant offset.
337337
multiclass LoadLanePatNoOffset<Vec vec, SDPatternOperator kind> {
338-
defvar load_lane_a32 = !cast<NI>("LOAD_LANE_"#vec#"_A32");
339-
defvar load_lane_a64 = !cast<NI>("LOAD_LANE_"#vec#"_A64");
338+
defvar load_lane_a32 = !cast<NI>("LOAD_LANE_"#vec.lane_bits#"_A32");
339+
defvar load_lane_a64 = !cast<NI>("LOAD_LANE_"#vec.lane_bits#"_A64");
340340
def : Pat<(vec.vt (kind (i32 I32:$addr),
341341
(vec.vt V128:$vec), (i32 vec.lane_idx:$idx))),
342342
(load_lane_a32 0, 0, imm:$idx, $addr, $vec)>,
@@ -354,17 +354,22 @@ def load16_lane :
354354
PatFrag<(ops node:$ptr, node:$vec, node:$idx),
355355
(vector_insert $vec, (i32 (extloadi16 $ptr)), $idx)>;
356356
def load32_lane :
357-
PatFrag<(ops node:$ptr, node:$vec, node:$idx),
358-
(vector_insert $vec, (i32 (load $ptr)), $idx)>;
357+
PatFrags<(ops node:$ptr, node:$vec, node:$idx), [
358+
(vector_insert $vec, (i32 (load $ptr)), $idx),
359+
(vector_insert $vec, (f32 (load $ptr)), $idx)
360+
]>;
359361
def load64_lane :
360-
PatFrag<(ops node:$ptr, node:$vec, node:$idx),
361-
(vector_insert $vec, (i64 (load $ptr)), $idx)>;
362-
// TODO: floating point lanes as well
362+
PatFrags<(ops node:$ptr, node:$vec, node:$idx), [
363+
(vector_insert $vec, (i64 (load $ptr)), $idx),
364+
(vector_insert $vec, (f64 (load $ptr)), $idx)
365+
]>;
363366

364367
defm : LoadLanePatNoOffset<I8x16, load8_lane>;
365368
defm : LoadLanePatNoOffset<I16x8, load16_lane>;
366369
defm : LoadLanePatNoOffset<I32x4, load32_lane>;
367370
defm : LoadLanePatNoOffset<I64x2, load64_lane>;
371+
defm : LoadLanePatNoOffset<F32x4, load32_lane>;
372+
defm : LoadLanePatNoOffset<F64x2, load64_lane>;
368373

369374
// TODO: Also support the other load patterns for load_lane once the instructions
370375
// are merged to the proposal.
@@ -1463,10 +1468,10 @@ def extloadv2f32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
14631468
// Adapted from the body of LoadPatNoOffset
14641469
// TODO: other addressing patterns
14651470
def : Pat<(v2f64 (extloadv2f32 (i32 I32:$addr))),
1466-
(promote_low_F64x2 (LOAD_ZERO_I64x2_A32 0, 0, I32:$addr))>,
1471+
(promote_low_F64x2 (LOAD_ZERO_64_A32 0, 0, I32:$addr))>,
14671472
Requires<[HasAddr32]>;
14681473
def : Pat<(v2f64 (extloadv2f32 (i64 I64:$addr))),
1469-
(promote_low_F64x2 (LOAD_ZERO_I64x2_A64 0, 0, I64:$addr))>,
1474+
(promote_low_F64x2 (LOAD_ZERO_64_A64 0, 0, I64:$addr))>,
14701475
Requires<[HasAddr64]>;
14711476

14721477
//===----------------------------------------------------------------------===//

llvm/test/CodeGen/WebAssembly/simd-build-vector.ll

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,3 +440,72 @@ define <2 x double> @all_undef_f64x2() {
440440
; CHECK-NEXT: return $0
441441
ret <2 x double> undef
442442
}
443+
444+
define <4 x i32> @load_zero_lane_i32x4(ptr %addr.a, ptr %addr.b, ptr %addr.c, ptr %addr.d) {
445+
; CHECK-LABEL: load_zero_lane_i32x4:
446+
; CHECK: .functype load_zero_lane_i32x4 (i32, i32, i32, i32) -> (v128)
447+
; CHECK-NEXT: # %bb.0:
448+
; CHECK-NEXT: v128.load32_zero $push0=, 0($0)
449+
; CHECK-NEXT: v128.load32_lane $push1=, 0($1), $pop0, 1
450+
; CHECK-NEXT: v128.load32_lane $push2=, 0($2), $pop1, 2
451+
; CHECK-NEXT: v128.load32_lane $push3=, 0($3), $pop2, 3
452+
; CHECK-NEXT: return $pop3
453+
%a = load i32, ptr %addr.a
454+
%b = load i32, ptr %addr.b
455+
%c = load i32, ptr %addr.c
456+
%d = load i32, ptr %addr.d
457+
%v = insertelement <4 x i32> undef, i32 %a, i32 0
458+
%v.1 = insertelement <4 x i32> %v, i32 %b, i32 1
459+
%v.2 = insertelement <4 x i32> %v.1, i32 %c, i32 2
460+
%v.3 = insertelement <4 x i32> %v.2, i32 %d, i32 3
461+
ret <4 x i32> %v.3
462+
}
463+
464+
define <2 x i64> @load_zero_lane_i64x2(ptr %addr.a, ptr %addr.b) {
465+
; CHECK-LABEL: load_zero_lane_i64x2:
466+
; CHECK: .functype load_zero_lane_i64x2 (i32, i32) -> (v128)
467+
; CHECK-NEXT: # %bb.0:
468+
; CHECK-NEXT: v128.load64_zero $push0=, 0($0)
469+
; CHECK-NEXT: v128.load64_lane $push1=, 0($1), $pop0, 1
470+
; CHECK-NEXT: return $pop1
471+
%a = load i64, ptr %addr.a
472+
%b = load i64, ptr %addr.b
473+
%v = insertelement <2 x i64> undef, i64 %a, i32 0
474+
%v.1 = insertelement <2 x i64> %v, i64 %b, i32 1
475+
ret <2 x i64> %v.1
476+
}
477+
478+
define <4 x float> @load_zero_lane_f32x4(ptr %addr.a, ptr %addr.b, ptr %addr.c, ptr %addr.d) {
479+
; CHECK-LABEL: load_zero_lane_f32x4:
480+
; CHECK: .functype load_zero_lane_f32x4 (i32, i32, i32, i32) -> (v128)
481+
; CHECK-NEXT: # %bb.0:
482+
; CHECK-NEXT: v128.load32_zero $push0=, 0($0)
483+
; CHECK-NEXT: v128.load32_lane $push1=, 0($1), $pop0, 1
484+
; CHECK-NEXT: v128.load32_lane $push2=, 0($2), $pop1, 2
485+
; CHECK-NEXT: v128.load32_lane $push3=, 0($3), $pop2, 3
486+
; CHECK-NEXT: return $pop3
487+
%a = load float, ptr %addr.a
488+
%b = load float, ptr %addr.b
489+
%c = load float, ptr %addr.c
490+
%d = load float, ptr %addr.d
491+
%v = insertelement <4 x float> undef, float %a, i32 0
492+
%v.1 = insertelement <4 x float> %v, float %b, i32 1
493+
%v.2 = insertelement <4 x float> %v.1, float %c, i32 2
494+
%v.3 = insertelement <4 x float> %v.2, float %d, i32 3
495+
ret <4 x float> %v.3
496+
}
497+
498+
define <2 x double> @load_zero_lane_f64x2(ptr %addr.a, ptr %addr.b) {
499+
; CHECK-LABEL: load_zero_lane_f64x2:
500+
; CHECK: .functype load_zero_lane_f64x2 (i32, i32) -> (v128)
501+
; CHECK-NEXT: # %bb.0:
502+
; CHECK-NEXT: v128.load64_zero $push0=, 0($0)
503+
; CHECK-NEXT: v128.load64_lane $push1=, 0($1), $pop0, 1
504+
; CHECK-NEXT: return $pop1
505+
%a = load double, ptr %addr.a
506+
%b = load double, ptr %addr.b
507+
%v = insertelement <2 x double> undef, double %a, i32 0
508+
%v.1 = insertelement <2 x double> %v, double %b, i32 1
509+
ret <2 x double> %v.1
510+
}
511+

0 commit comments

Comments
 (0)