Skip to content

Commit 72532c9

Browse files
authored
[LV] Don't predicate divs with invariant divisor when folding tail (#98904)
When folding the tail, at least one of the lanes must execute unconditionally. If the divisor is loop-invariant no predication is needed, as predication would not prevent the divide-by-0 on the executed lane. Depends on #98892. PR: #98904
1 parent 90a9979 commit 72532c9

File tree

5 files changed

+95
-253
lines changed

5 files changed

+95
-253
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 36 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3339,45 +3339,54 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
33393339
}
33403340
}
33413341

3342+
// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
33423343
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3343-
if (!blockNeedsPredicationForAnyReason(I->getParent()))
3344+
// If predication is not needed, avoid it.
3345+
// TODO: We can use the loop-preheader as context point here and get
3346+
// context sensitive reasoning for isSafeToSpeculativelyExecute.
3347+
if (!blockNeedsPredicationForAnyReason(I->getParent()) ||
3348+
isSafeToSpeculativelyExecute(I) ||
3349+
(isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
3350+
isa<BranchInst, PHINode>(I))
33443351
return false;
33453352

3346-
// Can we prove this instruction is safe to unconditionally execute?
3347-
// If not, we must use some form of predication.
3353+
// If the instruction was executed conditionally in the original scalar loop,
3354+
// predication is needed with a mask whose lanes are all possibly inactive.
3355+
if (Legal->blockNeedsPredication(I->getParent()))
3356+
return true;
3357+
3358+
// All that remain are instructions with side-effects originally executed in
3359+
// the loop unconditionally, but now executed under a tail-fold mask (only)
3360+
// having at least one active lane (the first). If the side-effects of the
3361+
// instruction are invariant, executing it w/o (the tail-folding) mask is safe
3362+
// - it will cause the same side-effects as when masked.
33483363
switch(I->getOpcode()) {
33493364
default:
3350-
return false;
3365+
llvm_unreachable(
3366+
"instruction should have been considered by earlier checks");
3367+
case Instruction::Call:
3368+
// Side-effects of a Call are assumed to be non-invariant, needing a
3369+
// (fold-tail) mask.
3370+
assert(Legal->isMaskRequired(I) &&
3371+
"should have returned earlier for calls not needing a mask");
3372+
return true;
33513373
case Instruction::Load:
3374+
// If the address is loop invariant no predication is needed.
3375+
return !Legal->isInvariant(getLoadStorePointerOperand(I));
33523376
case Instruction::Store: {
3353-
if (!Legal->isMaskRequired(I))
3354-
return false;
3355-
// When we know the load's address is loop invariant and the instruction
3356-
// in the original scalar loop was unconditionally executed then we
3357-
// don't need to mark it as a predicated instruction. Tail folding may
3358-
// introduce additional predication, but we're guaranteed to always have
3359-
// at least one active lane. We call Legal->blockNeedsPredication here
3360-
// because it doesn't query tail-folding. For stores, we need to prove
3361-
// both speculation safety (which follows from the same argument as loads),
3362-
// but also must prove the value being stored is correct. The easiest
3363-
// form of the later is to require that all values stored are the same.
3364-
if (Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3365-
(isa<LoadInst>(I) ||
3366-
(isa<StoreInst>(I) &&
3367-
TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
3368-
!Legal->blockNeedsPredication(I->getParent()))
3369-
return false;
3370-
return true;
3377+
// For stores, we need to prove both speculation safety (which follows from
3378+
// the same argument as loads), but also must prove the value being stored
3379+
// is correct. The easiest form of the latter is to require that all values
3380+
// stored are the same.
3381+
return !(Legal->isInvariant(getLoadStorePointerOperand(I)) &&
3382+
TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()));
33713383
}
33723384
case Instruction::UDiv:
33733385
case Instruction::SDiv:
33743386
case Instruction::SRem:
33753387
case Instruction::URem:
3376-
// TODO: We can use the loop-preheader as context point here and get
3377-
// context sensitive reasoning
3378-
return !isSafeToSpeculativelyExecute(I);
3379-
case Instruction::Call:
3380-
return Legal->isMaskRequired(I);
3388+
// If the divisor is loop-invariant no predication is needed.
3389+
return !TheLoop->isLoopInvariant(I->getOperand(1));
33813390
}
33823391
}
33833392

llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll

Lines changed: 22 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -274,50 +274,38 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
274274
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
275275
; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
276276
; CHECK-NEXT: [[TMP20:%.*]] = mul i64 1, [[TMP19]]
277-
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
278-
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
279-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_2_I]], i64 0
277+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
280278
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
281-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_1_I]], i64 0
282-
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
283-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[X]], i64 0
279+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_2_I]], i64 0
284280
; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
285281
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
286282
; CHECK: [[VECTOR_BODY]]:
287283
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
288284
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
289285
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
290-
; CHECK-NEXT: [[TMP21:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
291-
; CHECK-NEXT: [[TMP22:%.*]] = udiv <vscale x 2 x i64> [[VEC_IND]], [[TMP21]]
292-
; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
293-
; CHECK-NEXT: [[TMP24:%.*]] = urem <vscale x 2 x i64> [[VEC_IND]], [[TMP23]]
294-
; CHECK-NEXT: [[TMP25:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
295-
; CHECK-NEXT: [[TMP26:%.*]] = udiv <vscale x 2 x i64> [[TMP24]], [[TMP25]]
296-
; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
297-
; CHECK-NEXT: [[TMP28:%.*]] = urem <vscale x 2 x i64> [[TMP24]], [[TMP27]]
298-
; CHECK-NEXT: [[TMP29:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT4]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
299-
; CHECK-NEXT: [[TMP30:%.*]] = udiv <vscale x 2 x i64> [[TMP28]], [[TMP29]]
300-
; CHECK-NEXT: [[TMP31:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> [[BROADCAST_SPLAT4]], <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
301-
; CHECK-NEXT: [[TMP32:%.*]] = urem <vscale x 2 x i64> [[TMP28]], [[TMP31]]
302-
; CHECK-NEXT: [[TMP33:%.*]] = extractelement <vscale x 2 x i64> [[TMP22]], i32 0
303-
; CHECK-NEXT: [[TMP34:%.*]] = mul i64 [[X]], [[TMP33]]
304-
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <vscale x 2 x i64> [[TMP26]], i32 0
305-
; CHECK-NEXT: [[TMP36:%.*]] = add i64 [[TMP34]], [[TMP35]]
306-
; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], [[X]]
307-
; CHECK-NEXT: [[TMP38:%.*]] = extractelement <vscale x 2 x i64> [[TMP30]], i32 0
308-
; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[TMP37]], [[TMP38]]
309-
; CHECK-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], [[X]]
310-
; CHECK-NEXT: [[TMP41:%.*]] = extractelement <vscale x 2 x i64> [[TMP32]], i32 0
311-
; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[TMP40]], [[TMP41]]
312-
; CHECK-NEXT: [[TMP43:%.*]] = shl i64 [[TMP42]], 32
313-
; CHECK-NEXT: [[TMP44:%.*]] = ashr i64 [[TMP43]], 32
314-
; CHECK-NEXT: [[TMP45:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP44]]
315-
; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i64, ptr [[TMP45]], i32 0
316-
; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP22]], ptr [[TMP46]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
286+
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 0
287+
; CHECK-NEXT: [[TMP23:%.*]] = udiv <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]]
288+
; CHECK-NEXT: [[TMP24:%.*]] = urem i64 [[TMP21]], [[MUL_2_I]]
289+
; CHECK-NEXT: [[TMP25:%.*]] = udiv i64 [[TMP24]], [[MUL_1_I]]
290+
; CHECK-NEXT: [[TMP26:%.*]] = urem i64 [[TMP24]], [[MUL_1_I]]
291+
; CHECK-NEXT: [[TMP27:%.*]] = udiv i64 [[TMP26]], [[X]]
292+
; CHECK-NEXT: [[TMP28:%.*]] = urem i64 [[TMP26]], [[X]]
293+
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <vscale x 2 x i64> [[TMP23]], i32 0
294+
; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[X]], [[TMP29]]
295+
; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], [[TMP25]]
296+
; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], [[X]]
297+
; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[TMP32]], [[TMP27]]
298+
; CHECK-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], [[X]]
299+
; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP34]], [[TMP28]]
300+
; CHECK-NEXT: [[TMP36:%.*]] = shl i64 [[TMP35]], 32
301+
; CHECK-NEXT: [[TMP37:%.*]] = ashr i64 [[TMP36]], 32
302+
; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP37]]
303+
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i64, ptr [[TMP38]], i32 0
304+
; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP23]], ptr [[TMP39]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
317305
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
318306
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
319307
; CHECK-NEXT: [[TMP47:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
320-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
308+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
321309
; CHECK-NEXT: [[TMP48:%.*]] = extractelement <vscale x 2 x i1> [[TMP47]], i32 0
322310
; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
323311
; CHECK: [[MIDDLE_BLOCK]]:

llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -380,41 +380,40 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
380380
; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
381381
; PRED: vector.ph:
382382
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
383-
; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
383+
; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
384384
; PRED-NEXT: [[TMP5:%.*]] = sub i64 [[TMP2]], 1
385385
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP5]]
386386
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
387387
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
388388
; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
389-
; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
389+
; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
390390
; PRED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
391-
; PRED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
391+
; PRED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
392392
; PRED-NEXT: [[TMP10:%.*]] = sub i64 [[TMP0]], [[TMP9]]
393393
; PRED-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], [[TMP9]]
394394
; PRED-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
395-
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[TMP0]])
396-
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[X]], i64 0
397-
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
395+
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
396+
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[X]], i64 0
397+
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
398398
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
399399
; PRED: vector.body:
400400
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
401-
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
402-
; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
401+
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
402+
; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
403403
; PRED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
404404
; PRED-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP13]]
405405
; PRED-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[TMP14]], i32 0
406-
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP15]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> poison)
407-
; PRED-NEXT: [[TMP16:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[BROADCAST_SPLAT]], <vscale x 8 x i16> shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
408-
; PRED-NEXT: [[TMP17:%.*]] = udiv <vscale x 8 x i16> [[WIDE_MASKED_LOAD]], [[TMP16]]
409-
; PRED-NEXT: [[TMP18:%.*]] = or <vscale x 8 x i16> [[TMP17]], [[VEC_PHI]]
410-
; PRED-NEXT: [[TMP19]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[TMP18]], <vscale x 8 x i16> [[VEC_PHI]]
406+
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP15]], i32 2, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> poison)
407+
; PRED-NEXT: [[TMP19:%.*]] = udiv <vscale x 4 x i16> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
408+
; PRED-NEXT: [[TMP20:%.*]] = or <vscale x 4 x i16> [[TMP19]], [[VEC_PHI]]
409+
; PRED-NEXT: [[TMP16]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> [[TMP20]], <vscale x 4 x i16> [[VEC_PHI]]
411410
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
412-
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP12]])
413-
; PRED-NEXT: [[TMP20:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
414-
; PRED-NEXT: [[TMP21:%.*]] = extractelement <vscale x 8 x i1> [[TMP20]], i32 0
415-
; PRED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
411+
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]])
412+
; PRED-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
413+
; PRED-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
414+
; PRED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
416415
; PRED: middle.block:
417-
; PRED-NEXT: [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.nxv8i16(<vscale x 8 x i16> [[TMP19]])
416+
; PRED-NEXT: [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> [[TMP16]])
418417
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
419418
; PRED: scalar.ph:
420419
; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]

0 commit comments

Comments
 (0)