[LV] Support binary and unary operations with EVL-vectorization #93854
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: Kolya Panchenko (nikolaypanchenko)

Changes: The patch adds `VPWidenEVLRecipe`, which represents `VPWidenRecipe` + EVL argument. The new recipe replaces `VPWidenRecipe` in `tryAddExplicitVectorLength` for each binary and unary operation.

Patch is 138.14 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93854.diff

8 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e75a1de548f7d..f1381b99ed68f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -855,6 +855,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenCastSC:
case VPRecipeBase::VPWidenGEPSC:
case VPRecipeBase::VPWidenSC:
+ case VPRecipeBase::VPWidenEVLSC:
case VPRecipeBase::VPWidenSelectSC:
case VPRecipeBase::VPBlendSC:
case VPRecipeBase::VPPredInstPHISC:
@@ -1039,6 +1040,7 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPInstructionSC ||
R->getVPDefID() == VPRecipeBase::VPWidenSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenEVLSC ||
R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
@@ -1333,13 +1335,18 @@ class VPInstruction : public VPRecipeWithIRFlags {
/// ingredient. This recipe covers most of the traditional vectorization cases
/// where each ingredient transforms into a vectorized version of itself.
class VPWidenRecipe : public VPRecipeWithIRFlags {
+protected:
unsigned Opcode;
+ template <typename IterT>
+ VPWidenRecipe(unsigned VPDefOpcode, Instruction &I,
+ iterator_range<IterT> Operands)
+ : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), Opcode(I.getOpcode()) {}
+
public:
template <typename IterT>
VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
- : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I),
- Opcode(I.getOpcode()) {}
+ : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {}
~VPWidenRecipe() override = default;
@@ -1363,6 +1370,49 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
#endif
};
+class VPWidenEVLRecipe : public VPWidenRecipe {
+private:
+ using VPRecipeWithIRFlags::transferFlags;
+
+public:
+ template <typename IterT>
+ VPWidenEVLRecipe(Instruction &I, iterator_range<IterT> Operands, VPValue &EVL)
+ : VPWidenRecipe(VPDef::VPWidenEVLSC, I, Operands) {
+ addOperand(&EVL);
+ }
+
+ ~VPWidenEVLRecipe() override = default;
+
+ VPWidenRecipe *clone() override final {
+ SmallVector<VPValue *> Ops(operands());
+ VPValue *EVL = Ops.pop_back_val();
+ auto *R = new VPWidenEVLRecipe(*getUnderlyingInstr(),
+ make_range(Ops.begin(), Ops.end()), *EVL);
+ R->transferFlags(*this);
+ return R;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenEVLSC);
+
+ VPValue *getEVL() { return getOperand(getNumOperands() - 1); }
+ const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); }
+
+ /// A helper function to create widen EVL recipe from regular widen recipe.
+ static VPWidenEVLRecipe *create(VPWidenRecipe *W, VPValue &EVL);
+
+ /// Produce widened copies of all Ingredients.
+ void execute(VPTransformState &State) override final;
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override final;
+#endif
+};
+
/// VPWidenCastRecipe is a recipe to create vector cast instructions.
class VPWidenCastRecipe : public VPRecipeWithIRFlags {
/// Cast instruction opcode.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 5eb99ffd1e10e..a6fd26b666501 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/IR/VectorBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -71,6 +72,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenLoadSC:
case VPWidenPHISC:
case VPWidenSC:
+ case VPWidenEVLSC:
case VPWidenSelectSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
@@ -110,6 +112,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
case VPWidenSC:
+ case VPWidenEVLSC:
case VPWidenSelectSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
@@ -157,6 +160,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPWidenPHISC:
case VPWidenPointerInductionSC:
case VPWidenSC:
+ case VPWidenEVLSC:
case VPWidenSelectSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
@@ -993,6 +997,64 @@ void VPWidenRecipe::execute(VPTransformState &State) {
#endif
}
+VPWidenEVLRecipe *VPWidenEVLRecipe::create(VPWidenRecipe *W, VPValue &EVL) {
+ auto *R = new VPWidenEVLRecipe(*W->getUnderlyingInstr(), W->operands(), EVL);
+ R->transferFlags(*W);
+ return R;
+}
+
+void VPWidenEVLRecipe::execute(VPTransformState &State) {
+ assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+ "explicit vector length.");
+ VPValue *Op0 = getOperand(0);
+
+ // If it's scalar operation, hand translation over to VPWidenRecipe
+ if (!State.get(Op0, 0)->getType()->isVectorTy())
+ return VPWidenRecipe::execute(State);
+
+ VPValue *EVL = getEVL();
+ Value *EVLArg = State.get(EVL, 0, /*NeedsScalar=*/true);
+ unsigned Opcode = getOpcode();
+ Instruction *I = getUnderlyingInstr();
+ IRBuilderBase &BuilderIR = State.Builder;
+ VectorBuilder Builder(BuilderIR);
+ Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
+ Value *VPInst = nullptr;
+
+ //===------------------- Binary and Unary Ops ---------------------===//
+ if (Instruction::isBinaryOp(Opcode) || Instruction::isUnaryOp(Opcode)) {
+ // Just widen unops and binops.
+
+ SmallVector<Value *, 4> Ops;
+ for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
+ VPValue *VPOp = getOperand(I);
+ Ops.push_back(State.get(VPOp, 0));
+ }
+
+ Builder.setMask(Mask).setEVL(EVLArg);
+ VPInst = Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops,
+ "vp.op");
+
+ if (I)
+ if (auto *VecOp = dyn_cast<Instruction>(VPInst))
+ VecOp->copyIRFlags(I);
+ } else {
+ llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");
+ }
+ State.set(this, VPInst, 0);
+ State.addMetadata(VPInst, I);
+}
+
+bool VPWidenEVLRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
+ assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
+ // EVL in that recipe is always the last operand, thus any use before means
+ // the VPValue should be vectorized.
+ for (unsigned I = 0, E = getNumOperands() - 1; I != E; ++I)
+ if (getOperand(I) == Op)
+ return false;
+ return true;
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
@@ -1002,6 +1064,15 @@ void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
printFlags(O);
printOperands(O, SlotTracker);
}
+
+void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN vp ";
+ printAsOperand(O, SlotTracker);
+ O << " = " << Instruction::getOpcodeName(Opcode);
+ printFlags(O);
+ printOperands(O, SlotTracker);
+}
#endif
void VPWidenCastRecipe::execute(VPTransformState &State) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 422579ea8b84f..39ebd44909ea6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
@@ -1219,7 +1220,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
/// WideCanonicalIV, backedge-taken-count) pattern.
/// TODO: Introduce explicit recipe for header-mask instead of searching
/// for the header-mask pattern manually.
-static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
+static DenseSet<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
SmallVector<VPValue *> WideCanonicalIVs;
auto *FoundWidenCanonicalIVUser =
find_if(Plan.getCanonicalIV()->users(),
@@ -1245,7 +1246,7 @@ static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
// Walk users of wide canonical IVs and collect to all compares of the form
// (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
- SmallVector<VPValue *> HeaderMasks;
+ DenseSet<VPValue *> HeaderMasks;
VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
for (auto *Wide : WideCanonicalIVs) {
for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
@@ -1257,7 +1258,7 @@ static SmallVector<VPValue *> collectAllHeaderMasks(VPlan &Plan) {
assert(HeaderMask->getOperand(0) == Wide &&
"WidenCanonicalIV must be the first operand of the compare");
- HeaderMasks.push_back(HeaderMask);
+ HeaderMasks.insert(HeaderMask);
}
}
return HeaderMasks;
@@ -1296,6 +1297,55 @@ void VPlanTransforms::addActiveLaneMask(
HeaderMask->replaceAllUsesWith(LaneMask);
}
+/// Replace recipes with their EVL variants.
+static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
+ DenseSet<VPRecipeBase *> ToRemove;
+
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+ Plan.getEntry());
+ DenseSet<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
+ for (VPBasicBlock *VPBB : reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
+ // The recipes in the block are processed in reverse order, to catch chains
+ // of dead recipes.
+ for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
+ TypeSwitch<VPRecipeBase *>(&R)
+ .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
+ VPValue *NewMask =
+ HeaderMasks.contains(L->getMask()) ? nullptr : L->getMask();
+ auto *N = new VPWidenLoadEVLRecipe(L, &EVL, NewMask);
+ N->insertBefore(L);
+ L->replaceAllUsesWith(N);
+ ToRemove.insert(L);
+ })
+ .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
+ VPValue *NewMask =
+ HeaderMasks.contains(S->getMask()) ? nullptr : S->getMask();
+ auto *N = new VPWidenStoreEVLRecipe(S, &EVL, NewMask);
+ N->insertBefore(S);
+ ToRemove.insert(S);
+ })
+ .Case<VPWidenRecipe>([&](VPWidenRecipe *W) {
+ unsigned Opcode = W->getOpcode();
+ if (!Instruction::isBinaryOp(Opcode) &&
+ !Instruction::isUnaryOp(Opcode))
+ return;
+ auto *N = VPWidenEVLRecipe::create(W, EVL);
+ N->insertBefore(W);
+ W->replaceAllUsesWith(N);
+ ToRemove.insert(W);
+ });
+ }
+ }
+
+ for (VPRecipeBase *R : ToRemove)
+ R->eraseFromParent();
+
+ for (VPValue *HeaderMask : HeaderMasks)
+ recursivelyDeleteDeadRecipes(HeaderMask);
+}
+
+
+
/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
/// replaces all uses except the canonical IV increment of
/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
@@ -1356,29 +1406,8 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
NextEVLIV->insertBefore(CanonicalIVIncrement);
EVLPhi->addOperand(NextEVLIV);
- for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
- for (VPUser *U : collectUsersRecursively(HeaderMask)) {
- auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U);
- if (!MemR)
- continue;
- VPValue *OrigMask = MemR->getMask();
- assert(OrigMask && "Unmasked widen memory recipe when folding tail");
- VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask;
- if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR)) {
- auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
- N->insertBefore(L);
- L->replaceAllUsesWith(N);
- L->eraseFromParent();
- } else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR)) {
- auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
- N->insertBefore(S);
- S->eraseFromParent();
- } else {
- llvm_unreachable("unsupported recipe");
- }
- }
- recursivelyDeleteDeadRecipes(HeaderMask);
- }
+ transformRecipestoEVLRecipes(Plan, *VPEVL);
+
// Replace all uses of VPCanonicalIVPHIRecipe by
// VPEVLBasedIVPHIRecipe except for the canonical IV increment.
CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 8d945f6f2b8ea..a90d42289e5f1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -356,6 +356,7 @@ class VPDef {
VPWidenStoreEVLSC,
VPWidenStoreSC,
VPWidenSC,
+ VPWidenEVLSC,
VPWidenSelectSC,
VPBlendSC,
// START: Phi-like recipes. Need to be kept together.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll
new file mode 100644
index 0000000000000..77689fc8a76ee
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll
@@ -0,0 +1,1799 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s --check-prefix=NO-VP
+
+
+define void @test_and(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_and(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.and.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = and i8 [[TMP20]], 1
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_and(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = and i8 [[TMP0]], 1
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed fa0d938 to a42f974
CC: @ppenzin
for (unsigned I = 0, E = getNumOperands() - 1; I != E; ++I)
  if (getOperand(I) == Op)
    return false;
return true;
Can this be simplified to `return getEVL() == Op;`?
It's technically possible to use `EVL` as the first or second operand in the operation itself:

%evl = ...
%0 = add %evl, %0

i.e. that check makes sure `onlyFirstLaneUsed` returns false if `Op == EVL` for the example above.
Yes, looks like this corresponds to fhahn's comment about `if (!State.get(Op0, 0)->getType()->isVectorTy())`, regarding whether operands can be scalar.
If EVL would be the first/second operand, it would implicitly be splatted. In any case, no such plans can be constructed at the moment I think. Better to verify EVL is only used in supported places?
I will add verification
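To make the intent concrete, here is a minimal sketch of what such a verification could look like. This is a hypothetical helper under assumed VPlan APIs, not the code added by this PR; the actual `verifyEVLRecipe` declaration is quoted further down in the review.

```cpp
// Hypothetical sketch: every user of the EVL value is expected to be either
// an EVL-aware recipe consuming EVL as its last operand, or the
// VPInstruction::Add feeding the EVL-based IV increment.
static bool verifyEVLUsers(VPValue &EVL) {
  return all_of(EVL.users(), [&EVL](VPUser *U) {
    if (auto *VPI = dyn_cast<VPInstruction>(U))
      return VPI->getOpcode() == Instruction::Add;
    unsigned NumOps = U->getNumOperands();
    return NumOps > 0 && U->getOperand(NumOps - 1) == &EVL;
  });
}
```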
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
    Plan.getEntry());
Consider utilizing `Plan.getVectorLoopRegion()->getEntryBasicBlock()` to replace recipes that are within the loop.
That's a good point. We really need to traverse the entire VPlan, but replace instructions in the postexit with InitVL, like for a reduction. That said, I need to add a special check that EVL is reachable before replacing.
You're right; it reminds me that we must explicitly express InitVL in VPlan once we support reduction. This is because EVL from the last two iterations cannot be used in the post-exit for correctness. For this PR, can we temporarily skip the preheader and post-exit?
Just to double check, this isn't needed for correctness, right?
That's correct.
> Just to double check, this isn't needed for correctness, right?
> That's correct.

Great, could you add more detail about the motivation of the change to the PR description, i.e. why it is preferable to use the EVL version instead of regular wide ops?
Force-pushed b47b6a3 to 5311f92
Force-pushed 5311f92 to 7ad2abe
// Verify that \p EVL is used correctly. The user must be either in EVL-based
// recipes as a last operand or VPInstruction::Add which is incoming value
// into EVL's recipe.
bool verifyEVLRecipe(const VPInstruction &EVL) const;
Could this be split off, and submitted first?
VPlanTransforms::truncateToMinimalBitwidths(
    *Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
VPlanTransforms::optimize(*Plan, *PSE.getSE());
// TODO: try to put it close to addActiveLaneMask().
// Discard the plan if it is not EVL-compatible
if (CM.foldTailWithEVL() &&
This is independent of the unary/binary support, right? Can be split off + a test that forces VF = 1 and IC > 1?
It does relate to that change. Without it we create EVL recipes for a scalar VPlan. For load/store that is not a problem, but it is a problem for `WidenEVL` since it cannot handle VF=1.
Hm, does it make sense to create VPlans with EVL in general? IIUC it doesn't and this could be done independent of the patch.
widen loads and stores shouldn't be created in a scalar plan, I guess at the moment this just works as there are no recipes that can be in scalar plans and get converted to an EVL version without this patch.
> Hm, does it make sense to create VPlans with EVL in general? IIUC it doesn't and this could be done independent of the patch.

Not sure I follow.

> widen loads and stores shouldn't be created in a scalar plan, I guess at the moment this just works as there are no recipes that can be in scalar plans and get converted to an EVL version without this patch.

I double checked the code and it looks like I got confused with `VPInstruction`. So I agree `VPWidenRecipe` won't be a problem. Will move it to a separate patch.
Sounds great!
Force-pushed 52b9548 to e0c7e7c
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
  VPDominatorTree VPDT;
  VPDT.recalculate(Plan);
  DenseSet<VPRecipeBase *> ToRemove;
SmallVector should be sufficient?
for (VPBasicBlock *VPBB :
     reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
  for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
    if (!properlyDominates(EVL.getDefiningRecipe(), &R, VPDT))
Can this be an assert? We just earlier inserted EVL at a place that should dominate all recipes handled below?
It's related to the comment above: there are some cases where the preheader or postexit may technically contain EVL-based recipes; they do require EVL, but it should be constructed before them.
will remove it for now as it does assert work for VF=1.
    Plan.getEntry());
DenseSet<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
for (VPBasicBlock *VPBB :
     reverse(VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))) {
Is an RPOT & reverse needed?
Yes, otherwise recipes won't be removed correctly. BTW, the comment I removed was copied, along with the loop, from `removeDeadRecipes`.
I am not sure; I think the reason removeDeadRecipes does it in that order is to remove later dead recipes before their users, but the current patch removes all replaced recipes, then removes the masks. The only operand that isn't used any longer when removing the original recipes is the mask, so I am not sure in what cases the order matters?
agree. removed
Force-pushed e0c7e7c to dd1f74f
Force-pushed dd1f74f to 5098471
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
    Plan.getEntry());
DenseSet<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
RPOT still needed?
@@ -31,7 +31,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
; IF-EVL-NEXT: WIDEN-VP ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
Should this also match the EVL operand?
agree. Fixed.
Force-pushed 5098471 to 7312b20
@fhahn ping
public:
  template <typename IterT>
  VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
      : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I),
        Opcode(I.getOpcode()) {}
      : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {}
This should probably have a custom `classof` so `isa<VPWidenRecipe>` returns true for both VPWidenRecipe and VPWidenEVLRecipe?
Why? Was the goal of having dedicated EVL-recipes to prevent treating them as non-EVL? Should `VPWidenLoad` also return true for `VPWidenLoadEVL`?
The main benefit of having a shared base-class is that analyses don't have to handle all recipes when it makes sense. I think analyses that apply to VPWidenRecipe should also conservatively apply to VPWidenEVLRecipe, as the latter only possibly operates on fewer values. If that's not sound, we probably shouldn't inherit from `VPWidenRecipe` without also implementing the corresponding `isa` relationship.

VPWidenLoad/VPWidenLoadEVL only share `VPWidenMemoryRecipe` as common base-class, for which all VPWidenLoad|Store? return true.
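For reference, the memory-recipe pattern mentioned above roughly corresponds to a `classof` that accepts all four load/store recipe IDs. The snippet below is a sketch reconstructed from the recipe IDs visible in this patch, not a verbatim copy of the upstream code:

```cpp
// Sketch: isa<VPWidenMemoryRecipe> holds for every widened load/store,
// whether or not it carries an EVL operand.
static inline bool classof(const VPRecipeBase *R) {
  return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
         R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
         R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
         R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
}
```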
I assume then you really meant to introduce a base class for `VPWidenRecipe` and `VPWidenEVLRecipe` that will return true for both of them, right? In this case the class hierarchy will be similar to `VPWiden[Load|Store][|EVL]`. If so, the same should go for all future EVL-recipes:

        VPSomeRecipeBase
         /            \
VPSomeRecipe    VPSomeEVLRecipe

VPSomeRecipeBase::classof(...) { return Opcode == VPSomeRecipeSC || Opcode == VPSomeEVLRecipeSC; }
Ideally VPWidenRecipe could serve as such a base class, as mentioned above, unless there's a compelling reason not to. That way we automatically benefit from all folds and analyses already implemented for `VPWidenRecipe`.
Not sure I understand the added hierarchy then. If the goal is to allow reuse of existing code, then what is different about `VPWidenStoreEVL` or `VPWidenLoadEVL`, as they won't be treated as `VPWidenStore` or `VPWidenLoad` respectively:

isa<VPWidenMemoryRecipe>(EVLStore)   => true
isa<VPWidenStoreEVLRecipe>(EVLStore) => true
isa<VPWidenStoreRecipe>(EVLStore)    => false

Anyway, I'm ok to extend `classof`.
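A minimal sketch of what extending `classof` could look like for the widen recipes discussed here, mirroring the recipe-ID checks elsewhere in this patch; the exact form that eventually landed may differ:

```cpp
// Sketch: make isa<VPWidenRecipe> also accept the EVL variant, so analyses
// and folds written against VPWidenRecipe conservatively cover
// VPWidenEVLRecipe as well.
static inline bool classof(const VPRecipeBase *R) {
  return R->getVPDefID() == VPRecipeBase::VPWidenSC ||
         R->getVPDefID() == VPRecipeBase::VPWidenEVLSC;
}
```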
Thanks for the update. Adjusting `VPWidenStoreEVLRecipe`/`VPWidenLoadEVLRecipe` would probably make sense separately, although the reasoning may be a bit more complicated.
@@ -2558,14 +2605,19 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
/// using the address to load from, the explicit vector length and an optional
/// mask.
struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
  VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
  VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Mask, VPValue &EVL)
are those changes intentional? If so, would be good to split off, but as the mask is the optional last operand, the current order seems to be intentional (Mask could have nullptr as default arg)
revert
Force-pushed 80c43db to 661356c
Thanks for the latest updates, a few final comments inline
@@ -16,46 +16,46 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL: vector.ph:
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
Are the changes here caused by the patch? After a quick spot check, it seems only related to CHECK variable renames? If so would be good to remove them from the patch
Most likely it's an artifact from some of my previous commits.
Force-pushed c8afdb5 to 6af8c2e
bool onlyFirstLaneUsed(const VPValue *Op) const override {
  assert(is_contained(operands(), Op) &&
         "Op must be an operand of the recipe");
  // EVL in that recipe is always the last operand, thus any use before means
Would be good to post a follow-up patch to enforce EVL only used as last operand of various recipes in the verifier
Sure. I will create a PR with the verification I added and removed previously.
Great thanks!
Force-pushed 71c9936 to da6c1f2
LGTM, thanks! A few small additional suggestions inline
The patch adds `VPWidenEVLRecipe` which represents `VPWidenRecipe` + EVL argument. The new recipe replaces `VPWidenRecipe` in `tryAddExplicitVectorLength` for each binary and unary operation. Follow-up patches will extend support for remaining cases, like `FCmp` and `ICmp`
supported by vp intrinsics
VP intrinsics can only accept FMFs at this moment, thus trying to set other flags will lead to ICE
Force-pushed 234479d to 1f8726e
@nikolaypanchenko can merge this patch?
I've fixed an unused variable warning from this PR with ce192b8.
Thanks. I'll double check why I missed it in the first place.
The patch adds `VPWidenEVLRecipe` which represents `VPWidenRecipe` + EVL argument. The new recipe replaces `VPWidenRecipe` in `tryAddExplicitVectorLength` for each binary and unary operation. Follow-up patches will extend support for remaining cases, like `FCmp` and `ICmp`.