llvm · rohitaggarwal007 · Mar 12, 2025 · Mar 12, 2025 · Mar 12, 2025 · Mar 12, 2025
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5127,6 +5127,13 @@ class TargetLowering : public TargetLoweringBase {
                                               SmallVectorImpl<SDValue> &Ops,
                                               SelectionDAG &DAG) const;
 
+  // Targets may override this to rewrite the Base and Index of a non-uniform
+  // GEP feeding a masked gather/scatter. Returns true if they were updated.
+  virtual bool updateBaseAndIndex(const Value *Ptr, SDValue &Base,
+                                  SDValue &Index, const SDLoc &DL,
+                                  const SDValue &Gep, SelectionDAG &DAG,
+                                  const BasicBlock *CurBB) const;
+
   //===--------------------------------------------------------------------===//
   // Div utility functions
   //

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4905,6 +4905,11 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
     Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
   }
 
+  if (!UniformBase) {
+    TLI.updateBaseAndIndex(Ptr, Base, Index, getCurSDLoc(), getValue(Ptr), DAG,
+                           I.getParent());
+  }
+
   EVT IdxVT = Index.getValueType();
   EVT EltTy = IdxVT.getVectorElementType();
   if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
@@ -5024,6 +5029,11 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
     Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
   }
 
+  if (!UniformBase) {
+    TLI.updateBaseAndIndex(Ptr, Base, Index, getCurSDLoc(), getValue(Ptr), DAG,
+                           I.getParent());
+  }
+
   EVT IdxVT = Index.getValueType();
   EVT EltTy = IdxVT.getVectorElementType();
   if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5655,6 +5655,14 @@ void TargetLowering::CollectTargetIntrinsicOperands(
     const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
 }
 
+// Default: no rewrite is performed (returns false); targets override to opt in.
+bool TargetLowering::updateBaseAndIndex(const Value *Ptr, SDValue &Base,
+                                        SDValue &Index, const SDLoc &DL,
+                                        const SDValue &Gep, SelectionDAG &DAG,
+                                        const BasicBlock *CurBB) const {
+  return false;
+}
+
 std::pair<unsigned, const TargetRegisterClass *>
 TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,
                                              StringRef Constraint,

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -126,6 +126,11 @@ static cl::opt<bool> MulConstantOptimization(
              "SHIFT, LEA, etc."),
     cl::Hidden);
 
+// Hidden switch (default on) consulted first by updateBaseAndIndex().
+static cl::opt<bool> EnableBaseIndexUpdate(
+    "update-baseIndex", cl::Hidden, cl::init(true),
+    cl::desc("Update the value of base and index"));
+
 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                      const X86Subtarget &STI)
     : TargetLowering(TM), Subtarget(STI) {
@@ -61619,3 +61624,109 @@ Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
     return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
   return TargetLowering::getPrefLoopAlignment();
 }
+
+// X86 override of TargetLowering::updateBaseAndIndex, used for masked
+// gather/scatter addresses for which no uniform base was found.
+//
+// \p Gep is matched against one of
+//   (add splat(base), (shl (sext|zext idx), splat(C)))
+//   (add (add splat(base), (shl (sext|zext idx), splat(C))), splat(Off))
+// where idx has elements of at most 32 bits and Gep has elements wider than
+// 32 bits. On a match, \p Base becomes the scalar base and \p Index becomes
+// (idx << C) [+ Off] computed in idx's own, narrower vector type.
+//
+// The rewrite is only performed when known bits prove that sign-extending the
+// narrow result reproduces the wide one: idx << C has to stay non-negative
+// and adding Off must not overflow. Returns true iff Base and Index were
+// replaced; on false both are left untouched.
+bool X86TargetLowering::updateBaseAndIndex(const Value *Ptr, SDValue &Base,
+                                           SDValue &Index, const SDLoc &DL,
+                                           const SDValue &Gep,
+                                           SelectionDAG &DAG,
+                                           const BasicBlock *CurBB) const {
+  if (!EnableBaseIndexUpdate)
+    return false;
+
+  // A GEP defined in another block than the gather/scatter is not handled.
+  const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  if (GEP && GEP->getParent() != CurBB)
+    return false;
+
+  if (Gep.getOpcode() != ISD::ADD)
+    return false;
+
+  // Peel an optional outer "+ splat(Off)" so that SplatBase and Scaled always
+  // name the two operands of the innermost add.
+  SDValue SplatBase = Gep.getOperand(0);
+  SDValue Scaled = Gep.getOperand(1);
+  const ConstantSDNode *OffsetC = nullptr;
+  if (SplatBase.getOpcode() == ISD::ADD) {
+    // Every lane has to add the same constant, not only lane 0.
+    OffsetC = isConstOrConstSplat(Scaled);
+    if (!OffsetC)
+      return false;
+    Scaled = SplatBase.getOperand(1);
+    SplatBase = SplatBase.getOperand(0);
+  }
+
+  // The base must be a single scalar broadcast to all lanes; otherwise no one
+  // value can serve as Base.
+  const auto *BaseBV = dyn_cast<BuildVectorSDNode>(SplatBase);
+  SDValue ScalarBase = BaseBV ? BaseBV->getSplatValue() : SDValue();
+  if (!ScalarBase)
+    return false;
+  // Without a constant offset, only a base copied from a register is matched.
+  if (!OffsetC && ScalarBase.getOpcode() != ISD::CopyFromReg)
+    return false;
+
+  if (Scaled.getOpcode() != ISD::SHL)
+    return false;
+  SDValue Ext = Scaled.getOperand(0);
+  const ConstantSDNode *ShAmtC = isConstOrConstSplat(Scaled.getOperand(1));
+  if (!ShAmtC || (Ext.getOpcode() != ISD::SIGN_EXTEND &&
+                  Ext.getOpcode() != ISD::ZERO_EXTEND))
+    return false;
+
+  SDValue Narrow = Ext.getOperand(0);
+  EVT NarrowVT = Narrow.getValueType();
+  unsigned NarrowBits = NarrowVT.getScalarSizeInBits();
+  if (Ext.getScalarValueSizeInBits() <= 32 || NarrowBits > 32)
+    return false;
+
+  // idx << C must fit in NarrowBits - 1 bits: then the narrow shift drops no
+  // set bit and its result is non-negative. This also rejects any idx that may
+  // be negative, as such an idx has no known leading zero.
+  KnownBits NarrowKnown = DAG.computeKnownBits(Narrow);
+  if (ShAmtC->getAPIntValue().uge(NarrowKnown.countMinLeadingZeros()))
+    return false;
+  unsigned ShAmt = ShAmtC->getZExtValue();
+
+  // Off must be representable in the narrow type, and adding it to the largest
+  // possible shifted index must not overflow.
+  APInt NarrowOffset(NarrowBits, 0);
+  if (OffsetC) {
+    const APInt &Offset = OffsetC->getAPIntValue();
+    if (!Offset.isSignedIntN(NarrowBits))
+      return false;
+    NarrowOffset = Offset.trunc(NarrowBits);
+    bool Overflow = false;
+    (void)NarrowKnown.getMaxValue().shl(ShAmt).sadd_ov(NarrowOffset, Overflow);
+    if (Overflow)
+      return false;
+  }
+
+  SDValue NewIndex = DAG.getNode(ISD::SHL, DL, NarrowVT, Narrow,
+                                 DAG.getConstant(ShAmt, DL, NarrowVT));
+  // The wide add's wrap flags are not copied; they were set for the wide type.
+  if (OffsetC)
+    NewIndex = DAG.getNode(ISD::ADD, DL, NarrowVT, NewIndex,
+                           DAG.getConstant(NarrowOffset, DL, NarrowVT));
+
+  Base = ScalarBase;
+  Index = NewIndex;
+  LLVM_DEBUG(dbgs() << "Successfull in updating the non uniform gep "
+                       "information\n";
+             dbgs() << "updated base "; Base.dump();
+             dbgs() << "updated Index "; Index.dump(););
+  return true;
+}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1671,6 +1671,13 @@ namespace llvm {
       return TargetLoweringBase::getTypeToTransformTo(Context, VT);
     }
 
+    // X86 override: rewrites Base and Index of a masked gather/scatter whose
+    // address is a non-uniform GEP; returns true iff both were replaced.
+    bool updateBaseAndIndex(const Value *Ptr, SDValue &Base, SDValue &Index,
+                            const SDLoc &DL, const SDValue &Gep,
+                            SelectionDAG &DAG,
+                            const BasicBlock *CurBB) const override;
+
   protected:
     std::pair<const TargetRegisterClass *, uint8_t>
     findRepresentativeClass(const TargetRegisterInfo *TRI,

diff --git a/llvm/test/CodeGen/X86/gatherBaseIndexFix.ll b/llvm/test/CodeGen/X86/gatherBaseIndexFix.ll
@@ -0,0 +1,54 @@
+; RUN: llc  -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -mcpu=znver5 < %s | FileCheck %s
+; RUN: llc -update-baseIndex -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -mcpu=znver5 < %s | FileCheck %s
+; RUN: llc -update-baseIndex=false -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -mcpu=znver5 < %s | FileCheck %s -check-prefix=OLD
+
+%struct.pt = type { float, float, float, i32 }
+
+; The index is masked to 27 bits so that scaling it by sizeof(%struct.pt) = 16
+; (plus the field offset in the second test) still fits a non-negative i32.
+
+; CHECK-LABEL: test_gather_16f32_1:
+; CHECK:   vgatherdps
+
+; OLD-LABEL: test_gather_16f32_1:
+; OLD:  vgatherqps
+; OLD:  vgatherqps
+
+define <16 x float> @test_gather_16f32_1(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0)  {
+  %wide.load = load <16 x i32>, ptr %arr, align 4
+  %4 = and <16 x i32> %wide.load, <i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727>
+  %5 = zext <16 x i32> %4 to <16 x i64>
+  %ptrs = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %5
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
+  ret <16 x float> %res
+  }
+
+; CHECK-LABEL: test_gather_16f32_2:
+; CHECK:   vgatherdps
+
+; OLD-LABEL: test_gather_16f32_2:
+; OLD:  vgatherqps
+; OLD:  vgatherqps
+
+define <16 x float> @test_gather_16f32_2(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0)  {
+  %wide.load = load <16 x i32>, ptr %arr, align 4
+  %4 = and <16 x i32> %wide.load, <i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727, i32 134217727>
+  %5 = zext <16 x i32> %4 to <16 x i64>
+  %ptrs = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %5, i32 1
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
+  ret <16 x float> %res
+  }
+
+; A 29-bit index scaled by 16 needs 33 bits, so the index must stay 64-bit.
+; CHECK-LABEL: test_gather_16f32_wide_index:
+; CHECK:   vgatherqps
+; CHECK:   vgatherqps
+
+define <16 x float> @test_gather_16f32_wide_index(ptr %x, ptr %arr, <16 x i1> %mask, <16 x float> %src0)  {
+  %wide.load = load <16 x i32>, ptr %arr, align 4
+  %4 = and <16 x i32> %wide.load, <i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911, i32 536870911>
+  %5 = zext <16 x i32> %4 to <16 x i64>
+  %ptrs = getelementptr inbounds %struct.pt, ptr %x, <16 x i64> %5
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
+  ret <16 x float> %res
+  }