Skip to content

Commit 8a66510

Browse files
authored
[AMDGPU] Don't create mulhi_24 in CGP (llvm#72983)
Instead, create a mul24 with a 64-bit result and let ISel take care of it. This allows patterns to simply match mul24 even for 64-bit muls, instead of having to match both mul/mulhi and a buildvector/bitconvert/etc.
1 parent c0b9269 commit 8a66510

8 files changed

+123
-243
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1998,12 +1998,14 @@ def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">,
19981998
[IntrNoMem, IntrSpeculatable]
19991999
>;
20002000

2001-
def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
2001+
// mul24 intrinsics can return i32 or i64.
2002+
// When returning i64, they're lowered to a mul24/mulhi24 pair.
2003+
def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_anyint_ty],
20022004
[llvm_i32_ty, llvm_i32_ty],
20032005
[IntrNoMem, IntrSpeculatable]
20042006
>;
20052007

2006-
def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
2008+
def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_anyint_ty],
20072009
[llvm_i32_ty, llvm_i32_ty],
20082010
[IntrNoMem, IntrSpeculatable]
20092011
>;

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 13 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -624,34 +624,6 @@ static Value *insertValues(IRBuilder<> &Builder,
624624
return NewVal;
625625
}
626626

627-
// Returns 24-bit or 48-bit (as per `NumBits` and `Size`) mul of `LHS` and
628-
// `RHS`. `NumBits` is the number of KnownBits of the result and `Size` is the
629-
// width of the original destination.
630-
static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
631-
unsigned Size, unsigned NumBits, bool IsSigned) {
632-
if (Size <= 32 || NumBits <= 32) {
633-
Intrinsic::ID ID =
634-
IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
635-
return Builder.CreateIntrinsic(ID, {}, {LHS, RHS});
636-
}
637-
638-
assert(NumBits <= 48);
639-
640-
Intrinsic::ID LoID =
641-
IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
642-
Intrinsic::ID HiID =
643-
IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24;
644-
645-
Value *Lo = Builder.CreateIntrinsic(LoID, {}, {LHS, RHS});
646-
Value *Hi = Builder.CreateIntrinsic(HiID, {}, {LHS, RHS});
647-
648-
IntegerType *I64Ty = Builder.getInt64Ty();
649-
Lo = Builder.CreateZExtOrTrunc(Lo, I64Ty);
650-
Hi = Builder.CreateZExtOrTrunc(Hi, I64Ty);
651-
652-
return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32));
653-
}
654-
655627
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
656628
if (I.getOpcode() != Instruction::Mul)
657629
return false;
@@ -691,26 +663,20 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
691663
extractValues(Builder, RHSVals, RHS);
692664

693665
IntegerType *I32Ty = Builder.getInt32Ty();
694-
for (int I = 0, E = LHSVals.size(); I != E; ++I) {
695-
Value *LHS, *RHS;
696-
if (IsSigned) {
697-
LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
698-
RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
699-
} else {
700-
LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
701-
RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
702-
}
666+
IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
667+
Type *DstTy = LHSVals[0]->getType();
703668

704-
Value *Result =
705-
getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned);
706-
707-
if (IsSigned) {
708-
ResultVals.push_back(
709-
Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType()));
710-
} else {
711-
ResultVals.push_back(
712-
Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType()));
713-
}
669+
for (int I = 0, E = LHSVals.size(); I != E; ++I) {
670+
Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
671+
: Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
672+
Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
673+
: Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
674+
Intrinsic::ID ID =
675+
IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
676+
Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
677+
Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
678+
: Builder.CreateZExtOrTrunc(Result, DstTy);
679+
ResultVals.push_back(Result);
714680
}
715681

716682
Value *NewVal = insertValues(Builder, Ty, ResultVals);

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -281,11 +281,15 @@ def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>;
281281
def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>;
282282

283283
// Signed and unsigned 24-bit multiply. The highest 8 bits are ignored
284-
// when performing the multiply. The result is a 32-bit value.
285-
def AMDGPUmul_u24_impl : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
284+
// when performing the multiply. The result is a 32-bit or 64-bit value.
285+
def AMDGPUMul24Op : SDTypeProfile<1, 2, [
286+
SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>
287+
]>;
288+
289+
def AMDGPUmul_u24_impl : SDNode<"AMDGPUISD::MUL_U24", AMDGPUMul24Op,
286290
[SDNPCommutative, SDNPAssociative]
287291
>;
288-
def AMDGPUmul_i24_impl : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
292+
def AMDGPUmul_i24_impl : SDNode<"AMDGPUISD::MUL_I24", AMDGPUMul24Op,
289293
[SDNPCommutative, SDNPAssociative]
290294
>;
291295

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -862,6 +862,17 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>;
862862
def : divergent_i64_BinOp <or, V_OR_B32_e64>;
863863
def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
864864

865+
// mul24 w/ 64 bit output.
866+
class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
867+
(i64 (Op i32:$src0, i32:$src1)),
868+
(REG_SEQUENCE VReg_64,
869+
(InstLo $src0, $src1), sub0,
870+
(InstHi $src0, $src1), sub1)
871+
>;
872+
873+
def : mul24_64_Pat<AMDGPUmul_i24, V_MUL_I32_I24_e64, V_MUL_HI_I32_I24_e64>;
874+
def : mul24_64_Pat<AMDGPUmul_u24, V_MUL_U32_U24_e64, V_MUL_HI_U32_U24_e64>;
875+
865876
//===----------------------------------------------------------------------===//
866877
// 16-Bit Operand Instructions
867878
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)