Skip to content

[NVPTX] Improve folding to mad with immediate 1 #93628

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 92 additions & 6 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5614,17 +5614,103 @@ static SDValue TryMULWIDECombine(SDNode *N,
return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
}

/// Returns true when \p Operand is a ConstantSDNode whose zero-extended
/// value equals 1; any non-constant operand yields false.
static bool isConstOne(const SDValue &Operand) {
  if (const auto *C = dyn_cast<ConstantSDNode>(Operand))
    return C->getZExtValue() == 1;
  return false;
}

/// Matches (add y, 1) or (add 1, y).
///
/// \returns the non-constant operand `y` on a match, or an empty SDValue when
/// \p Add is not an ISD::ADD or neither operand is the constant 1.
static SDValue matchMADConstOnePattern(SDValue Add) {
  if (Add->getOpcode() != ISD::ADD)
    return SDValue();

  // Probe operand 0 first, then operand 1; the result is always the operand
  // opposite the constant.
  for (unsigned OneIdx = 0; OneIdx != 2; ++OneIdx)
    if (isConstOne(Add->getOperand(OneIdx)))
      return Add->getOperand(1 - OneIdx);

  return SDValue();
}

/// Folds (mul x, (add y, 1)) into (mad x, y, x).
///
/// \param X   the multiplicand that is not the add.
/// \param Add the candidate add-by-one operand.
/// \returns an NVPTXISD::IMAD node of type \p VT, or an empty SDValue when
/// \p Add does not match the add-by-one pattern.
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Y = matchMADConstOnePattern(Add);
  if (!Y)
    return SDValue();

  // x * (y + 1) == x * y + x
  return DCI.DAG.getNode(NVPTXISD::IMAD, DL, VT, X, Y, X);
}

/// Pushes a multiply through a select that has the constant 1 in one arm:
///   (mul x, (select c, 1, y)) -> (select c, x, (mul x, y))
///   (mul x, (select c, y, 1)) -> (select c, (mul x, y), x)
///
/// The rewrite is only performed when `y` is itself an add-by-one, so that
/// the new multiply can subsequently be folded into a mad.
///
/// \returns the new ISD::SELECT node, or an empty SDValue if nothing matched.
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,
                                        SDLoc DL,
                                        TargetLowering::DAGCombinerInfo &DCI) {
  if (Select->getOpcode() != ISD::SELECT)
    return SDValue();

  SDValue Cond = Select->getOperand(0);
  SDValue TrueVal = Select->getOperand(1);
  SDValue FalseVal = Select->getOperand(2);

  // Prefer the true arm when looking for the constant 1.
  const bool OneOnTrue = isConstOne(TrueVal);
  if (!OneOnTrue && !isConstOne(FalseVal))
    return SDValue();

  // Y is whichever arm does not hold the constant.
  SDValue Y = OneOnTrue ? FalseVal : TrueVal;

  // Do not combine if the resulting sequence is not obviously profitable.
  if (!matchMADConstOnePattern(Y))
    return SDValue();

  SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);

  // The arm that held 1 becomes X (x * 1); the other arm becomes x * y.
  SDValue NewTrue = OneOnTrue ? X : NewMul;
  SDValue NewFalse = OneOnTrue ? NewMul : X;
  return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond, NewTrue, NewFalse);
}

/// Tries the mad-related folds on a scalar i16/i32/i64 multiply with operands
/// \p N0 and \p N1, checking each pattern with the operands in both orders.
///
/// \returns the replacement value, or an empty SDValue if the type is not
/// handled or no pattern matched.
static SDValue
PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                              TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N0.getValueType();

  // Only scalar 16-, 32- and 64-bit integers are handled.
  if (VT.isVector() ||
      (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64))
    return SDValue();

  SDLoc DL(N);

  // (mul x, (add y, 1)) -> (mad x, y, x)
  if (SDValue Mad = combineMADConstOne(N0, N1, VT, DL, DCI))
    return Mad;
  if (SDValue Mad = combineMADConstOne(N1, N0, VT, DL, DCI))
    return Mad;

  // (mul x, (select c, y, 1)) -> (select c, (mul x, y), x)
  // (and the mirrored form with the constant in the true arm)
  if (SDValue Sel = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
    return Sel;
  if (SDValue Sel = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
    return Sel;

  return SDValue();
}

/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
///
/// Nothing is attempted at CodeGenOptLevel::None. At any higher level the
/// mul.wide combine is tried first; if it produces nothing, the mad and
/// select folds in PerformMULCombineWithOperands are tried on the node's two
/// operands.
///
/// \returns the replacement value, or an empty SDValue if no pattern matched.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel == CodeGenOptLevel::None)
    return SDValue();

  // Try mul.wide combining first.
  if (SDValue Ret = TryMULWIDECombine(N, DCI))
    return Ret;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  return PerformMULCombineWithOperands(N, N0, N1, DCI);
}

/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
Expand Down
136 changes: 136 additions & 0 deletions llvm/test/CodeGen/NVPTX/combine-mad.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -O1 | FileCheck %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -O1 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -O1 | %ptxas-verify %}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs to be disabled with newer ptxas.

RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %}

; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -O1 | %ptxas-verify %}

; (mul (add n, 1), m): the add-by-one is the left multiplicand. The expected
; output is a single mad.lo.s32 that uses m as both a factor and the addend,
; with no separate add or mul.
define i32 @test1(i32 %n, i32 %m) {
;
; CHECK-LABEL: test1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test1_param_0];
; CHECK-NEXT: ld.param.u32 %r2, [test1_param_1];
; CHECK-NEXT: mad.lo.s32 %r3, %r2, %r1, %r2;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
; CHECK-NEXT: ret;
%add = add i32 %n, 1
%mul = mul i32 %add, %m
ret i32 %mul
}

; Same as test1 but with the mul operands swapped: (mul m, (add n, 1)).
; The expected mad.lo.s32 is identical.
define i32 @test1_rev(i32 %n, i32 %m) {
;
; CHECK-LABEL: test1_rev(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test1_rev_param_0];
; CHECK-NEXT: ld.param.u32 %r2, [test1_rev_param_1];
; CHECK-NEXT: mad.lo.s32 %r3, %r2, %r1, %r2;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
; CHECK-NEXT: ret;
%add = add i32 %n, 1
%mul = mul i32 %m, %add
ret i32 %mul
}

; Transpose (mul (select)) if it can then be folded to mad
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it buy us anything?

mul(m,select(1,n)) will probably have the same performance as select(mul(m,n), m) as the critical path will always have mul and select, just in different order.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By itself this transform doesn't help much, I agree. However, if m or n is add(x,1), then it enables the other transformation (folding to mad). In the code we check for this case and only run the transformation when it would enable that further folding. A rare case to be sure, but better to support it than not.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This kind of optimization is not target-specific and should probably be done somewhere in instcombine. Perhaps move the optimization of mul(m,select(1,n)) there as a separate patch?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instcombine already canonicalizes in the opposite direction, select(mul(m,n), m) -> mul(m,select(1,n)). I think this is target specific because it is only worth doing to improve mad folding.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK.

; (mul (select c, 1, (add n, 1)), m): the constant 1 is in the true arm of
; the select. The expected output computes the mad unconditionally and then
; uses selp to pick m when the predicate holds and the mad result otherwise.
define i32 @test2(i32 %n, i32 %m, i32 %s) {
;
; CHECK-LABEL: test2(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test2_param_0];
; CHECK-NEXT: ld.param.u32 %r2, [test2_param_1];
; CHECK-NEXT: ld.param.u32 %r3, [test2_param_2];
; CHECK-NEXT: setp.lt.s32 %p1, %r3, 1;
; CHECK-NEXT: mad.lo.s32 %r4, %r2, %r1, %r2;
; CHECK-NEXT: selp.b32 %r5, %r2, %r4, %p1;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5;
; CHECK-NEXT: ret;
%add = add i32 %n, 1
%cond = icmp slt i32 %s, 1
%sel = select i1 %cond, i32 1, i32 %add
%mul = mul i32 %sel, %m
ret i32 %mul
}

;; Transpose (mul (select)) if it can then be folded to mad
;; Variant of test2 with the constant 1 in the false arm of the select:
;; (mul (select c, (add n, 1), 1), m). The expected selp picks the mad result
;; when the predicate holds and m otherwise.
define i32 @test2_rev1(i32 %n, i32 %m, i32 %s) {
;
; CHECK-LABEL: test2_rev1(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test2_rev1_param_0];
; CHECK-NEXT: ld.param.u32 %r2, [test2_rev1_param_1];
; CHECK-NEXT: ld.param.u32 %r3, [test2_rev1_param_2];
; CHECK-NEXT: setp.lt.s32 %p1, %r3, 1;
; CHECK-NEXT: mad.lo.s32 %r4, %r2, %r1, %r2;
; CHECK-NEXT: selp.b32 %r5, %r4, %r2, %p1;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5;
; CHECK-NEXT: ret;
%add = add i32 %n, 1
%cond = icmp slt i32 %s, 1
%sel = select i1 %cond, i32 %add, i32 1
%mul = mul i32 %sel, %m
ret i32 %mul
}

;; Transpose (mul (select)) if it can then be folded to mad
;; Variant of test2_rev1 with the mul operands swapped, so the select is the
;; right-hand multiplicand: (mul m, (select c, (add n, 1), 1)). The expected
;; output matches test2_rev1.
define i32 @test2_rev2(i32 %n, i32 %m, i32 %s) {
;
; CHECK-LABEL: test2_rev2(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test2_rev2_param_0];
; CHECK-NEXT: ld.param.u32 %r2, [test2_rev2_param_1];
; CHECK-NEXT: ld.param.u32 %r3, [test2_rev2_param_2];
; CHECK-NEXT: setp.lt.s32 %p1, %r3, 1;
; CHECK-NEXT: mad.lo.s32 %r4, %r2, %r1, %r2;
; CHECK-NEXT: selp.b32 %r5, %r4, %r2, %p1;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5;
; CHECK-NEXT: ret;
%add = add i32 %n, 1
%cond = icmp slt i32 %s, 1
%sel = select i1 %cond, i32 %add, i32 1
%mul = mul i32 %m, %sel
ret i32 %mul
}

;; Leave (mul (select)) intact if transposing it is not profitable
;; Here the non-constant select arm is (add n, 3), not an add-by-one, so no
;; mad can be formed. The expected output keeps the original order: add, then
;; selp, then mul.lo.
define i32 @test3(i32 %n, i32 %m, i32 %s) {
;
; CHECK-LABEL: test3(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test3_param_0];
; CHECK-NEXT: add.s32 %r2, %r1, 3;
; CHECK-NEXT: ld.param.u32 %r3, [test3_param_1];
; CHECK-NEXT: ld.param.u32 %r4, [test3_param_2];
; CHECK-NEXT: setp.lt.s32 %p1, %r4, 1;
; CHECK-NEXT: selp.b32 %r5, 1, %r2, %p1;
; CHECK-NEXT: mul.lo.s32 %r6, %r5, %r3;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r6;
; CHECK-NEXT: ret;
%add = add i32 %n, 3
%cond = icmp slt i32 %s, 1
%sel = select i1 %cond, i32 1, i32 %add
%mul = mul i32 %sel, %m
ret i32 %mul
}
Loading