Skip to content

[AMDGPU] Add commute for some VOP3 inst #121326

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
f0877bb
add commute for some VOP3 inst, allow commute for both inline constan…
Shoreshen Dec 30, 2024
ae5f1e8
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Dec 31, 2024
df6903c
Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…
Shoreshen Dec 31, 2024
ada83d6
add inline constant case & merge main
Shoreshen Dec 31, 2024
c599af0
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 2, 2025
3a83ae5
fix lit change
Shoreshen Jan 2, 2025
5c9d065
fix lit change
Shoreshen Jan 2, 2025
d3f00dc
Update llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Shoreshen Jan 2, 2025
0e60298
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 2, 2025
707474f
fix comments
Shoreshen Jan 2, 2025
79219d1
fix
Shoreshen Jan 2, 2025
288ead2
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 4, 2025
4becc58
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 5, 2025
53b370a
Add legal check for swap
Shoreshen Jan 6, 2025
0b746a8
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 6, 2025
785921f
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 7, 2025
6955a86
Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…
Shoreshen Jan 8, 2025
5e15e72
add tests & merge_main
Shoreshen Jan 8, 2025
004a82d
fix lit.cfg.py
Shoreshen Jan 8, 2025
dc2739f
fix lit.cfg.py
Shoreshen Jan 8, 2025
378c02c
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 8, 2025
49ae569
Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…
Shoreshen Jan 9, 2025
161a2b9
adjust comment & merge main
Shoreshen Jan 9, 2025
4d569ae
Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…
Shoreshen Jan 10, 2025
1689c1e
adjust case & merge main
Shoreshen Jan 10, 2025
328e566
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 13, 2025
9faf423
fix inconsistent capitalization
Shoreshen Jan 13, 2025
19b8ad4
fix test case
Shoreshen Jan 13, 2025
cc3a125
Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…
Shoreshen Jan 20, 2025
0a89dc9
merge main
Shoreshen Jan 20, 2025
d8e6cb7
fix format
Shoreshen Jan 20, 2025
3c7bd89
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 21, 2025
bf1da57
remove special handling for VOPD, since no commutable VOPD instruction
Shoreshen Jan 21, 2025
789092d
Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…
Shoreshen Jan 21, 2025
6c35280
Merge branch 'main' into Add-isCommutable-attribute-to-VOP3-instructions
Shoreshen Jan 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 111 additions & 45 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2749,6 +2749,63 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
return &MI;
}

static MachineInstr *swapImmOperands(MachineInstr &MI,
MachineOperand &NonRegOp1,
MachineOperand &NonRegOp2) {
unsigned TargetFlags = NonRegOp1.getTargetFlags();
int64_t NonRegVal = NonRegOp1.getImm();

NonRegOp1.setImm(NonRegOp2.getImm());
NonRegOp2.setImm(NonRegVal);
NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
NonRegOp2.setTargetFlags(TargetFlags);
return &MI;
}

bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
const MachineOperand *MO0, unsigned OpIdx1,
const MachineOperand *MO1) const {
const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
const TargetRegisterClass *DefinedRC1 =
OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr;
const TargetRegisterClass *DefinedRC0 =
OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo0.RegClass) : nullptr;

unsigned Opc = MI.getOpcode();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);

// Swap doesn't breach constant bus or literal limits
// It may move literal to position other than src0, this is not allowed
// pre-gfx10 However, most test cases need literals in Src0 for VOP
// FIXME: After gfx9, literal can be in place other than Src0
if (isVALU(MI)) {
if ((int)OpIdx0 == Src0Idx && !MO0->isReg() &&
!isInlineConstant(*MO0, OpInfo1))
return false;
if ((int)OpIdx1 == Src0Idx && !MO1->isReg() &&
!isInlineConstant(*MO1, OpInfo0))
return false;
}

if (OpIdx1 != Src0Idx && MO0->isReg()) {
if (!DefinedRC1)
return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
return isLegalRegOperand(MI, OpIdx1, *MO0);
}
if (OpIdx0 != Src0Idx && MO1->isReg()) {
if (!DefinedRC0)
return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
return isLegalRegOperand(MI, OpIdx0, *MO1);
}

// No need to check 64-bit literals since swapping does not bring new
// 64-bit literals into current instruction to fold to 32-bit

return isImmOperandLegal(MI, OpIdx1, *MO0);
}

MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned Src0Idx,
unsigned Src1Idx) const {
Expand All @@ -2770,21 +2827,20 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,

MachineOperand &Src0 = MI.getOperand(Src0Idx);
MachineOperand &Src1 = MI.getOperand(Src1Idx);

if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) {
return nullptr;
}
MachineInstr *CommutedMI = nullptr;
if (Src0.isReg() && Src1.isReg()) {
if (isOperandLegal(MI, Src1Idx, &Src0)) {
// Be sure to copy the source modifiers to the right place.
CommutedMI
= TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
}

// Be sure to copy the source modifiers to the right place.
CommutedMI =
TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
} else if (Src0.isReg() && !Src1.isReg()) {
if (isOperandLegal(MI, Src1Idx, &Src0))
CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
} else if (!Src0.isReg() && Src1.isReg()) {
if (isOperandLegal(MI, Src1Idx, &Src0))
CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
} else if (Src0.isImm() && Src1.isImm()) {
CommutedMI = swapImmOperands(MI, Src0, Src1);
} else {
// FIXME: Found two non registers to commute. This does happen.
return nullptr;
Expand Down Expand Up @@ -5817,6 +5873,49 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
return RC->hasSuperClassEq(DRC);
}

bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand &MO) const {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
unsigned Opc = MI.getOpcode();

if (!isLegalRegOperand(MRI, OpInfo, MO))
return false;

// check Accumulate GPR operand
bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
if (IsAGPR && !ST.hasMAIInsts())
return false;
if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This !MRI.reservedRegsFrozen() doesn't make any sense but all of this code looks copied directly from existing code

Copy link
Contributor Author

@Shoreshen Shoreshen Jan 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, I just wrap it from the isoperandlegal function to avoid duplicate code. Should I try to remove it??

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not here

(MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
return false;
// Atomics should have both vdst and vdata either vgpr or agpr.
const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
const int DataIdx = AMDGPU::getNamedOperandIdx(
Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
MI.getOperand(DataIdx).isReg() &&
RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
return false;
if ((int)OpIdx == DataIdx) {
if (VDstIdx != -1 &&
RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
return false;
// DS instructions with 2 src operands also must have tied RC.
const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
return false;
}

// Check V_ACCVGPR_WRITE_B32_e64
if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
(int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
RI.isSGPRReg(MRI, MO.getReg()))
return false;
return true;
}

bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
const MCOperandInfo &OpInfo,
const MachineOperand &MO) const {
Expand Down Expand Up @@ -5879,40 +5978,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
if (MO->isReg()) {
if (!DefinedRC)
return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
if (!isLegalRegOperand(MRI, OpInfo, *MO))
return false;
bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
if (IsAGPR && !ST.hasMAIInsts())
return false;
unsigned Opc = MI.getOpcode();
if (IsAGPR &&
(!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
(MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
return false;
// Atomics should have both vdst and vdata either vgpr or agpr.
const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
MI.getOperand(DataIdx).isReg() &&
RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
return false;
if ((int)OpIdx == DataIdx) {
if (VDstIdx != -1 &&
RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
return false;
// DS instructions with 2 src operands also must have tied RC.
const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
AMDGPU::OpName::data1);
if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
return false;
}
if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
(int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
RI.isSGPRReg(MRI, MO->getReg()))
return false;
return true;
return isLegalRegOperand(MI, OpIdx, *MO);
}

if (MO->isImm()) {
Expand Down
10 changes: 7 additions & 3 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
bool swapSourceModifiers(MachineInstr &MI,
MachineOperand &Src0, unsigned Src0OpName,
MachineOperand &Src1, unsigned Src1OpName) const;

bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx,
const MachineOperand *fromMO, unsigned toIdx,
const MachineOperand *toMO) const;
MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx0,
unsigned OpIdx1) const override;
Expand Down Expand Up @@ -1218,11 +1220,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
const MachineOperand &MO) const;

/// Check if \p MO (a register operand) is a legal register for the
/// given operand description.
/// given operand description or operand index.
/// The operand index version provide more legality checks
bool isLegalRegOperand(const MachineRegisterInfo &MRI,
const MCOperandInfo &OpInfo,
const MachineOperand &MO) const;

bool isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand &MO) const;
/// Legalize operands in \p MI by either commuting it or inserting a
/// copy of src1.
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;
Expand Down
15 changes: 10 additions & 5 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,9 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
let FPDPRounding = 1 in {
let Predicates = [Has16BitInsts, isGFX8Only] in {
defm V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma>;
let isCommutable = 1 in {
defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma>;
} // End isCommutable = 1
} // End Predicates = [Has16BitInsts, isGFX8Only]

let SubtargetPredicate = isGFX9Plus in {
Expand Down Expand Up @@ -639,8 +641,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>;

defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
let isCommutable = 1 in {
defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
} // End isCommutable = 1

defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>;
defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>;
Expand Down Expand Up @@ -1254,8 +1258,9 @@ let SubtargetPredicate = isGFX10Plus in {
def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
}

defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>;
let isCommutable = 1 in {
defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>;
} // End isCommutable = 1
defm V_SUB_NC_U16 : VOP3Inst_t16 <"v_sub_nc_u16", VOP_I16_I16_I16, sub>;

} // End SubtargetPredicate = isGFX10Plus
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/carryout-selection.ll
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1010-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0, 0x1234, s2
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0x1234, 0, s2
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1010-NEXT: s_endpgm
Expand All @@ -365,7 +365,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1030W32-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s2
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W32-NEXT: s_endpgm
Expand All @@ -375,7 +375,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1030W64-NEXT: v_add_co_u32 v0, s[2:3], 0x56789876, v0
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[2:3]
; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3]
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1030W64-NEXT: s_endpgm
Expand All @@ -387,7 +387,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/cmp_shrink.mir
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ name: not_shrink_icmp
body: |
bb.0:
; GCN-LABEL: name: not_shrink_icmp
; GCN: S_CMP_GT_I32 1, 65, implicit-def $scc
; GCN: S_CMP_LT_I32 65, 1, implicit-def $scc
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pointless swap but probably not this patch's problem

S_CMP_GT_I32 1, 65, implicit-def $scc
...
Loading
Loading