Skip to content

Commit 7c58d63

Browse files
authored
[AMDGPU] Add commute for some VOP3 inst (llvm#121326)
add commute for some VOP3 inst, allow commute for both inline constant operand, adjust tests Fixes llvm#111205
1 parent 8c2030b commit 7c58d63

11 files changed

+279
-98
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 111 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -2749,6 +2749,63 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
27492749
return &MI;
27502750
}
27512751

2752+
static MachineInstr *swapImmOperands(MachineInstr &MI,
2753+
MachineOperand &NonRegOp1,
2754+
MachineOperand &NonRegOp2) {
2755+
unsigned TargetFlags = NonRegOp1.getTargetFlags();
2756+
int64_t NonRegVal = NonRegOp1.getImm();
2757+
2758+
NonRegOp1.setImm(NonRegOp2.getImm());
2759+
NonRegOp2.setImm(NonRegVal);
2760+
NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2761+
NonRegOp2.setTargetFlags(TargetFlags);
2762+
return &MI;
2763+
}
2764+
2765+
bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2766+
const MachineOperand *MO0, unsigned OpIdx1,
2767+
const MachineOperand *MO1) const {
2768+
const MCInstrDesc &InstDesc = MI.getDesc();
2769+
const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2770+
const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2771+
const TargetRegisterClass *DefinedRC1 =
2772+
OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr;
2773+
const TargetRegisterClass *DefinedRC0 =
2774+
OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo0.RegClass) : nullptr;
2775+
2776+
unsigned Opc = MI.getOpcode();
2777+
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2778+
2779+
// Swap doesn't breach constant bus or literal limits
2780+
// It may move literal to position other than src0, this is not allowed
2781+
// pre-gfx10 However, most test cases need literals in Src0 for VOP
2782+
// FIXME: After gfx9, literal can be in place other than Src0
2783+
if (isVALU(MI)) {
2784+
if ((int)OpIdx0 == Src0Idx && !MO0->isReg() &&
2785+
!isInlineConstant(*MO0, OpInfo1))
2786+
return false;
2787+
if ((int)OpIdx1 == Src0Idx && !MO1->isReg() &&
2788+
!isInlineConstant(*MO1, OpInfo0))
2789+
return false;
2790+
}
2791+
2792+
if (OpIdx1 != Src0Idx && MO0->isReg()) {
2793+
if (!DefinedRC1)
2794+
return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2795+
return isLegalRegOperand(MI, OpIdx1, *MO0);
2796+
}
2797+
if (OpIdx0 != Src0Idx && MO1->isReg()) {
2798+
if (!DefinedRC0)
2799+
return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2800+
return isLegalRegOperand(MI, OpIdx0, *MO1);
2801+
}
2802+
2803+
// No need to check 64-bit literals since swapping does not bring new
2804+
// 64-bit literals into current instruction to fold to 32-bit
2805+
2806+
return isImmOperandLegal(MI, OpIdx1, *MO0);
2807+
}
2808+
27522809
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
27532810
unsigned Src0Idx,
27542811
unsigned Src1Idx) const {
@@ -2770,21 +2827,20 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
27702827

27712828
MachineOperand &Src0 = MI.getOperand(Src0Idx);
27722829
MachineOperand &Src1 = MI.getOperand(Src1Idx);
2773-
2830+
if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) {
2831+
return nullptr;
2832+
}
27742833
MachineInstr *CommutedMI = nullptr;
27752834
if (Src0.isReg() && Src1.isReg()) {
2776-
if (isOperandLegal(MI, Src1Idx, &Src0)) {
2777-
// Be sure to copy the source modifiers to the right place.
2778-
CommutedMI
2779-
= TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2780-
}
2781-
2835+
// Be sure to copy the source modifiers to the right place.
2836+
CommutedMI =
2837+
TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
27822838
} else if (Src0.isReg() && !Src1.isReg()) {
2783-
if (isOperandLegal(MI, Src1Idx, &Src0))
2784-
CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2839+
CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
27852840
} else if (!Src0.isReg() && Src1.isReg()) {
2786-
if (isOperandLegal(MI, Src1Idx, &Src0))
2787-
CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2841+
CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2842+
} else if (Src0.isImm() && Src1.isImm()) {
2843+
CommutedMI = swapImmOperands(MI, Src0, Src1);
27882844
} else {
27892845
// FIXME: Found two non registers to commute. This does happen.
27902846
return nullptr;
@@ -5817,6 +5873,49 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
58175873
return RC->hasSuperClassEq(DRC);
58185874
}
58195875

5876+
bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
5877+
const MachineOperand &MO) const {
5878+
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5879+
const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
5880+
unsigned Opc = MI.getOpcode();
5881+
5882+
if (!isLegalRegOperand(MRI, OpInfo, MO))
5883+
return false;
5884+
5885+
// check Accumulate GPR operand
5886+
bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
5887+
if (IsAGPR && !ST.hasMAIInsts())
5888+
return false;
5889+
if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5890+
(MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5891+
return false;
5892+
// Atomics should have both vdst and vdata either vgpr or agpr.
5893+
const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5894+
const int DataIdx = AMDGPU::getNamedOperandIdx(
5895+
Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5896+
if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5897+
MI.getOperand(DataIdx).isReg() &&
5898+
RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5899+
return false;
5900+
if ((int)OpIdx == DataIdx) {
5901+
if (VDstIdx != -1 &&
5902+
RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5903+
return false;
5904+
// DS instructions with 2 src operands also must have tied RC.
5905+
const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
5906+
if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5907+
RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5908+
return false;
5909+
}
5910+
5911+
// Check V_ACCVGPR_WRITE_B32_e64
5912+
if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5913+
(int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5914+
RI.isSGPRReg(MRI, MO.getReg()))
5915+
return false;
5916+
return true;
5917+
}
5918+
58205919
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
58215920
const MCOperandInfo &OpInfo,
58225921
const MachineOperand &MO) const {
@@ -5879,40 +5978,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
58795978
if (MO->isReg()) {
58805979
if (!DefinedRC)
58815980
return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5882-
if (!isLegalRegOperand(MRI, OpInfo, *MO))
5883-
return false;
5884-
bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5885-
if (IsAGPR && !ST.hasMAIInsts())
5886-
return false;
5887-
unsigned Opc = MI.getOpcode();
5888-
if (IsAGPR &&
5889-
(!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5890-
(MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5891-
return false;
5892-
// Atomics should have both vdst and vdata either vgpr or agpr.
5893-
const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5894-
const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
5895-
isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5896-
if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5897-
MI.getOperand(DataIdx).isReg() &&
5898-
RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5899-
return false;
5900-
if ((int)OpIdx == DataIdx) {
5901-
if (VDstIdx != -1 &&
5902-
RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5903-
return false;
5904-
// DS instructions with 2 src operands also must have tied RC.
5905-
const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5906-
AMDGPU::OpName::data1);
5907-
if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5908-
RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5909-
return false;
5910-
}
5911-
if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5912-
(int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5913-
RI.isSGPRReg(MRI, MO->getReg()))
5914-
return false;
5915-
return true;
5981+
return isLegalRegOperand(MI, OpIdx, *MO);
59165982
}
59175983

59185984
if (MO->isImm()) {

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
193193
bool swapSourceModifiers(MachineInstr &MI,
194194
MachineOperand &Src0, unsigned Src0OpName,
195195
MachineOperand &Src1, unsigned Src1OpName) const;
196-
196+
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx,
197+
const MachineOperand *fromMO, unsigned toIdx,
198+
const MachineOperand *toMO) const;
197199
MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
198200
unsigned OpIdx0,
199201
unsigned OpIdx1) const override;
@@ -1218,11 +1220,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
12181220
const MachineOperand &MO) const;
12191221

12201222
/// Check if \p MO (a register operand) is a legal register for the
1221-
/// given operand description.
1223+
/// given operand description or operand index.
1224+
/// The operand index version provide more legality checks
12221225
bool isLegalRegOperand(const MachineRegisterInfo &MRI,
12231226
const MCOperandInfo &OpInfo,
12241227
const MachineOperand &MO) const;
1225-
1228+
bool isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
1229+
const MachineOperand &MO) const;
12261230
/// Legalize operands in \p MI by either commuting it or inserting a
12271231
/// copy of src1.
12281232
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const;

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,9 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
335335
let FPDPRounding = 1 in {
336336
let Predicates = [Has16BitInsts, isGFX8Only] in {
337337
defm V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
338-
defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma>;
338+
let isCommutable = 1 in {
339+
defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma>;
340+
} // End isCommutable = 1
339341
} // End Predicates = [Has16BitInsts, isGFX8Only]
340342

341343
let SubtargetPredicate = isGFX9Plus in {
@@ -639,8 +641,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
639641
defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
640642
defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>;
641643

642-
defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
643-
defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
644+
let isCommutable = 1 in {
645+
defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
646+
defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
647+
} // End isCommutable = 1
644648

645649
defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>;
646650
defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>;
@@ -1254,8 +1258,9 @@ let SubtargetPredicate = isGFX10Plus in {
12541258
def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
12551259
def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
12561260
}
1257-
1258-
defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>;
1261+
let isCommutable = 1 in {
1262+
defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>;
1263+
} // End isCommutable = 1
12591264
defm V_SUB_NC_U16 : VOP3Inst_t16 <"v_sub_nc_u16", VOP_I16_I16_I16, sub>;
12601265

12611266
} // End SubtargetPredicate = isGFX10Plus

llvm/test/CodeGen/AMDGPU/carryout-selection.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
355355
; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
356356
; GFX1010-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
357357
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
358-
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0, 0x1234, s2
358+
; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0x1234, 0, s2
359359
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
360360
; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
361361
; GFX1010-NEXT: s_endpgm
@@ -365,7 +365,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
365365
; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
366366
; GFX1030W32-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
367367
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
368-
; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
368+
; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s2
369369
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
370370
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
371371
; GFX1030W32-NEXT: s_endpgm
@@ -375,7 +375,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
375375
; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
376376
; GFX1030W64-NEXT: v_add_co_u32 v0, s[2:3], 0x56789876, v0
377377
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
378-
; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[2:3]
378+
; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3]
379379
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
380380
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
381381
; GFX1030W64-NEXT: s_endpgm
@@ -387,7 +387,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
387387
; GFX11-NEXT: v_mov_b32_e32 v2, 0
388388
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
389389
; GFX11-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
390-
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
390+
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s2
391391
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
392392
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
393393
; GFX11-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/cmp_shrink.mir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,6 @@ name: not_shrink_icmp
77
body: |
88
bb.0:
99
; GCN-LABEL: name: not_shrink_icmp
10-
; GCN: S_CMP_GT_I32 1, 65, implicit-def $scc
10+
; GCN: S_CMP_LT_I32 65, 1, implicit-def $scc
1111
S_CMP_GT_I32 1, 65, implicit-def $scc
1212
...

0 commit comments

Comments
 (0)