[AMDGPU] Add commute for some VOP3 inst #121326

Shoreshen · 2024-12-30T08:44:06Z

add commute for some VOP3 inst, allow commute for both inline constant operand, adjust tests

…t operand, adjust tests

github-actions · 2024-12-30T08:44:26Z

Thank you for submitting a Pull Request (PR) to the LLVM Project!

This PR will be automatically labeled and the relevant teams will be notified.

If you wish to, you can add reviewers by using the "Reviewers" section on this page.

If this is not working for you, it is probably because you do not have write permissions for the repository. In which case you can instead tag reviewers by name in a comment by using @ followed by their GitHub username.

If you have received no comments on your PR for a week, you can request a review by "ping"ing the PR by adding a comment “Ping”. The common courtesy "ping" rate is once a week. Please remember that you are asking for valuable time from other developers.

If you have further questions, they may be answered by the LLVM GitHub User Guide.

You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums.

llvmbot · 2024-12-30T08:45:00Z

@llvm/pr-subscribers-backend-amdgpu

Author: None (Shoreshen)

Changes

add commute for some VOP3 inst, allow commute for both inline constant operand, adjust tests

Referring #111205

Full diff: https://github.com/llvm/llvm-project/pull/121326.diff

9 Files Affected:

(modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+17)
(modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+6-3)
(modified) llvm/test/CodeGen/AMDGPU/commute-op-sel.mir (+1-2)
(modified) llvm/test/CodeGen/AMDGPU/ctlz.ll (+2-2)
(modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+4-4)
(modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll (+16-16)
(modified) llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll (+8-8)
(modified) llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll (+1-1)
(modified) llvm/test/lit.cfg.py (+1-1)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f97ea40caa6704..a7a384a3dbf3d3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2749,6 +2749,20 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
   return &MI;
 }
 
+static MachineInstr *swapInlineConstOperands(MachineInstr &MI,
+                                             MachineOperand &NonRegOp1,
+                                             MachineOperand &NonRegOp2) {
+
+  auto TargetFlags = NonRegOp1.getTargetFlags();
+  auto NonRegVal = NonRegOp1.getImm();
+
+  NonRegOp1.setImm(NonRegOp2.getImm());
+  NonRegOp2.setImm(NonRegVal);
+  NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
+  NonRegOp2.setTargetFlags(TargetFlags);
+  return &MI;
+}
+
 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                   unsigned Src0Idx,
                                                   unsigned Src1Idx) const {
@@ -2785,6 +2799,9 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
   } else if (!Src0.isReg() && Src1.isReg()) {
     if (isOperandLegal(MI, Src1Idx, &Src0))
       CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
+  } else if (isInlineConstant(Src0) && isInlineConstant(Src1)) {
+    if (isOperandLegal(MI, Src1Idx, &Src0))
+      CommutedMI = swapInlineConstOperands(MI, Src0, Src1);
   } else {
     // FIXME: Found two non registers to commute. This does happen.
     return nullptr;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 22e457674c07a1..a01fcf308c83ba 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -335,6 +335,7 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
 let FPDPRounding = 1 in {
   let Predicates = [Has16BitInsts, isGFX8Only] in {
     defm V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
+    let isCommutable = 1 in
     defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma>;
   } // End Predicates = [Has16BitInsts, isGFX8Only]
 
@@ -639,8 +640,10 @@ let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
 defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>;
 defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>;
 
-defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
-defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
+let isCommutable = 1 in {
+  defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
+  defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
+} // End isCommutable = 1
 
 defm V_CVT_PKNORM_I16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_i16_f16", VOP_B32_F16_F16>;
 defm V_CVT_PKNORM_U16_F16 : VOP3Inst_t16 <"v_cvt_pknorm_u16_f16", VOP_B32_F16_F16>;
@@ -1254,7 +1257,7 @@ let SubtargetPredicate = isGFX10Plus in {
     def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
     def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
   }
-
+  let isCommutable = 1 in
   defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>;
   defm V_SUB_NC_U16 : VOP3Inst_t16 <"v_sub_nc_u16", VOP_I16_I16_I16, sub>;
 
diff --git a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
index b9397f9d5d4ddc..01595ce04313ca 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
+++ b/llvm/test/CodeGen/AMDGPU/commute-op-sel.mir
@@ -2,8 +2,7 @@
 
 # GCN-LABEL: name: test_machine_cse_op_sel
 # GCN: %2:vgpr_32 = V_ADD_NC_U16_e64 0, %0, 0, %1, 1, 0, implicit $mode, implicit $exec
-# GCN: %3:vgpr_32 = V_ADD_NC_U16_e64 0, %1, 0, %0, 1, 0, implicit $mode, implicit $exec
-# GCN: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %3, 0, 1, 0, implicit $exec
+# GCN: DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %2, %2, 0, 1, 0, implicit $exec
 ---
 name: test_machine_cse_op_sel
 body: |
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 3019d4d298eb45..b4d450a90d5950 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -1566,7 +1566,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffe8
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, 0xffe8, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
@@ -1807,7 +1807,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffe7
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, 0xffe7, v1
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index b897e1feed5d56..fec020a296b9b4 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1657,8 +1657,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
-; GFX10-NEXT:    v_add_nc_u16 v1, v1, 0x900
-; GFX10-NEXT:    v_add_nc_u16 v5, v2, 0x900
+; GFX10-NEXT:    v_add_nc_u16 v1, 0x900, v1
+; GFX10-NEXT:    v_add_nc_u16 v5, 0x900, v2
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
@@ -1723,10 +1723,10 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT:    v_add_nc_u16 v2, v2, 0x900
+; GFX11-NEXT:    v_add_nc_u16 v2, 0x900, v2
 ; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_nc_u16 v1, v1, 0x900
+; GFX11-NEXT:    v_add_nc_u16 v1, 0x900, v1
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v2
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index f416131e3d3140..480d978fa530b4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -397,7 +397,7 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, -1, v0
-; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT:    v_add_nc_u16 v1, 0xff80, v1
 ; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f00, v1
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
@@ -408,7 +408,7 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, -1, v0
-; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT:    v_add_nc_u16 v1, 0xff80, v1
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f00, v1
 ; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -462,7 +462,7 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT:    v_add_nc_u16 v1, 0xff80, v1
 ; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f00, v1
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
@@ -473,7 +473,7 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT:    v_add_nc_u16 v1, 0xff80, v1
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f00, v1
 ; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -1348,7 +1348,7 @@ define i1 @isnormal_bf16(bfloat %x) {
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
+; GFX10CHECK-NEXT:    v_add_nc_u16 v0, 0xff80, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1357,7 +1357,7 @@ define i1 @isnormal_bf16(bfloat %x) {
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
+; GFX11CHECK-NEXT:    v_add_nc_u16 v0, 0xff80, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1402,7 +1402,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
 ; GFX10CHECK:       ; %bb.0:
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
+; GFX10CHECK-NEXT:    v_add_nc_u16 v0, 0xff80, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1411,7 +1411,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
 ; GFX11CHECK:       ; %bb.0:
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
+; GFX11CHECK-NEXT:    v_add_nc_u16 v0, 0xff80, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1464,7 +1464,7 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
-; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT:    v_add_nc_u16 v1, 0xff80, v1
 ; GFX10CHECK-NEXT:    v_cmp_lt_u16_e64 s4, 0x7eff, v1
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
@@ -1475,7 +1475,7 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0, v0
-; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT:    v_add_nc_u16 v1, 0xff80, v1
 ; GFX11CHECK-NEXT:    v_cmp_lt_u16_e64 s0, 0x7eff, v1
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -1529,7 +1529,7 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, -1, v0
-; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX10CHECK-NEXT:    v_add_nc_u16 v1, 0xff80, v1
 ; GFX10CHECK-NEXT:    v_cmp_lt_u16_e64 s4, 0x7eff, v1
 ; GFX10CHECK-NEXT:    s_or_b32 s4, s4, vcc_lo
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
@@ -1540,7 +1540,7 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, -1, v0
-; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v1, 0xff80
+; GFX11CHECK-NEXT:    v_add_nc_u16 v1, 0xff80, v1
 ; GFX11CHECK-NEXT:    v_cmp_lt_u16_e64 s0, 0x7eff, v1
 ; GFX11CHECK-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -2569,7 +2569,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s4, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e64 s5, 0x7f80, v0
-; GFX10CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
+; GFX10CHECK-NEXT:    v_add_nc_u16 v0, 0xff80, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s6, 0x7f, v1
 ; GFX10CHECK-NEXT:    s_and_b32 s4, s4, vcc_lo
 ; GFX10CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
@@ -2587,7 +2587,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s0, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e64 s1, 0x7f80, v0
-; GFX11CHECK-NEXT:    v_add_nc_u16 v0, v0, 0xff80
+; GFX11CHECK-NEXT:    v_add_nc_u16 v0, 0xff80, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s2, 0x7f, v1
 ; GFX11CHECK-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0
@@ -2669,7 +2669,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
 ; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10CHECK-NEXT:    v_add_nc_u16 v1, v0, -1
-; GFX10CHECK-NEXT:    v_add_nc_u16 v2, v0, 0xff80
+; GFX10CHECK-NEXT:    v_add_nc_u16 v2, 0xff80, v0
 ; GFX10CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
 ; GFX10CHECK-NEXT:    v_cmp_lt_i16_e64 s5, 0x7fbf, v0
 ; GFX10CHECK-NEXT:    v_cmp_gt_u16_e64 s4, 0x7f, v1
@@ -2685,7 +2685,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
 ; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX11CHECK-NEXT:    v_add_nc_u16 v1, v0, -1
-; GFX11CHECK-NEXT:    v_add_nc_u16 v2, v0, 0xff80
+; GFX11CHECK-NEXT:    v_add_nc_u16 v2, 0xff80, v0
 ; GFX11CHECK-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0
 ; GFX11CHECK-NEXT:    v_cmp_lt_i16_e64 s1, 0x7fbf, v0
 ; GFX11CHECK-NEXT:    v_cmp_gt_u16_e64 s0, 0x7f, v1
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index a1a466fb04440d..22996eda955be0 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -1327,7 +1327,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffc0
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, 0xffc0, v1
 ; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
@@ -1353,7 +1353,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_load_u16 v1, v0, s[2:3]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffc0
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, 0xffc0, v1
 ; GFX11-GISEL-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1486,7 +1486,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    global_load_ushort v1, v1, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffc0
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, 0xffc0, v1
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
@@ -1517,7 +1517,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffc0
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, 0xffc0, v1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -1686,8 +1686,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffc0
-; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, v2, 0xffc0
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, 0xffc0, v1
+; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, 0xffc0, v2
 ; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-GISEL-NEXT:    global_store_short v0, v2, s[0:1]
@@ -1724,8 +1724,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    global_load_u16 v2, v0, s[2:3] glc dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, v1, 0xffc0
-; GFX11-GISEL-NEXT:    v_add_nc_u16 v2, v2, 0xffc0
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v1, 0xffc0, v1
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v2, 0xffc0, v2
 ; GFX11-GISEL-NEXT:    global_store_b16 v0, v1, s[0:1] dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT:    global_store_b16 v0, v2, s[0:1] dlc
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index dd03fb62b8ebb0..82fae44e208186 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -397,7 +397,7 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u16 v0, v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v2, v0, 0x3e7
+; GFX11-NEXT:    v_add_nc_u16 v2, 0x3e7, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 5a03a85386e0aa..9839f823ac9f41 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -463,7 +463,7 @@ def have_cxx_shared_library():
         print("could not exec llvm-readobj")
         return False
 
-    readobj_out = readobj_cmd.stdout.read().decode("ascii")
+    readobj_out = readobj_cmd.stdout.read().decode("utf-8")
     readobj_cmd.wait()
 
     regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)")

…ructions

easyonaadit · 2024-12-31T07:54:12Z

I think it would be better to add the [AMDGPU] tag in the title.

Shoreshen · 2024-12-31T09:36:12Z

I think it would be better to add the [AMDGPU] tag in the title.

Thanks~~ Done :)

…ibute-to-VOP3-instructions

…ructions

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

arsenm · 2025-01-02T13:24:11Z

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+  auto TargetFlags = NonRegOp1.getTargetFlags();
+  auto NonRegVal = NonRegOp1.getImm();


arsenm · 2025-01-02T13:28:48Z

llvm/lib/Target/AMDGPU/VOP3Instructions.td

@@ -1254,7 +1257,7 @@ let SubtargetPredicate = isGFX10Plus in {
    def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64, vt>;
    def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64, vt>;
  }
-
+  let isCommutable = 1 in


arsenm · 2025-01-02T13:29:07Z

llvm/test/CodeGen/AMDGPU/commute-op-sel.mir

+    %1:vgpr_32 = V_ADD_NC_U16_e64 0, 64, 0, -3, 1, 0, implicit $mode, implicit $exec
+    %2:vgpr_32 = V_ADD_NC_U16_e64 0, -3, 0, 64, 1, 0, implicit $mode, implicit $exec
+    DS_WRITE2_B32_gfx9 undef %4:vgpr_32, %1, %2, 0, 1, 0, implicit $exec
+...


There should be a dedicated commute test for every changed opcode like this. What about the _e32 forms?

Hi, for V_ADD_NC_U16 no e32 instruction was created, from ISA it is only listed in VOP3 encoding in section "15.3.4. VOP3
"

arsenm · 2025-01-02T13:29:37Z

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+  } else if (isInlineConstant(Src0) && isInlineConstant(Src1)) {
+    if (isOperandLegal(MI, Src1Idx, &Src0))


At least one of these isInlineConstant checks is redundant with the isOperandLegal check

Hi @arsenm , sorry for replying late, it took me long time to investigate the isOperandLegal function.

By testing, one of the operand is inline constant and isOperandLegal(0MI, Src1Idx, &Src0) == true does not imply the other operand is inline constant
while src0 is true:

while src1 is ture:

It is true for VOP that if Src1 is inline constant, then Src0 is also has to be an inline constant. But the reason for that is if OpIdx does not consist with MO, the MO will count twice for LiteralLimit

I think this may be not consistent with ISA on section "6.12.1 Instruction Inputs" saying:

At most one literal constant can be used, and only when an SGPR or M0 is not used as a source

BTW, the isOperandLegal check may also be necessary, because some of the operand type cannot use inline constant, e.g. second operand of V_ADD_CO_U32_e32, the operand type of it is defined as OPERAND_REGISTER in the tablegen file

Co-authored-by: Matt Arsenault <[email protected]>

…ructions

arsenm · 2025-01-02T16:15:22Z

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

@@ -2798,7 +2798,8 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
  } else if (!Src0.isReg() && Src1.isReg()) {
    if (isOperandLegal(MI, Src1Idx, &Src0))
      CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
-  } else if (isInlineConstant(Src0) && isInlineConstant(Src1)) {
+  } else if (isInlineConstant(Src1)) {
+    // If Src1 is inline constant and Src0 is not, then isOperandLegal rejects


This isn't explaining why to do this, but this is also an API flaw that's always been there. We need an isOperandLegal that doesn't account for the context of the other operands

Or rather, one that takes the full set of operands that need to be considered for the result instruction

Hi @arsenm

This isn't explaining why to do this, but this is also an API flaw that's always been there. We need an isOperandLegal that doesn't account for the context of the other operands

isOperandLegal also check the literal and constant bus (literal or SGPR) limit, I think maybe it is better to separate them from the function. It will also cause some in-consistency with ISA (e.g. reject instructions with 1 literal and 1 imm) if the OpIdx and MO's index are different

Or rather, one that takes the full set of operands that need to be considered for the result instruction

We can do that, but to create the result instruction, we actually swapped the operands on the original instruction too. So if I do that and failed the isOperandLegal check, I would need to swap it back

Or rather, one that takes the full set of operands that need to be considered for the result instruction

Hi @arsenm , I tried to check the swapped instruction with isOperandLegal, it fails 100+ cases. The following is the change:

MachineInstr* SIInstrInfo::swapOperands(MachineInstr &MI, bool NewMI, unsigned Src0Idx, unsigned Src1Idx, MachineOperand &Src0, MachineOperand &Src1) const { MachineInstr *CommutedMI = nullptr; if (Src0.isReg() && Src1.isReg()) { // Be sure to copy the source modifiers to the right place. CommutedMI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); } else if (Src0.isReg() && !Src1.isReg()) { CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); } else if (!Src0.isReg() && Src1.isReg()) { CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); } else if (Src0.isImm() && Src1.isImm()) { CommutedMI = swapImmOperands(MI, Src0, Src1); } return CommutedMI; } MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned Src0Idx, unsigned Src1Idx) const { assert(!NewMI && "this should never be used"); unsigned Opc = MI.getOpcode(); int CommutedOpcode = commuteOpcode(Opc); if (CommutedOpcode == -1) return nullptr; if (Src0Idx > Src1Idx) std::swap(Src0Idx, Src1Idx); assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == static_cast<int>(Src0Idx) && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == static_cast<int>(Src1Idx) && "inconsistency with findCommutedOpIndices"); MachineOperand &Src0 = MI.getOperand(Src0Idx); MachineOperand &Src1 = MI.getOperand(Src1Idx); MachineInstr *CommutedMI = swapOperands(MI, NewMI, Src0Idx, Src1Idx, Src0, Src1); if (!CommutedMI) return nullptr; if (!isOperandLegal(*CommutedMI, Src1Idx, &CommutedMI->getOperand(Src1Idx))) { // swap back if failed check swapOperands(MI, NewMI, Src0Idx, Src1Idx, Src0, Src1); return nullptr; } if (CommutedMI) { swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, Src1, AMDGPU::OpName::src1_modifiers); swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1, AMDGPU::OpName::src1_sel); CommutedMI->setDesc(get(CommutedOpcode)); } return CommutedMI; }

The reason for that is the VALU instructions with literal in the first input operands.

The commuteInstruction function was called during the shrink instruction pass, and mismatched OpIdx and MO (first operand) will be the parameter of isOperandLegal. It return false because MO was counted for 2 times of literal limit (explained in the other reply).

Now if using the swapped instruction, the mismatch no longer exists, then it will move all literal constant to the second input operand for all VALU instructions.

Shall I fix all the cases in this PR?? or fix them in a new issue??

…ructions

…ibute-to-VOP3-instructions

…ructions

arsenm · 2025-01-08T13:34:21Z

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+  // No need to check 64bit literals since swapping does not bring new 
+  // 64bit literals into current instruction to fold to 32bit


Suggested change

// No need to check 64bit literals since swapping does not bring new

// 64bit literals into current instruction to fold to 32bit

// No need to check 64-bit literals since swapping does not bring new

// 64-bit literals into current instruction to fold to 32-bit

arsenm · 2025-01-08T13:34:29Z

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+    Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0X);
+  }
+
+  // Swap doesn't breach constantbus or literal limits


Suggested change

// Swap doesn't breach constantbus or literal limits

// Swap doesn't breach constant bus or literal limits

arsenm · 2025-01-08T13:34:44Z

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+  // Swap doesn't breach constantbus or literal limits
+  // It may move literal to position other than src0, this is not allowed pre-gfx10
+  // However, most test cases need literals in Src0 for VOP
+  // FIX-ME: After gfx9, literal can be in place other than Src0


Suggested change

// FIX-ME: After gfx9, literal can be in place other than Src0

// FIXME: After gfx9, literal can be in place other than Src0

arsenm · 2025-01-08T13:35:41Z

llvm/test/CodeGen/AMDGPU/commute-op-sel.mir

+    %1:vgpr_32 = nofpexcept V_FMA_F16_e64 0, 1, 0, 3481272320, 0, %0, 0, 0, implicit $mode, implicit $exec
+    %2:vgpr_32 = nofpexcept V_FMA_F16_e64 0, 3481272320, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE2_B32_gfx9 undef %3:vgpr_32, %1, %2, 0, 1, 0, implicit $exec
+...


Test the src0X cases? I'm not familiar with those

Hi @arsenm , if I'm not wrong, there seems no commutable instruction with input operand named Src0X........

That would be the VOPD forms of instructions, not freestanding instruction definitions

…ibute-to-VOP3-instructions

arsenm · 2025-01-10T06:53:24Z

llvm/test/CodeGen/AMDGPU/commute-op-sel.mir

+    %1:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec
+    %2:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, 64, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec


Drop nofpexcept, it won't do anything for V_MAD_U32_U16_e64. This also doesn't read mode, I'm surprised the verifier doesn't catch this

arsenm · 2025-01-10T06:53:42Z

llvm/test/CodeGen/AMDGPU/commute-op-sel.mir

+...
+
+---
+name: test_machine_cse_op_v_MAD_u16


Inconsistent capitalization of function name

…ibute-to-VOP3-instructions

…ructions

…ibute-to-VOP3-instructions

github-actions · 2025-01-20T01:25:13Z

✅ With the latest revision this PR passed the C/C++ code formatter.

arsenm · 2025-01-20T16:05:14Z

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+  bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
+  if (IsAGPR && !ST.hasMAIInsts())
+    return false;
+  if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&


This !MRI.reservedRegsFrozen() doesn't make any sense but all of this code looks copied directly from existing code

Yep, I just wrap it from the isoperandlegal function to avoid duplicate code. Should I try to remove it??

arsenm · 2025-01-20T16:08:28Z

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+  if (Src0Idx == -1) {
+    // VOPD V_DUAL_* instructions use different operand names.
+    Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0X);


This VOPD case is still missing a test

Hi @arsenm , I re-searched all generated instruction, but cannot find a VOPD instruction that is commutable with operand names as $src0X...

Then remove this handling?

…ructions

arsenm · 2025-01-22T04:04:50Z

llvm/test/CodeGen/AMDGPU/cmp_shrink.mir

@@ -7,6 +7,6 @@ name:             not_shrink_icmp
 body:             |
  bb.0:
    ; GCN-LABEL: name: not_shrink_icmp
-    ; GCN: S_CMP_GT_I32 1, 65, implicit-def $scc
+    ; GCN: S_CMP_LT_I32 65, 1, implicit-def $scc


Pointless swap but probably not this patch's problem

github-actions · 2025-01-22T04:08:48Z

@Shoreshen Congratulations on having your first Pull Request (PR) merged into the LLVM Project!

Your changes will be combined with recent changes from other authors, then tested by our build bots. If there is a problem with a build, you may receive a report in an email or a comment on this PR.

Please check whether problems have been caused by your change specifically, as the builds can include changes from many authors. It is not uncommon for your change to be included in a build that fails due to someone else's changes, or infrastructure issues.

How to do this, and the rest of the post-merge process, is covered in detail here.

If your change does cause a problem, it may be reverted, or you can revert it yourself. This is a normal part of LLVM development. You can fix your changes and open a new PR to merge them again.

If you don't get any reports, no action is required from you. Your changes are working as expected, well done!

add commute for some VOP3 inst, allow commute for both inline constan…

f0877bb

…t operand, adjust tests

llvmbot added the backend:AMDGPU label Dec 30, 2024

Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…

ae5f1e8

…ructions

Shoreshen changed the title ~~Add commute for some VOP3 inst~~ [AMDGPU] Add commute for some VOP3 inst Dec 31, 2024

Shoreshen added 5 commits December 31, 2024 23:15

Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…

df6903c

…ibute-to-VOP3-instructions

add inline constant case & merge main

ada83d6

Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…

c599af0

…ructions

fix lit change

3a83ae5

fix lit change

5c9d065

arsenm reviewed Jan 2, 2025

View reviewed changes

Shoreshen and others added 4 commits January 2, 2025 21:53

Update llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

d3f00dc

Co-authored-by: Matt Arsenault <[email protected]>

Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…

0e60298

…ructions

fix comments

707474f

fix

79219d1

arsenm reviewed Jan 2, 2025

View reviewed changes

Shoreshen added 4 commits January 4, 2025 13:49

Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…

288ead2

…ructions

Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…

4becc58

…ructions

Add legal check for swap

53b370a

Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…

0b746a8

…ructions

Shoreshen requested a review from arsenm January 6, 2025 13:52

Shoreshen added 6 commits January 7, 2025 16:43

Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…

785921f

…ructions

Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…

6955a86

…ibute-to-VOP3-instructions

add tests & merge_main

5e15e72

fix lit.cfg.py

004a82d

fix lit.cfg.py

dc2739f

Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…

378c02c

…ructions

arsenm reviewed Jan 8, 2025

View reviewed changes

Shoreshen added 2 commits January 9, 2025 10:47

Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…

49ae569

…ibute-to-VOP3-instructions

adjust comment & merge main

161a2b9

Shoreshen requested a review from arsenm January 10, 2025 02:05

arsenm reviewed Jan 10, 2025

View reviewed changes

Shoreshen added 5 commits January 10, 2025 17:32

Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…

4d569ae

…ibute-to-VOP3-instructions

adjust case & merge main

1689c1e

Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…

328e566

…ructions

fix inconsistent capitalization

9faf423

fix test case

19b8ad4

Shoreshen requested review from arsenm January 13, 2025 09:06

Shoreshen added 2 commits January 20, 2025 09:01

Merge remote-tracking branch 'origin/main' into Add-isCommutable-attr…

cc3a125

…ibute-to-VOP3-instructions

merge main

0a89dc9

fix format

d8e6cb7

arsenm approved these changes Jan 20, 2025

View reviewed changes

arsenm reviewed Jan 20, 2025

View reviewed changes

Shoreshen added 3 commits January 21, 2025 09:16

Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…

3c7bd89

…ructions

remove special handling for VOPD, since no commutable VOPD instruction

bf1da57

Merge branch 'llvm:main' into Add-isCommutable-attribute-to-VOP3-inst…

789092d

…ructions

Shoreshen mentioned this pull request Jan 21, 2025

Request Commit Access For Shoreshen #119686

Closed

Merge branch 'main' into Add-isCommutable-attribute-to-VOP3-instructions

6c35280

arsenm approved these changes Jan 22, 2025

View reviewed changes

arsenm merged commit 7c58d63 into llvm:main Jan 22, 2025
6 of 8 checks passed

Shoreshen deleted the Add-isCommutable-attribute-to-VOP3-instructions branch January 22, 2025 07:49

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AMDGPU] Add commute for some VOP3 inst #121326

[AMDGPU] Add commute for some VOP3 inst #121326

Shoreshen commented Dec 30, 2024 •

edited by arsenm

Loading

github-actions bot commented Dec 30, 2024

llvmbot commented Dec 30, 2024

easyonaadit commented Dec 31, 2024

Shoreshen commented Dec 31, 2024

arsenm Jan 2, 2025

arsenm Jan 2, 2025

arsenm Jan 2, 2025

Shoreshen Jan 3, 2025

arsenm Jan 2, 2025

Shoreshen Jan 3, 2025

Shoreshen Jan 3, 2025 •

edited

Loading

arsenm Jan 2, 2025

arsenm Jan 2, 2025

Shoreshen Jan 3, 2025

Shoreshen Jan 3, 2025 •

edited

Loading

arsenm Jan 8, 2025

arsenm Jan 8, 2025

arsenm Jan 8, 2025

arsenm Jan 8, 2025

Shoreshen Jan 9, 2025 •

edited

Loading

arsenm Jan 10, 2025

arsenm Jan 10, 2025

arsenm Jan 10, 2025

github-actions bot commented Jan 20, 2025 •

edited

Loading

arsenm Jan 20, 2025

Shoreshen Jan 20, 2025 •

edited

Loading

arsenm Jan 20, 2025

arsenm Jan 20, 2025

Shoreshen Jan 21, 2025

arsenm Jan 21, 2025

arsenm Jan 22, 2025

github-actions bot commented Jan 22, 2025

		auto TargetFlags = NonRegOp1.getTargetFlags();
		auto NonRegVal = NonRegOp1.getImm();

		} else if (isInlineConstant(Src0) && isInlineConstant(Src1)) {
		if (isOperandLegal(MI, Src1Idx, &Src0))

		// No need to check 64bit literals since swapping does not bring new
		// 64bit literals into current instruction to fold to 32bit

	// Swap doesn't breach constantbus or literal limits
	// Swap doesn't breach constant bus or literal limits

	// FIX-ME: After gfx9, literal can be in place other than Src0
	// FIXME: After gfx9, literal can be in place other than Src0

		%1:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, 1, 0, 64, 0, %0, 0, 0, implicit $mode, implicit $exec
		%2:vgpr_32 = nofpexcept V_MAD_U32_U16_e64 0, 64, 0, 1, 0, %0, 0, 0, implicit $mode, implicit $exec

[AMDGPU] Add commute for some VOP3 inst #121326

[AMDGPU] Add commute for some VOP3 inst #121326

Conversation

Shoreshen commented Dec 30, 2024 • edited by arsenm Loading

github-actions bot commented Dec 30, 2024

llvmbot commented Dec 30, 2024

easyonaadit commented Dec 31, 2024

Shoreshen commented Dec 31, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Shoreshen Jan 3, 2025 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Shoreshen Jan 3, 2025 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Shoreshen Jan 9, 2025 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

github-actions bot commented Jan 20, 2025 • edited Loading

Choose a reason for hiding this comment

Shoreshen Jan 20, 2025 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

github-actions bot commented Jan 22, 2025

Shoreshen commented Dec 30, 2024 •

edited by arsenm

Loading

Shoreshen Jan 3, 2025 •

edited

Loading

Shoreshen Jan 3, 2025 •

edited

Loading

Shoreshen Jan 9, 2025 •

edited

Loading

github-actions bot commented Jan 20, 2025 •

edited

Loading

Shoreshen Jan 20, 2025 •

edited

Loading