Skip to content

Commit c128818

Browse files
committed
[AMDGPU] Fold uniform readfirstlane + cndmask
(Alternative patch for llvm#69703) Teach SIFoldOperands to fold the s/zext DAGISel pattern that always emits a CNDMASK + READFIRSTLANE, even for uniform comparisons. Fixes llvm#59869
1 parent bf3a981 commit c128818

File tree

5 files changed

+670
-110
lines changed

5 files changed

+670
-110
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ class SIFoldOperands : public MachineFunctionPass {
104104
bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
105105
bool tryFoldFoldableCopy(MachineInstr &MI,
106106
MachineOperand *&CurrentKnownM0Val) const;
107+
bool tryFoldUniformReadFirstLaneCndMask(MachineInstr &MI) const;
107108

108109
const MachineOperand *isClamp(const MachineInstr &MI) const;
109110
bool tryFoldClamp(MachineInstr &MI);
@@ -1400,6 +1401,88 @@ bool SIFoldOperands::tryFoldFoldableCopy(
14001401
return Changed;
14011402
}
14021403

1404+
// Try to fold the following pattern:
1405+
// s_cselect s[2:3], K, 0 ; K has LSB set. Usually it's +-1.
1406+
// v_cndmask v0, 0, +-1, s[2:3]
1407+
// v_readfirstlane s0, v0
1408+
//
1409+
// into (for example)
1410+
//
1411+
// s_cselect s[2:3], K, 0
1412+
// s_bfe_u64 s0, s[2:3], 0x10000
1413+
bool SIFoldOperands::tryFoldUniformReadFirstLaneCndMask(
1414+
MachineInstr &MI) const {
1415+
if (MI.getOpcode() != AMDGPU::V_READFIRSTLANE_B32)
1416+
return false;
1417+
1418+
MachineInstr *RFLSrc = MRI->getVRegDef(MI.getOperand(1).getReg());
1419+
// We can also have the following pattern:
1420+
//
1421+
// %2:vreg_64 = REG_SEQUENCE %X:vgpr_32, sub0, %1:sreg_32, sub1
1422+
// %3:sgpr_32 = V_READFIRSTLANE_B32 %2.sub0:vreg_64
1423+
//
1424+
// In this case we dig into %X or %Y depending on which sub register
1425+
// the V_READFIRSTLANE accesses.
1426+
if (RFLSrc->isRegSequence()) {
1427+
unsigned RFLSubReg = MI.getOperand(1).getSubReg();
1428+
if (RFLSrc->getNumOperands() != 5)
1429+
return false;
1430+
1431+
if (RFLSrc->getOperand(2).getImm() == RFLSubReg)
1432+
RFLSrc = MRI->getVRegDef(RFLSrc->getOperand(1).getReg());
1433+
else if (RFLSrc->getOperand(4).getImm() == RFLSubReg)
1434+
RFLSrc = MRI->getVRegDef(RFLSrc->getOperand(3).getReg());
1435+
else
1436+
return false;
1437+
}
1438+
1439+
// Need e64 to have a SGPR regmask.
1440+
if (!RFLSrc || RFLSrc->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
1441+
return false;
1442+
1443+
MachineOperand *Src0 = TII->getNamedOperand(*RFLSrc, AMDGPU::OpName::src0);
1444+
MachineOperand *Src1 = TII->getNamedOperand(*RFLSrc, AMDGPU::OpName::src1);
1445+
Register Src2 = TII->getNamedOperand(*RFLSrc, AMDGPU::OpName::src2)->getReg();
1446+
1447+
if (!Src0->isImm() || Src0->getImm() != 0 || !Src1->isImm())
1448+
return false;
1449+
1450+
// This pattern usually comes from a ext. sext uses -1.
1451+
bool IsSigned = false;
1452+
if (Src1->getImm() == -1)
1453+
IsSigned = true;
1454+
else if (Src1->getImm() != 1)
1455+
return false;
1456+
1457+
MachineInstr *CSel = MRI->getVRegDef(Src2);
1458+
if (!CSel || (CSel->getOpcode() != AMDGPU::S_CSELECT_B32 &&
1459+
CSel->getOpcode() != AMDGPU::S_CSELECT_B64))
1460+
return false;
1461+
1462+
MachineOperand *CSelSrc0 = TII->getNamedOperand(*CSel, AMDGPU::OpName::src0);
1463+
MachineOperand *CSelSrc1 = TII->getNamedOperand(*CSel, AMDGPU::OpName::src1);
1464+
// Note: we could also allow any non-zero value for CSelSrc0, and adapt the
1465+
// BFE's mask depending on where the first set bit is.
1466+
if (!CSelSrc0->isImm() || (CSelSrc0->getImm() & 1) == 0 ||
1467+
!CSelSrc1->isImm() || CSelSrc1->getImm() != 0)
1468+
return false;
1469+
1470+
// Replace the V_CNDMASK with S_BFE.
1471+
unsigned BFEOpc = (IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32);
1472+
1473+
// If the CSELECT writes to a 64 bit SGPR, only pick the low bits.
1474+
unsigned SubReg = 0;
1475+
if (CSel->getOpcode() == AMDGPU::S_CSELECT_B64)
1476+
SubReg = AMDGPU::sub0;
1477+
1478+
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(BFEOpc),
1479+
MI.getOperand(0).getReg())
1480+
.addReg(Src2, /*Flags*/ 0, SubReg)
1481+
.addImm(0x10000);
1482+
MI.eraseFromParent();
1483+
return true;
1484+
}
1485+
14031486
// Clamp patterns are canonically selected to v_max_* instructions, so only
14041487
// handle them.
14051488
const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
@@ -2087,6 +2170,11 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
20872170
continue;
20882171
}
20892172

2173+
if (tryFoldUniformReadFirstLaneCndMask(MI)) {
2174+
Changed = true;
2175+
continue;
2176+
}
2177+
20902178
// Saw an unknown clobber of m0, so we no longer know what it is.
20912179
if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
20922180
CurrentKnownM0Val = nullptr;

llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll

Lines changed: 36 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1536,8 +1536,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
15361536
; SI-NEXT: s_or_b32 s2, s5, s2
15371537
; SI-NEXT: s_cmp_lg_u32 s2, 0
15381538
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
1539-
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
1540-
; SI-NEXT: v_readfirstlane_b32 s2, v1
1539+
; SI-NEXT: s_bfe_u32 s2, s4, 0x10000
15411540
; SI-NEXT: s_bfe_u32 s5, s3, 0xb0014
15421541
; SI-NEXT: s_or_b32 s2, s6, s2
15431542
; SI-NEXT: s_sub_i32 s6, 0x3f1, s5
@@ -1599,8 +1598,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
15991598
; VI-NEXT: s_or_b32 s0, s1, s6
16001599
; VI-NEXT: s_cmp_lg_u32 s0, 0
16011600
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
1602-
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1603-
; VI-NEXT: v_readfirstlane_b32 s0, v2
1601+
; VI-NEXT: s_bfe_u32 s0, s0, 0x10000
16041602
; VI-NEXT: s_bfe_u32 s1, s7, 0xb0014
16051603
; VI-NEXT: v_mov_b32_e32 v0, s4
16061604
; VI-NEXT: s_or_b32 s4, s2, s0
@@ -1661,8 +1659,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
16611659
; GFX9-NEXT: s_or_b32 s0, s1, s6
16621660
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
16631661
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
1664-
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
1665-
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
1662+
; GFX9-NEXT: s_bfe_u32 s0, s0, 0x10000
16661663
; GFX9-NEXT: s_bfe_u32 s1, s7, 0xb0014
16671664
; GFX9-NEXT: s_or_b32 s6, s2, s0
16681665
; GFX9-NEXT: s_sub_i32 s2, 0x3f1, s1
@@ -1714,63 +1711,61 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
17141711
; GFX11-NEXT: s_clause 0x1
17151712
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
17161713
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
1714+
; GFX11-NEXT: v_mov_b32_e32 v1, 0
17171715
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
17181716
; GFX11-NEXT: s_and_b32 s1, s7, 0x1ff
17191717
; GFX11-NEXT: s_lshr_b32 s2, s7, 8
17201718
; GFX11-NEXT: s_or_b32 s1, s1, s6
17211719
; GFX11-NEXT: s_and_b32 s2, s2, 0xffe
17221720
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
17231721
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
1724-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1725-
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
1726-
; GFX11-NEXT: s_bfe_u32 s1, s7, 0xb0014
1727-
; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s1
1728-
; GFX11-NEXT: s_addk_i32 s1, 0xfc10
1729-
; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
1730-
; GFX11-NEXT: v_readfirstlane_b32 s3, v0
1731-
; GFX11-NEXT: s_lshl_b32 s8, s1, 12
1732-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1733-
; GFX11-NEXT: v_readfirstlane_b32 s6, v1
1734-
; GFX11-NEXT: s_or_b32 s2, s2, s3
1735-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1736-
; GFX11-NEXT: s_or_b32 s3, s2, 0x1000
1737-
; GFX11-NEXT: s_or_b32 s8, s2, s8
1738-
; GFX11-NEXT: s_lshr_b32 s6, s3, s6
1739-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1740-
; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, s6
1741-
; GFX11-NEXT: v_mov_b32_e32 v1, 0
1742-
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s3, v0
1722+
; GFX11-NEXT: s_bfe_u32 s3, s7, 0xb0014
1723+
; GFX11-NEXT: s_bfe_u32 s1, s1, 0x10000
1724+
; GFX11-NEXT: s_sub_i32 s6, 0x3f1, s3
1725+
; GFX11-NEXT: s_or_b32 s1, s2, s1
1726+
; GFX11-NEXT: v_med3_i32 v0, s6, 0, 13
1727+
; GFX11-NEXT: s_or_b32 s2, s1, 0x1000
1728+
; GFX11-NEXT: s_addk_i32 s3, 0xfc10
1729+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1730+
; GFX11-NEXT: s_lshl_b32 s8, s3, 12
1731+
; GFX11-NEXT: v_readfirstlane_b32 s6, v0
1732+
; GFX11-NEXT: s_or_b32 s8, s1, s8
1733+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1734+
; GFX11-NEXT: s_lshr_b32 s6, s2, s6
1735+
; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s6
1736+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1737+
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s2, v0
17431738
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
1744-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1745-
; GFX11-NEXT: v_readfirstlane_b32 s3, v0
1746-
; GFX11-NEXT: s_or_b32 s3, s6, s3
1747-
; GFX11-NEXT: s_cmp_lt_i32 s1, 1
1748-
; GFX11-NEXT: s_cselect_b32 s3, s3, s8
1749-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1750-
; GFX11-NEXT: s_and_b32 s6, s3, 7
1739+
; GFX11-NEXT: v_readfirstlane_b32 s2, v0
1740+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
1741+
; GFX11-NEXT: s_or_b32 s2, s6, s2
1742+
; GFX11-NEXT: s_cmp_lt_i32 s3, 1
1743+
; GFX11-NEXT: s_cselect_b32 s2, s2, s8
1744+
; GFX11-NEXT: s_and_b32 s6, s2, 7
1745+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
17511746
; GFX11-NEXT: s_cmp_gt_i32 s6, 5
17521747
; GFX11-NEXT: s_cselect_b32 s8, -1, 0
17531748
; GFX11-NEXT: s_cmp_eq_u32 s6, 3
17541749
; GFX11-NEXT: s_cselect_b32 s6, -1, 0
1755-
; GFX11-NEXT: s_lshr_b32 s3, s3, 2
1750+
; GFX11-NEXT: s_lshr_b32 s2, s2, 2
17561751
; GFX11-NEXT: s_or_b32 s6, s6, s8
17571752
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
17581753
; GFX11-NEXT: s_cmp_lg_u32 s6, 0
1759-
; GFX11-NEXT: s_addc_u32 s3, s3, 0
1760-
; GFX11-NEXT: s_cmp_lt_i32 s1, 31
1761-
; GFX11-NEXT: s_cselect_b32 s3, s3, 0x7c00
1762-
; GFX11-NEXT: s_cmp_lg_u32 s2, 0
1763-
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
1764-
; GFX11-NEXT: s_cmpk_eq_i32 s1, 0x40f
1765-
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
1754+
; GFX11-NEXT: s_addc_u32 s2, s2, 0
1755+
; GFX11-NEXT: s_cmp_lt_i32 s3, 31
1756+
; GFX11-NEXT: s_cselect_b32 s2, s2, 0x7c00
1757+
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
1758+
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
1759+
; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x40f
1760+
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
17661761
; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
17671762
; GFX11-NEXT: s_lshr_b32 s1, s7, 16
17681763
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17691764
; GFX11-NEXT: s_and_b32 s1, s1, 0x8000
17701765
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 9, v0
17711766
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17721767
; GFX11-NEXT: v_or_b32_e32 v0, 0x7c00, v0
1773-
; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo
1768+
; GFX11-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
17741769
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
17751770
; GFX11-NEXT: v_or_b32_e32 v0, s1, v0
17761771
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0

0 commit comments

Comments
 (0)