Skip to content

Commit 520757c

Browse files
committed
[AMDGPU][SIFoldOperands] Fold some redundant bitmasks
Instructions like shifts only read some of the bits of the shift amount operand, between 4 and 6 bits. If the source operand is being masked, we can just ignore the mask. Effects are minimal right now but this will kick in more once we disable uniform i16 operation widening in CGP. With that disabled, we get more i16 shift amounts that are zext'd and without this we'd end up with more `s_and_b32 s1, s1, 0xFFFF` in the output. Ideally ISel should handle this but it's proving difficult to get the patterns right, and after a few hours of trying I just decided to go with this as it's simple enough and it "just works" for this purpose.
1 parent 6db5fe8 commit 520757c

File tree

8 files changed

+303
-251
lines changed

8 files changed

+303
-251
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ class SIFoldOperandsImpl {
131131
std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
132132
bool tryConstantFoldOp(MachineInstr *MI) const;
133133
bool tryFoldCndMask(MachineInstr &MI) const;
134+
bool tryFoldBitMask(MachineInstr &MI) const;
134135
bool tryFoldZeroHighBits(MachineInstr &MI) const;
135136
bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
136137

@@ -1447,6 +1448,99 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
14471448
return true;
14481449
}
14491450

1451+
static bool getBitsReadByInst(unsigned Opc, unsigned &NumBitsRead,
1452+
unsigned &OpIdx) {
1453+
switch (Opc) {
1454+
case AMDGPU::V_ASHR_I32_e64:
1455+
case AMDGPU::V_ASHR_I32_e32:
1456+
case AMDGPU::V_LSHR_B32_e64:
1457+
case AMDGPU::V_LSHR_B32_e32:
1458+
case AMDGPU::V_LSHL_B32_e64:
1459+
case AMDGPU::V_LSHL_B32_e32:
1460+
case AMDGPU::S_LSHL_B32:
1461+
case AMDGPU::S_LSHR_B32:
1462+
case AMDGPU::S_ASHR_I32:
1463+
NumBitsRead = 5;
1464+
OpIdx = 2;
1465+
return true;
1466+
case AMDGPU::S_LSHL_B64:
1467+
case AMDGPU::S_LSHR_B64:
1468+
case AMDGPU::S_ASHR_I64:
1469+
NumBitsRead = 6;
1470+
OpIdx = 2;
1471+
return true;
1472+
case AMDGPU::V_LSHLREV_B32_e64:
1473+
case AMDGPU::V_LSHLREV_B32_e32:
1474+
case AMDGPU::V_LSHRREV_B32_e64:
1475+
case AMDGPU::V_LSHRREV_B32_e32:
1476+
case AMDGPU::V_ASHRREV_I32_e64:
1477+
case AMDGPU::V_ASHRREV_I32_e32:
1478+
NumBitsRead = 5;
1479+
OpIdx = 1;
1480+
return true;
1481+
default:
1482+
return false;
1483+
}
1484+
}
1485+
1486+
/// Check whether an AND with an immediate keeps all of the low
/// \p BitsNeeded bits of its register source.
///
/// Operands 1 and 2 of \p MI are inspected; exactly one must be a register
/// and the other an immediate, in either order.
///
/// \param MI          The AND instruction to inspect.
/// \param BitsNeeded  Number of low bits that must survive the mask.
/// \param SrcOp       [out] Operand number of the register source. Only
///                    written when the operands have the reg/imm shape.
/// \returns true if every one of the low \p BitsNeeded bits is set in the
///          immediate.
static bool isAndBitMaskRedundant(MachineInstr &MI, unsigned BitsNeeded,
                                  unsigned &SrcOp) {
  MachineOperand &First = MI.getOperand(1);
  MachineOperand &Second = MI.getOperand(2);

  // Classify the operand order; anything other than reg/imm or imm/reg is
  // not a mask we can reason about here.
  const bool IsRegImm = First.isReg() && Second.isImm();
  const bool IsImmReg = First.isImm() && Second.isReg();
  if (!IsRegImm && !IsImmReg)
    return false;

  MachineOperand &Source = IsRegImm ? First : Second;
  MachineOperand &Mask = IsRegImm ? Second : First;

  SrcOp = Source.getOperandNo();

  const unsigned LowBits = maskTrailingOnes<unsigned>(BitsNeeded);
  return (Mask.getImm() & LowBits) == LowBits;
}
1503+
1504+
/// Try to bypass a redundant AND bitmask feeding an operand of which \p MI
/// only reads the low bits (as reported by getBitsReadByInst).
///
/// If that operand is a virtual register defined by S_AND_B32 / V_AND_B32
/// with an immediate mask that keeps every bit \p MI reads, the operand is
/// rewritten to use the AND's register source directly. The AND itself is
/// not erased here.
///
/// \returns true if \p MI was modified.
bool SIFoldOperandsImpl::tryFoldBitMask(MachineInstr &MI) const {
  unsigned NumBitsRead = 0;
  unsigned OpIdx = 0;
  if (!getBitsReadByInst(MI.getOpcode(), NumBitsRead, OpIdx))
    return false;

  MachineOperand &Op = MI.getOperand(OpIdx);
  // Only full-register uses are rewritten: composing an existing
  // sub-register index with the replacement register is not handled.
  if (!Op.isReg() || Op.getSubReg())
    return false;

  // getVRegDef() is only meaningful for virtual registers.
  Register OpReg = Op.getReg();
  if (!OpReg.isVirtual())
    return false;

  MachineInstr *OpDef = MRI->getVRegDef(OpReg);
  if (!OpDef)
    return false;

  LLVM_DEBUG(dbgs() << "tryFoldBitMask: " << MI << "\tOpIdx:" << OpIdx
                    << ", NumBitsRead:" << NumBitsRead << "\n");

  unsigned ReplaceWith = 0;
  switch (OpDef->getOpcode()) {
  // TODO: add more opcodes?
  case AMDGPU::S_AND_B32:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64:
    if (!isAndBitMaskRedundant(*OpDef, NumBitsRead, ReplaceWith))
      return false;
    break;
  default:
    return false;
  }

  MachineOperand &ReplaceWithOp = OpDef->getOperand(ReplaceWith);
  Register SrcReg = ReplaceWithOp.getReg();

  // A physical source could be redefined between the AND and MI, and a
  // sub-register source would be silently widened by setReg() below, so
  // neither can be forwarded safely.
  if (!SrcReg.isVirtual() || ReplaceWithOp.getSubReg())
    return false;

  // NOTE(review): whether SrcReg's register class is legal for this operand
  // of MI is not verified here - confirm against the operand constraints.

  LLVM_DEBUG(dbgs() << "\treplacing operand with:" << ReplaceWithOp << "\n");

  Op.setReg(SrcReg);

  // SrcReg now has a use after the AND, so any kill flag on the AND's
  // operand (or elsewhere) is stale.
  MRI->clearKillFlags(SrcReg);
  return true;
}
1543+
14501544
bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
14511545
if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
14521546
MI.getOpcode() != AMDGPU::V_AND_B32_e32)
@@ -1458,7 +1552,7 @@ bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
14581552

14591553
Register Src1 = MI.getOperand(2).getReg();
14601554
MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1461-
if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1555+
if (!SrcDef || !ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
14621556
return false;
14631557

14641558
Register Dst = MI.getOperand(0).getReg();
@@ -2451,6 +2545,7 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
24512545
MachineOperand *CurrentKnownM0Val = nullptr;
24522546
for (auto &MI : make_early_inc_range(*MBB)) {
24532547
Changed |= tryFoldCndMask(MI);
2548+
Changed |= tryFoldBitMask(MI);
24542549

24552550
if (tryFoldZeroHighBits(MI)) {
24562551
Changed = true;

llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,6 @@ define i24 @v_ashr_i24(i24 %value, i24 %amount) {
129129
; GFX10PLUS-LABEL: v_ashr_i24:
130130
; GFX10PLUS: ; %bb.0:
131131
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
132-
; GFX10PLUS-NEXT: v_and_b32_e32 v1, 0xffffff, v1
133132
; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 24
134133
; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, v1, v0
135134
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
@@ -718,7 +717,6 @@ define amdgpu_ps half @ashr_i16_sv(i16 inreg %value, i16 %amount) {
718717
define amdgpu_ps half @ashr_i16_vs(i16 %value, i16 inreg %amount) {
719718
; GFX6-LABEL: ashr_i16_vs:
720719
; GFX6: ; %bb.0:
721-
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
722720
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
723721
; GFX6-NEXT: v_ashrrev_i32_e32 v0, s0, v0
724722
; GFX6-NEXT: ; return to shader part epilog
@@ -904,12 +902,10 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
904902
define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
905903
; GFX6-LABEL: ashr_v2i16_vs:
906904
; GFX6: ; %bb.0:
907-
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
905+
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
908906
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
907+
; GFX6-NEXT: v_ashrrev_i32_e32 v1, s1, v1
909908
; GFX6-NEXT: v_ashrrev_i32_e32 v0, s0, v0
910-
; GFX6-NEXT: s_and_b32 s0, s1, 0xffff
911-
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
912-
; GFX6-NEXT: v_ashrrev_i32_e32 v1, s0, v1
913909
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
914910
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
915911
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1

0 commit comments

Comments
 (0)