@@ -131,6 +131,7 @@ class SIFoldOperandsImpl {
131
131
std::optional<int64_t > getImmOrMaterializedImm (MachineOperand &Op) const ;
132
132
bool tryConstantFoldOp (MachineInstr *MI) const ;
133
133
bool tryFoldCndMask (MachineInstr &MI) const ;
134
+ bool tryFoldBitMask (MachineInstr &MI) const ;
134
135
bool tryFoldZeroHighBits (MachineInstr &MI) const ;
135
136
bool foldInstOperand (MachineInstr &MI, MachineOperand &OpToFold) const ;
136
137
@@ -1447,6 +1448,99 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1447
1448
return true ;
1448
1449
}
1449
1450
1451
+ static bool getBitsReadByInst (unsigned Opc, unsigned &NumBitsRead,
1452
+ unsigned &OpIdx) {
1453
+ switch (Opc) {
1454
+ case AMDGPU::V_ASHR_I32_e64:
1455
+ case AMDGPU::V_ASHR_I32_e32:
1456
+ case AMDGPU::V_LSHR_B32_e64:
1457
+ case AMDGPU::V_LSHR_B32_e32:
1458
+ case AMDGPU::V_LSHL_B32_e64:
1459
+ case AMDGPU::V_LSHL_B32_e32:
1460
+ case AMDGPU::S_LSHL_B32:
1461
+ case AMDGPU::S_LSHR_B32:
1462
+ case AMDGPU::S_ASHR_I32:
1463
+ NumBitsRead = 5 ;
1464
+ OpIdx = 2 ;
1465
+ return true ;
1466
+ case AMDGPU::S_LSHL_B64:
1467
+ case AMDGPU::S_LSHR_B64:
1468
+ case AMDGPU::S_ASHR_I64:
1469
+ NumBitsRead = 6 ;
1470
+ OpIdx = 2 ;
1471
+ return true ;
1472
+ case AMDGPU::V_LSHLREV_B32_e64:
1473
+ case AMDGPU::V_LSHLREV_B32_e32:
1474
+ case AMDGPU::V_LSHRREV_B32_e64:
1475
+ case AMDGPU::V_LSHRREV_B32_e32:
1476
+ case AMDGPU::V_ASHRREV_I32_e64:
1477
+ case AMDGPU::V_ASHRREV_I32_e32:
1478
+ NumBitsRead = 5 ;
1479
+ OpIdx = 1 ;
1480
+ return true ;
1481
+ default :
1482
+ return false ;
1483
+ }
1484
+ }
1485
+
1486
+ static bool isAndBitMaskRedundant (MachineInstr &MI, unsigned BitsNeeded,
1487
+ unsigned &SrcOp) {
1488
+ MachineOperand *RegOp = &MI.getOperand (1 );
1489
+ MachineOperand *ImmOp = &MI.getOperand (2 );
1490
+
1491
+ if (!RegOp->isReg () || !ImmOp->isImm ()) {
1492
+ if (ImmOp->isReg () && RegOp->isImm ())
1493
+ std::swap (RegOp, ImmOp);
1494
+ else
1495
+ return false ;
1496
+ }
1497
+
1498
+ SrcOp = RegOp->getOperandNo ();
1499
+
1500
+ const unsigned BitMask = maskTrailingOnes<unsigned >(BitsNeeded);
1501
+ return (ImmOp->getImm () & BitMask) == BitMask;
1502
+ }
1503
+
1504
+ bool SIFoldOperandsImpl::tryFoldBitMask (MachineInstr &MI) const {
1505
+ unsigned NumBitsRead = 0 ;
1506
+ unsigned OpIdx = 0 ;
1507
+ if (!getBitsReadByInst (MI.getOpcode (), NumBitsRead, OpIdx))
1508
+ return false ;
1509
+
1510
+ MachineOperand &Op = MI.getOperand (OpIdx);
1511
+ if (!Op.isReg ())
1512
+ return false ;
1513
+
1514
+ Register OpReg = Op.getReg ();
1515
+ if (OpReg.isPhysical ())
1516
+ return false ;
1517
+
1518
+ MachineInstr *OpDef = MRI->getVRegDef (OpReg);
1519
+ if (!OpDef)
1520
+ return false ;
1521
+
1522
+ LLVM_DEBUG (dbgs () << " tryFoldBitMask: " << MI << " \t OpIdx:" << OpIdx << " , NumBitsRead:" << NumBitsRead << " \n " );
1523
+
1524
+ unsigned ReplaceWith;
1525
+ switch (OpDef->getOpcode ()) {
1526
+ // TODO: add more opcodes?
1527
+ case AMDGPU::S_AND_B32:
1528
+ case AMDGPU::V_AND_B32_e32:
1529
+ case AMDGPU::V_AND_B32_e64:
1530
+ if (!isAndBitMaskRedundant (*OpDef, NumBitsRead, ReplaceWith))
1531
+ return false ;
1532
+ break ;
1533
+ default :
1534
+ return false ;
1535
+ }
1536
+
1537
+ MachineOperand &ReplaceWithOp = OpDef->getOperand (ReplaceWith);
1538
+ LLVM_DEBUG (dbgs () << " \t replacing operand with:" << ReplaceWithOp << " \n " );
1539
+
1540
+ MI.getOperand (OpIdx).setReg (ReplaceWithOp.getReg ());
1541
+ return true ;
1542
+ }
1543
+
1450
1544
bool SIFoldOperandsImpl::tryFoldZeroHighBits (MachineInstr &MI) const {
1451
1545
if (MI.getOpcode () != AMDGPU::V_AND_B32_e64 &&
1452
1546
MI.getOpcode () != AMDGPU::V_AND_B32_e32)
@@ -1458,7 +1552,7 @@ bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1458
1552
1459
1553
Register Src1 = MI.getOperand (2 ).getReg ();
1460
1554
MachineInstr *SrcDef = MRI->getVRegDef (Src1);
1461
- if (!ST->zeroesHigh16BitsOfDest (SrcDef->getOpcode ()))
1555
+ if (!SrcDef || ! ST->zeroesHigh16BitsOfDest (SrcDef->getOpcode ()))
1462
1556
return false ;
1463
1557
1464
1558
Register Dst = MI.getOperand (0 ).getReg ();
@@ -2451,6 +2545,7 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2451
2545
MachineOperand *CurrentKnownM0Val = nullptr ;
2452
2546
for (auto &MI : make_early_inc_range (*MBB)) {
2453
2547
Changed |= tryFoldCndMask (MI);
2548
+ Changed |= tryFoldBitMask (MI);
2454
2549
2455
2550
if (tryFoldZeroHighBits (MI)) {
2456
2551
Changed = true ;
0 commit comments