Skip to content

Commit 770393b

Browse files
authored
[MachineLICM] Correctly Apply Register Masks (#95746)
Fix regression introduced in d4b8b72
1 parent c2d9f25 commit 770393b

File tree

2 files changed

+15
-24
lines changed

2 files changed

+15
-24
lines changed

llvm/lib/CodeGen/MachineLICM.cpp

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -426,38 +426,29 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
426426
static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
427427
BitVector &RUs,
428428
const uint32_t *Mask) {
429-
// Iterate over the RegMask raw to avoid constructing a BitVector, which is
430-
// expensive as it implies dynamically allocating memory.
431-
//
432-
// We also work backwards.
429+
BitVector ClobberedRUs(TRI.getNumRegUnits(), true);
433430
const unsigned NumRegs = TRI.getNumRegs();
434431
const unsigned MaskWords = (NumRegs + 31) / 32;
435432
for (unsigned K = 0; K < MaskWords; ++K) {
436-
// We want to set the bits that aren't in RegMask, so flip it.
437-
uint32_t Word = ~Mask[K];
438-
439-
// Iterate all set bits, starting from the right.
440-
while (Word) {
441-
const unsigned SetBitIdx = countr_zero(Word);
442-
443-
// The bits are numbered from the LSB in each word.
444-
const unsigned PhysReg = (K * 32) + SetBitIdx;
445-
446-
// Clear the bit at SetBitIdx. Doing it this way appears to generate fewer
447-
// instructions on x86. This works because negating a number will flip all
448-
// the bits after SetBitIdx. So (Word & -Word) == (1 << SetBitIdx), but
449-
// faster.
450-
Word ^= Word & -Word;
433+
const uint32_t Word = Mask[K];
434+
if (!Word)
435+
continue;
451436

437+
for (unsigned Bit = 0; Bit < 32; ++Bit) {
438+
const unsigned PhysReg = (K * 32) + Bit;
452439
if (PhysReg == NumRegs)
453-
return;
440+
break;
454441

455-
if (PhysReg) {
442+
// Check if we have a valid PhysReg that is set in the mask.
443+
// FIXME: We shouldn't have to check for PhysReg.
444+
if (PhysReg && ((Word >> Bit) & 1)) {
456445
for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
457-
RUs.set(*RUI);
446+
ClobberedRUs.reset(*RUI);
458447
}
459448
}
460449
}
450+
451+
RUs |= ClobberedRUs;
461452
}
462453

463454
/// Examine the instruction for potential LICM candidate. Also

llvm/test/CodeGen/AMDGPU/indirect-call.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
886886
; GCN-NEXT: v_writelane_b32 v40, s62, 30
887887
; GCN-NEXT: v_writelane_b32 v40, s63, 31
888888
; GCN-NEXT: s_mov_b64 s[6:7], exec
889+
; GCN-NEXT: s_movk_i32 s4, 0x7b
889890
; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
890891
; GCN-NEXT: v_readfirstlane_b32 s8, v0
891892
; GCN-NEXT: v_readfirstlane_b32 s9, v1
892893
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
893894
; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
894-
; GCN-NEXT: s_movk_i32 s4, 0x7b
895895
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
896896
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
897897
; GCN-NEXT: s_xor_b64 exec, exec, s[10:11]
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
980980
; GISEL-NEXT: v_writelane_b32 v40, s62, 30
981981
; GISEL-NEXT: v_writelane_b32 v40, s63, 31
982982
; GISEL-NEXT: s_mov_b64 s[6:7], exec
983+
; GISEL-NEXT: s_movk_i32 s4, 0x7b
983984
; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
984985
; GISEL-NEXT: v_readfirstlane_b32 s8, v0
985986
; GISEL-NEXT: v_readfirstlane_b32 s9, v1
986987
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
987988
; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc
988-
; GISEL-NEXT: s_movk_i32 s4, 0x7b
989989
; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
990990
; GISEL-NEXT: ; implicit-def: $vgpr0
991991
; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11]

0 commit comments

Comments
 (0)