Skip to content

Commit 56af0e9

Browse files
committed
[EarlyCSE] Do not CSE convergent calls in different basic blocks

"convergent" is documented as meaning that the call cannot be made
control-dependent on more values, but in practice we also require that
it cannot be made control-dependent on fewer values, e.g. it cannot be
hoisted out of the body of an "if" statement.

In code like this, if we allow CSE to combine the two calls:

  x = convergent_call();
  if (cond) {
    y = convergent_call();
    use y;
  }

then we get this:

  x = convergent_call();
  if (cond) {
    use x;
  }

This is conceptually equivalent to moving the second call out of the
body of the "if", up to the location of the first call, so it should be
disallowed.

Differential Revision: https://reviews.llvm.org/D149348
1 parent 5534d1d commit 56af0e9

File tree

2 files changed

+31
-5
lines changed

2 files changed

+31
-5
lines changed

llvm/lib/Transforms/Scalar/EarlyCSE.cpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,14 @@ static unsigned getHashValueImpl(SimpleValue Val) {
318318
return hash_combine(GCR->getOpcode(), GCR->getOperand(0),
319319
GCR->getBasePtr(), GCR->getDerivedPtr());
320320

321+
// Don't CSE convergent calls in different basic blocks, because they
322+
// implicitly depend on the set of threads that is currently executing.
323+
if (CallInst *CI = dyn_cast<CallInst>(Inst); CI && CI->isConvergent()) {
324+
return hash_combine(
325+
Inst->getOpcode(), Inst->getParent(),
326+
hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
327+
}
328+
321329
// Mix in the opcode.
322330
return hash_combine(
323331
Inst->getOpcode(),
@@ -344,8 +352,16 @@ static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) {
344352

345353
if (LHSI->getOpcode() != RHSI->getOpcode())
346354
return false;
347-
if (LHSI->isIdenticalToWhenDefined(RHSI))
355+
if (LHSI->isIdenticalToWhenDefined(RHSI)) {
356+
// Convergent calls implicitly depend on the set of threads that is
357+
// currently executing, so conservatively return false if they are in
358+
// different basic blocks.
359+
if (CallInst *CI = dyn_cast<CallInst>(LHSI);
360+
CI && CI->isConvergent() && LHSI->getParent() != RHSI->getParent())
361+
return false;
362+
348363
return true;
364+
}
349365

350366
// If we're not strictly identical, we still might be a commutable instruction
351367
if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) {

llvm/test/CodeGen/AMDGPU/cse-convergent.ll

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,25 @@ define i32 @test(i32 %val, i32 %cond) {
2121
; GCN-NEXT: s_or_saveexec_b32 s4, -1
2222
; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
2323
; GCN-NEXT: s_mov_b32 exec_lo, s4
24-
; GCN-NEXT: v_mov_b32_e32 v4, 0
25-
; GCN-NEXT: v_mov_b32_e32 v0, v2
24+
; GCN-NEXT: v_mov_b32_e32 v5, 0
25+
; GCN-NEXT: v_mov_b32_e32 v4, v2
2626
; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
2727
; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo
2828
; GCN-NEXT: ; %bb.1: ; %if
29-
; GCN-NEXT: v_mov_b32_e32 v4, v0
29+
; GCN-NEXT: s_or_saveexec_b32 s5, -1
30+
; GCN-NEXT: v_mov_b32_e32 v2, 0
31+
; GCN-NEXT: s_mov_b32 exec_lo, s5
32+
; GCN-NEXT: v_mov_b32_e32 v3, v0
33+
; GCN-NEXT: s_not_b32 exec_lo, exec_lo
34+
; GCN-NEXT: v_mov_b32_e32 v3, 0
35+
; GCN-NEXT: s_not_b32 exec_lo, exec_lo
36+
; GCN-NEXT: s_or_saveexec_b32 s5, -1
37+
; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
38+
; GCN-NEXT: s_mov_b32 exec_lo, s5
39+
; GCN-NEXT: v_mov_b32_e32 v5, v2
3040
; GCN-NEXT: ; %bb.2: ; %end
3141
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
32-
; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v4
42+
; GCN-NEXT: v_add_nc_u32_e32 v0, v4, v5
3343
; GCN-NEXT: s_xor_saveexec_b32 s4, -1
3444
; GCN-NEXT: s_clause 0x1
3545
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32

0 commit comments

Comments
 (0)