Skip to content

Commit f1b05a0

Browse files
committed
[StructurizeCFG] Improve basic block ordering
StructurizeCFG linearizes the successors of branching basic block by adding Flow blocks to record the true/false path for branches and back edges. This patch reduces the number of Phi values needed to capture the control flow path by improving the basic block ordering. Previously, StructurizeCFG adds loop exit blocks outside of the loop. StructurizeCFG sets a boolean value to indicate the path taken, and all exit block live values extend to after the loop. For loops with a large number of exit blocks, this creates a huge number of values that are maintained, which increases compilation time and register pressure. This is a problem, especially with ASAN, which adds early exits to blocks with unreachable instructions for each instrumented check in the loop. In specific cases, this patch reduces the number of values needed after the loop by moving the exit block into the loop. This is done for blocks that have a single predecessor and single successor by moving the block to appear just after the predecessor. Differential Revision: https://reviews.llvm.org/D123231
1 parent d95c406 commit f1b05a0

File tree

3 files changed

+585
-14
lines changed

3 files changed

+585
-14
lines changed

llvm/lib/Transforms/Scalar/StructurizeCFG.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,11 @@ static cl::opt<bool>
6868
cl::desc("Allow relaxed uniform region checks"),
6969
cl::init(true));
7070

71+
static cl::opt<unsigned>
72+
ReorderNodeSize("structurizecfg-node-reorder-size",
73+
cl::desc("Limit region size for reordering nodes"),
74+
cl::init(100), cl::Hidden);
75+
7176
// Definition of the complex types used in this pass.
7277

7378
using BBValuePair = std::pair<BasicBlock *, Value *>;
@@ -262,6 +267,8 @@ class StructurizeCFG {
262267

263268
void orderNodes();
264269

270+
void reorderNodes();
271+
265272
void analyzeLoops(RegionNode *N);
266273

267274
Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
@@ -420,6 +427,57 @@ void StructurizeCFG::orderNodes() {
420427
}
421428
}
422429

430+
/// Change the node ordering to decrease the range of live values, especially
431+
/// the values that capture the control flow path for branches. We do this
432+
/// by moving blocks with a single predecessor and successor to appear after
433+
/// predecessor. The motivation is to move some loop exit blocks into a loop.
434+
/// In cases where a loop has a large number of exit blocks, this reduces the
435+
/// amount of values needed across the loop boundary.
436+
void StructurizeCFG::reorderNodes() {
437+
SmallVector<RegionNode *, 8> NewOrder;
438+
DenseMap<BasicBlock *, unsigned> MoveTo;
439+
BitVector Moved(Order.size());
440+
441+
// The benefits of reordering nodes occurs for large regions.
442+
if (Order.size() <= ReorderNodeSize)
443+
return;
444+
445+
// The algorithm works with two passes over Order. The first pass identifies
446+
// the blocks to move and the position to move them to. The second pass
447+
// creates the new order based upon this information. We move blocks with
448+
// a single predecessor and successor. If there are multiple candidates then
449+
// maintain the original order.
450+
BBSet Seen;
451+
for (int I = Order.size() - 1; I >= 0; --I) {
452+
auto *BB = Order[I]->getEntry();
453+
Seen.insert(BB);
454+
auto *Pred = BB->getSinglePredecessor();
455+
auto *Succ = BB->getSingleSuccessor();
456+
// Consider only those basic blocks that have a predecessor in Order and a
457+
// successor that exits the region. The region may contain subregions that
458+
// have been structurized and are not included in Order.
459+
if (Pred && Succ && Seen.count(Pred) && Succ == ParentRegion->getExit() &&
460+
!MoveTo.count(Pred)) {
461+
MoveTo[Pred] = I;
462+
Moved.set(I);
463+
}
464+
}
465+
466+
// If no blocks have been moved then the original order is good.
467+
if (!Moved.count())
468+
return;
469+
470+
for (size_t I = 0, E = Order.size(); I < E; ++I) {
471+
auto *BB = Order[I]->getEntry();
472+
if (MoveTo.count(BB))
473+
NewOrder.push_back(Order[MoveTo[BB]]);
474+
if (!Moved[I])
475+
NewOrder.push_back(Order[I]);
476+
}
477+
478+
Order.assign(NewOrder);
479+
}
480+
423481
/// Determine the end of the loops
424482
void StructurizeCFG::analyzeLoops(RegionNode *N) {
425483
if (N->isSubRegion()) {
@@ -1081,6 +1139,7 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
10811139
ParentRegion = R;
10821140

10831141
orderNodes();
1142+
reorderNodes();
10841143
collectInfos();
10851144
createFlow();
10861145
insertConditions(false);

llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
32
; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
43
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
54

@@ -48,9 +47,9 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
4847
; GCN-NEXT: s_endpgm
4948
; IR-LABEL: @reduced_nested_loop_conditions(
5049
; IR-NEXT: bb:
51-
; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #4
50+
; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4:[0-9]+]]
5251
; IR-NEXT: [[MY_TMP1:%.*]] = getelementptr inbounds i64, i64 addrspace(3)* [[ARG:%.*]], i32 [[MY_TMP]]
53-
; IR-NEXT: [[MY_TMP2:%.*]] = load volatile i64, i64 addrspace(3)* [[MY_TMP1]]
52+
; IR-NEXT: [[MY_TMP2:%.*]] = load volatile i64, i64 addrspace(3)* [[MY_TMP1]], align 4
5453
; IR-NEXT: br label [[BB5:%.*]]
5554
; IR: bb3:
5655
; IR-NEXT: br i1 true, label [[BB4:%.*]], label [[BB13:%.*]]
@@ -84,7 +83,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
8483
; IR: bb16:
8584
; IR-NEXT: [[MY_TMP17:%.*]] = extractelement <2 x i32> [[MY_TMP15]], i64 1
8685
; IR-NEXT: [[MY_TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 [[MY_TMP17]]
87-
; IR-NEXT: [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[MY_TMP18]]
86+
; IR-NEXT: [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[MY_TMP18]], align 4
8887
; IR-NEXT: br label [[BB20]]
8988
; IR: bb20:
9089
; IR-NEXT: [[MY_TMP21]] = phi i32 [ [[MY_TMP19]], [[BB16]] ], [ 0, [[BB13]] ]
@@ -93,6 +92,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
9392
; IR: bb23:
9493
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]])
9594
; IR-NEXT: ret void
95+
;
9696
bb:
9797
%my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
9898
%my.tmp1 = getelementptr inbounds i64, i64 addrspace(3)* %arg, i32 %my.tmp
@@ -190,19 +190,19 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
190190
; GCN-NEXT: s_endpgm
191191
; IR-LABEL: @nested_loop_conditions(
192192
; IR-NEXT: bb:
193-
; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #4
193+
; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4]]
194194
; IR-NEXT: [[MY_TMP1:%.*]] = zext i32 [[MY_TMP]] to i64
195195
; IR-NEXT: [[MY_TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[ARG:%.*]], i64 [[MY_TMP1]]
196196
; IR-NEXT: [[MY_TMP3:%.*]] = load i64, i64 addrspace(1)* [[MY_TMP2]], align 16
197197
; IR-NEXT: [[MY_TMP932:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
198198
; IR-NEXT: [[MY_TMP1033:%.*]] = extractelement <4 x i32> [[MY_TMP932]], i64 0
199-
; IR-NEXT: [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef
199+
; IR-NEXT: [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
200200
; IR-NEXT: [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9
201201
; IR-NEXT: br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]]
202202
; IR: bb14.lr.ph:
203203
; IR-NEXT: br label [[BB14:%.*]]
204204
; IR: Flow3:
205-
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP21:%.*]])
205+
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP20:%.*]])
206206
; IR-NEXT: [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP14:%.*]])
207207
; IR-NEXT: [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
208208
; IR-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
@@ -244,7 +244,7 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
244244
; IR-NEXT: [[TMP17:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP16]])
245245
; IR-NEXT: br i1 [[TMP17]], label [[FLOW2:%.*]], label [[BB14]]
246246
; IR: bb18:
247-
; IR-NEXT: [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(1)* undef
247+
; IR-NEXT: [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
248248
; IR-NEXT: [[MY_TMP20:%.*]] = icmp slt i32 [[MY_TMP19]], 9
249249
; IR-NEXT: br i1 [[MY_TMP20]], label [[BB21]], label [[BB18]]
250250
; IR: bb21:
@@ -261,21 +261,22 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
261261
; IR-NEXT: [[MY_TMP8:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 [[MY_TMP7]]
262262
; IR-NEXT: [[MY_TMP9]] = load <4 x i32>, <4 x i32> addrspace(1)* [[MY_TMP8]], align 16
263263
; IR-NEXT: [[MY_TMP10]] = extractelement <4 x i32> [[MY_TMP9]], i64 0
264-
; IR-NEXT: [[MY_TMP11:%.*]] = load volatile i32, i32 addrspace(1)* undef
264+
; IR-NEXT: [[MY_TMP11:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
265265
; IR-NEXT: [[MY_TMP12]] = icmp sge i32 [[MY_TMP11]], 9
266266
; IR-NEXT: br label [[FLOW1]]
267267
; IR: Flow2:
268268
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP16]])
269-
; IR-NEXT: [[TMP19:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]])
270-
; IR-NEXT: [[TMP20:%.*]] = extractvalue { i1, i64 } [[TMP19]], 0
271-
; IR-NEXT: [[TMP21]] = extractvalue { i1, i64 } [[TMP19]], 1
272-
; IR-NEXT: br i1 [[TMP20]], label [[BB31_LOOPEXIT:%.*]], label [[FLOW3]]
269+
; IR-NEXT: [[TMP18:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]])
270+
; IR-NEXT: [[TMP19:%.*]] = extractvalue { i1, i64 } [[TMP18]], 0
271+
; IR-NEXT: [[TMP20]] = extractvalue { i1, i64 } [[TMP18]], 1
272+
; IR-NEXT: br i1 [[TMP19]], label [[BB31_LOOPEXIT:%.*]], label [[FLOW3]]
273273
; IR: bb31.loopexit:
274274
; IR-NEXT: br label [[FLOW3]]
275275
; IR: bb31:
276276
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]])
277-
; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
277+
; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef, align 4
278278
; IR-NEXT: ret void
279+
;
279280
bb:
280281
%my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
281282
%my.tmp1 = zext i32 %my.tmp to i64

0 commit comments

Comments
 (0)