Skip to content

Commit f1b05a0

Browse files
committed
[StructurizeCFG] Improve basic block ordering
StructurizeCFG linearizes the successors of branching basic block by adding Flow blocks to record the true/false path for branches and back edges. This patch reduces the number of Phi values needed to capture the control flow path by improving the basic block ordering. Previously, StructurizeCFG adds loop exit blocks outside of the loop. StructurizeCFG sets a boolean value to indicate the path taken, and all exit block live values extend to after the loop. For loops with a large number of exit blocks, this creates a huge number of values that are maintained, which increases compilation time and register pressure. This is a problem, especially with ASAN, which adds early exits to blocks with unreachable instructions for each instrumented check in the loop. In specific cases, this patch reduces the number of values needed after the loop by moving the exit block into the loop. This is done for blocks that have a single predecessor and single successor by moving the block to appear just after the predecessor. Differential Revision: https://reviews.llvm.org/D123231
1 parent d95c406 commit f1b05a0

File tree

3 files changed

+585
-14
lines changed

3 files changed

+585
-14
lines changed

llvm/lib/Transforms/Scalar/StructurizeCFG.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,11 @@ static cl::opt<bool>
6868
cl::desc("Allow relaxed uniform region checks"),
6969
cl::init(true));
7070

71+
static cl::opt<unsigned>
72+
ReorderNodeSize("structurizecfg-node-reorder-size",
73+
cl::desc("Limit region size for reordering nodes"),
74+
cl::init(100), cl::Hidden);
75+
7176
// Definition of the complex types used in this pass.
7277

7378
using BBValuePair = std::pair<BasicBlock *, Value *>;
@@ -262,6 +267,8 @@ class StructurizeCFG {
262267

263268
void orderNodes();
264269

270+
void reorderNodes();
271+
265272
void analyzeLoops(RegionNode *N);
266273

267274
Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
@@ -420,6 +427,57 @@ void StructurizeCFG::orderNodes() {
420427
}
421428
}
422429

430+
/// Change the node ordering to decrease the range of live values, especially
431+
/// the values that capture the control flow path for branches. We do this
432+
/// by moving blocks with a single predecessor and successor to appear after
433+
/// predecessor. The motivation is to move some loop exit blocks into a loop.
434+
/// In cases where a loop has a large number of exit blocks, this reduces the
435+
/// amount of values needed across the loop boundary.
436+
void StructurizeCFG::reorderNodes() {
437+
SmallVector<RegionNode *, 8> NewOrder;
438+
DenseMap<BasicBlock *, unsigned> MoveTo;
439+
BitVector Moved(Order.size());
440+
441+
// The benefits of reordering nodes occurs for large regions.
442+
if (Order.size() <= ReorderNodeSize)
443+
return;
444+
445+
// The algorithm works with two passes over Order. The first pass identifies
446+
// the blocks to move and the position to move them to. The second pass
447+
// creates the new order based upon this information. We move blocks with
448+
// a single predecessor and successor. If there are multiple candidates then
449+
// maintain the original order.
450+
BBSet Seen;
451+
for (int I = Order.size() - 1; I >= 0; --I) {
452+
auto *BB = Order[I]->getEntry();
453+
Seen.insert(BB);
454+
auto *Pred = BB->getSinglePredecessor();
455+
auto *Succ = BB->getSingleSuccessor();
456+
// Consider only those basic blocks that have a predecessor in Order and a
457+
// successor that exits the region. The region may contain subregions that
458+
// have been structurized and are not included in Order.
459+
if (Pred && Succ && Seen.count(Pred) && Succ == ParentRegion->getExit() &&
460+
!MoveTo.count(Pred)) {
461+
MoveTo[Pred] = I;
462+
Moved.set(I);
463+
}
464+
}
465+
466+
// If no blocks have been moved then the original order is good.
467+
if (!Moved.count())
468+
return;
469+
470+
for (size_t I = 0, E = Order.size(); I < E; ++I) {
471+
auto *BB = Order[I]->getEntry();
472+
if (MoveTo.count(BB))
473+
NewOrder.push_back(Order[MoveTo[BB]]);
474+
if (!Moved[I])
475+
NewOrder.push_back(Order[I]);
476+
}
477+
478+
Order.assign(NewOrder);
479+
}
480+
423481
/// Determine the end of the loops
424482
void StructurizeCFG::analyzeLoops(RegionNode *N) {
425483
if (N->isSubRegion()) {
@@ -1081,6 +1139,7 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
10811139
ParentRegion = R;
10821140

10831141
orderNodes();
1142+
reorderNodes();
10841143
collectInfos();
10851144
createFlow();
10861145
insertConditions(false);

llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
32
; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
43
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
54

@@ -48,9 +47,9 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
4847
; GCN-NEXT: s_endpgm
4948
; IR-LABEL: @reduced_nested_loop_conditions(
5049
; IR-NEXT: bb:
51-
; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #4
50+
; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4:[0-9]+]]
5251
; IR-NEXT: [[MY_TMP1:%.*]] = getelementptr inbounds i64, i64 addrspace(3)* [[ARG:%.*]], i32 [[MY_TMP]]
53-
; IR-NEXT: [[MY_TMP2:%.*]] = load volatile i64, i64 addrspace(3)* [[MY_TMP1]]
52+
; IR-NEXT: [[MY_TMP2:%.*]] = load volatile i64, i64 addrspace(3)* [[MY_TMP1]], align 4
5453
; IR-NEXT: br label [[BB5:%.*]]
5554
; IR: bb3:
5655
; IR-NEXT: br i1 true, label [[BB4:%.*]], label [[BB13:%.*]]
@@ -84,7 +83,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
8483
; IR: bb16:
8584
; IR-NEXT: [[MY_TMP17:%.*]] = extractelement <2 x i32> [[MY_TMP15]], i64 1
8685
; IR-NEXT: [[MY_TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 [[MY_TMP17]]
87-
; IR-NEXT: [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[MY_TMP18]]
86+
; IR-NEXT: [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[MY_TMP18]], align 4
8887
; IR-NEXT: br label [[BB20]]
8988
; IR: bb20:
9089
; IR-NEXT: [[MY_TMP21]] = phi i32 [ [[MY_TMP19]], [[BB16]] ], [ 0, [[BB13]] ]
@@ -93,6 +92,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
9392
; IR: bb23:
9493
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]])
9594
; IR-NEXT: ret void
95+
;
9696
bb:
9797
%my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
9898
%my.tmp1 = getelementptr inbounds i64, i64 addrspace(3)* %arg, i32 %my.tmp
@@ -190,19 +190,19 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
190190
; GCN-NEXT: s_endpgm
191191
; IR-LABEL: @nested_loop_conditions(
192192
; IR-NEXT: bb:
193-
; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #4
193+
; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4]]
194194
; IR-NEXT: [[MY_TMP1:%.*]] = zext i32 [[MY_TMP]] to i64
195195
; IR-NEXT: [[MY_TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[ARG:%.*]], i64 [[MY_TMP1]]
196196
; IR-NEXT: [[MY_TMP3:%.*]] = load i64, i64 addrspace(1)* [[MY_TMP2]], align 16
197197
; IR-NEXT: [[MY_TMP932:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
198198
; IR-NEXT: [[MY_TMP1033:%.*]] = extractelement <4 x i32> [[MY_TMP932]], i64 0
199-
; IR-NEXT: [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef
199+
; IR-NEXT: [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
200200
; IR-NEXT: [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9
201201
; IR-NEXT: br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]]
202202
; IR: bb14.lr.ph:
203203
; IR-NEXT: br label [[BB14:%.*]]
204204
; IR: Flow3:
205-
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP21:%.*]])
205+
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP20:%.*]])
206206
; IR-NEXT: [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP14:%.*]])
207207
; IR-NEXT: [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
208208
; IR-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
@@ -244,7 +244,7 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
244244
; IR-NEXT: [[TMP17:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP16]])
245245
; IR-NEXT: br i1 [[TMP17]], label [[FLOW2:%.*]], label [[BB14]]
246246
; IR: bb18:
247-
; IR-NEXT: [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(1)* undef
247+
; IR-NEXT: [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
248248
; IR-NEXT: [[MY_TMP20:%.*]] = icmp slt i32 [[MY_TMP19]], 9
249249
; IR-NEXT: br i1 [[MY_TMP20]], label [[BB21]], label [[BB18]]
250250
; IR: bb21:
@@ -261,21 +261,22 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
261261
; IR-NEXT: [[MY_TMP8:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 [[MY_TMP7]]
262262
; IR-NEXT: [[MY_TMP9]] = load <4 x i32>, <4 x i32> addrspace(1)* [[MY_TMP8]], align 16
263263
; IR-NEXT: [[MY_TMP10]] = extractelement <4 x i32> [[MY_TMP9]], i64 0
264-
; IR-NEXT: [[MY_TMP11:%.*]] = load volatile i32, i32 addrspace(1)* undef
264+
; IR-NEXT: [[MY_TMP11:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
265265
; IR-NEXT: [[MY_TMP12]] = icmp sge i32 [[MY_TMP11]], 9
266266
; IR-NEXT: br label [[FLOW1]]
267267
; IR: Flow2:
268268
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP16]])
269-
; IR-NEXT: [[TMP19:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]])
270-
; IR-NEXT: [[TMP20:%.*]] = extractvalue { i1, i64 } [[TMP19]], 0
271-
; IR-NEXT: [[TMP21]] = extractvalue { i1, i64 } [[TMP19]], 1
272-
; IR-NEXT: br i1 [[TMP20]], label [[BB31_LOOPEXIT:%.*]], label [[FLOW3]]
269+
; IR-NEXT: [[TMP18:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]])
270+
; IR-NEXT: [[TMP19:%.*]] = extractvalue { i1, i64 } [[TMP18]], 0
271+
; IR-NEXT: [[TMP20]] = extractvalue { i1, i64 } [[TMP18]], 1
272+
; IR-NEXT: br i1 [[TMP19]], label [[BB31_LOOPEXIT:%.*]], label [[FLOW3]]
273273
; IR: bb31.loopexit:
274274
; IR-NEXT: br label [[FLOW3]]
275275
; IR: bb31:
276276
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]])
277-
; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
277+
; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef, align 4
278278
; IR-NEXT: ret void
279+
;
279280
bb:
280281
%my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
281282
%my.tmp1 = zext i32 %my.tmp to i64

0 commit comments

Comments
 (0)