Skip to content

Commit c0cba51

Browse files
Vijay Kandiah (VijayKandiah) and Vijay Kandiah authored
[Flang] Hoisting constant-sized allocas at flang codegen. (llvm#95310)
This change modifies the `AllocaOpConversion` in flang codegen to insert constant-sized LLVM allocas at the entry block of `LLVMFuncOp` or OpenACC/OpenMP Op, rather than in-place at the `fir.alloca`. This effectively hoists constant-sized FIR allocas to the proper block.

When compiling the example subroutine below with `flang-new`, we get a llvm.stacksave/stackrestore pair around a constant-sized `fir.alloca i32`.

```
subroutine test(n)
  block
    integer :: n
    print *, n
  end block
end subroutine test
```

Without the proposed change, downstream LLVM compilation cannot hoist this constant-sized alloca out of the stacksave/stackrestore region which may lead to missed downstream optimizations:

```
*** IR Dump After Safe Stack instrumentation pass (safe-stack) ***
define void @test_(ptr %0) !dbg !3 {
  %2 = call ptr @llvm.stacksave.p0(), !dbg !7
  %3 = alloca i32, i64 1, align 4, !dbg !8
  %4 = call ptr @_FortranAioBeginExternalListOutput(i32 6, ptr @_QQclX62c91d05f046c7a656e7978eb13f2821, i32 4), !dbg !9
  %5 = load i32, ptr %3, align 4, !dbg !10, !tbaa !11
  %6 = call i1 @_FortranAioOutputInteger32(ptr %4, i32 %5), !dbg !10
  %7 = call i32 @_FortranAioEndIoStatement(ptr %4), !dbg !9
  call void @llvm.stackrestore.p0(ptr %2), !dbg !15
  ret void, !dbg !16
}
```

With this change, the `llvm.alloca` is already hoisted out of the stacksave/stackrestore region during flang codegen:

```
// -----// IR Dump After FIRToLLVMLowering (fir-to-llvm-ir) //----- //
llvm.func @test_(%arg0: !llvm.ptr {fir.bindc_name = "n"}) attributes {fir.internal_name = "_QPtest"} {
  %0 = llvm.mlir.constant(4 : i32) : i32
  %1 = llvm.mlir.constant(1 : i64) : i64
  %2 = llvm.alloca %1 x i32 {bindc_name = "n"} : (i64) -> !llvm.ptr
  %3 = llvm.mlir.constant(6 : i32) : i32
  %4 = llvm.mlir.undef : i1
  %5 = llvm.call @llvm.stacksave.p0() {fastmathFlags = #llvm.fastmath<contract>} : () -> !llvm.ptr
  %6 = llvm.mlir.addressof @_QQclX62c91d05f046c7a656e7978eb13f2821 : !llvm.ptr
  %7 = llvm.call @_FortranAioBeginExternalListOutput(%3, %6, %0) {fastmathFlags = #llvm.fastmath<contract>} : (i32, !llvm.ptr, i32) -> !llvm.ptr
  %8 = llvm.load %2 {tbaa = [#tbaa_tag]} : !llvm.ptr -> i32
  %9 = llvm.call @_FortranAioOutputInteger32(%7, %8) {fastmathFlags = #llvm.fastmath<contract>} : (!llvm.ptr, i32) -> i1
  %10 = llvm.call @_FortranAioEndIoStatement(%7) {fastmathFlags = #llvm.fastmath<contract>} : (!llvm.ptr) -> i32
  llvm.call @llvm.stackrestore.p0(%5) {fastmathFlags = #llvm.fastmath<contract>} : (!llvm.ptr) -> ()
  llvm.return
}
```

---------

Co-authored-by: Vijay Kandiah <[email protected]>
1 parent 2f5ec13 commit c0cba51

File tree

9 files changed

+137
-117
lines changed

9 files changed

+137
-117
lines changed

flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@ class ConvertFIRToLLVMPattern : public mlir::ConvertToLLVMPattern {
5151
/// appropriate reified structures.
5252
mlir::Value integerCast(mlir::Location loc,
5353
mlir::ConversionPatternRewriter &rewriter,
54-
mlir::Type ty, mlir::Value val) const;
54+
mlir::Type ty, mlir::Value val,
55+
bool fold = false) const;
56+
5557
struct TypePair {
5658
mlir::Type fir;
5759
mlir::Type llvm;
@@ -144,9 +146,12 @@ class ConvertFIRToLLVMPattern : public mlir::ConvertToLLVMPattern {
144146
// Find the Block in which the alloca should be inserted.
145147
// The order to recursively find the proper block:
146148
// 1. An OpenMP Op that will be outlined.
147-
// 2. A LLVMFuncOp
148-
// 3. The first ancestor that is an OpenMP Op or a LLVMFuncOp
149-
mlir::Block *getBlockForAllocaInsert(mlir::Operation *op) const;
149+
// 2. An OpenMP or OpenACC Op with one or more regions holding executable
150+
// code.
151+
// 3. A LLVMFuncOp
152+
// 4. The first ancestor that is one of the above.
153+
mlir::Block *getBlockForAllocaInsert(mlir::Operation *op,
154+
mlir::Region *parentRegion) const;
150155

151156
// Generate an alloca of size 1 for an object of type \p llvmObjectTy in the
152157
// allocation address space provided for the architecture in the DataLayout

flang/lib/Optimizer/CodeGen/CodeGen.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ struct AllocaOpConversion : public fir::FIROpConversion<fir::AllocaOp> {
218218
chrTy.getContext(), chrTy.getFKind());
219219
llvmObjectType = convertType(rawCharTy);
220220
assert(end == 1);
221-
size = integerCast(loc, rewriter, ity, lenParams[0]);
221+
size = integerCast(loc, rewriter, ity, lenParams[0], /*fold=*/true);
222222
} else if (auto recTy = mlir::dyn_cast<fir::RecordType>(scalarType)) {
223223
mlir::LLVM::LLVMFuncOp memSizeFn =
224224
getDependentTypeMemSizeFn(recTy, alloc, rewriter);
@@ -236,17 +236,29 @@ struct AllocaOpConversion : public fir::FIROpConversion<fir::AllocaOp> {
236236
}
237237
}
238238
if (auto scaleSize = genAllocationScaleSize(alloc, ity, rewriter))
239-
size = rewriter.create<mlir::LLVM::MulOp>(loc, ity, size, scaleSize);
239+
size =
240+
rewriter.createOrFold<mlir::LLVM::MulOp>(loc, ity, size, scaleSize);
240241
if (alloc.hasShapeOperands()) {
241242
unsigned end = operands.size();
242243
for (; i < end; ++i)
243-
size = rewriter.create<mlir::LLVM::MulOp>(
244-
loc, ity, size, integerCast(loc, rewriter, ity, operands[i]));
244+
size = rewriter.createOrFold<mlir::LLVM::MulOp>(
245+
loc, ity, size,
246+
integerCast(loc, rewriter, ity, operands[i], /*fold=*/true));
245247
}
246248

247249
unsigned allocaAs = getAllocaAddressSpace(rewriter);
248250
unsigned programAs = getProgramAddressSpace(rewriter);
249251

252+
if (mlir::isa<mlir::LLVM::ConstantOp>(size.getDefiningOp())) {
253+
// Set the Block in which the llvm alloca should be inserted.
254+
mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp();
255+
mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent();
256+
mlir::Block *insertBlock =
257+
getBlockForAllocaInsert(parentOp, parentRegion);
258+
size.getDefiningOp()->moveAfter(insertBlock, insertBlock->begin());
259+
rewriter.setInsertionPointAfter(size.getDefiningOp());
260+
}
261+
250262
// NOTE: we used to pass alloc->getAttrs() in the builder for non opaque
251263
// pointers! Only propagate pinned and bindc_name to help debugging, but
252264
// this should have no functional purpose (and passing the operand segment

flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -62,21 +62,27 @@ mlir::LLVM::ConstantOp ConvertFIRToLLVMPattern::genConstantOffset(
6262
/// to the specific target may involve some sign-extending or truncation of
6363
/// values, particularly to fit them from abstract box types to the
6464
/// appropriate reified structures.
65-
mlir::Value
66-
ConvertFIRToLLVMPattern::integerCast(mlir::Location loc,
67-
mlir::ConversionPatternRewriter &rewriter,
68-
mlir::Type ty, mlir::Value val) const {
65+
mlir::Value ConvertFIRToLLVMPattern::integerCast(
66+
mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
67+
mlir::Type ty, mlir::Value val, bool fold) const {
6968
auto valTy = val.getType();
7069
// If the value was not yet lowered, lower its type so that it can
7170
// be used in getPrimitiveTypeSizeInBits.
7271
if (!mlir::isa<mlir::IntegerType>(valTy))
7372
valTy = convertType(valTy);
7473
auto toSize = mlir::LLVM::getPrimitiveTypeSizeInBits(ty);
7574
auto fromSize = mlir::LLVM::getPrimitiveTypeSizeInBits(valTy);
76-
if (toSize < fromSize)
77-
return rewriter.create<mlir::LLVM::TruncOp>(loc, ty, val);
78-
if (toSize > fromSize)
79-
return rewriter.create<mlir::LLVM::SExtOp>(loc, ty, val);
75+
if (fold) {
76+
if (toSize < fromSize)
77+
return rewriter.createOrFold<mlir::LLVM::TruncOp>(loc, ty, val);
78+
if (toSize > fromSize)
79+
return rewriter.createOrFold<mlir::LLVM::SExtOp>(loc, ty, val);
80+
} else {
81+
if (toSize < fromSize)
82+
return rewriter.create<mlir::LLVM::TruncOp>(loc, ty, val);
83+
if (toSize > fromSize)
84+
return rewriter.create<mlir::LLVM::SExtOp>(loc, ty, val);
85+
}
8086
return val;
8187
}
8288

@@ -274,16 +280,19 @@ mlir::Value ConvertFIRToLLVMPattern::computeBoxSize(
274280
// Find the Block in which the alloca should be inserted.
275281
// The order to recursively find the proper block:
276282
// 1. An OpenMP Op that will be outlined.
277-
// 2. A LLVMFuncOp
278-
// 3. The first ancestor that is an OpenMP Op or a LLVMFuncOp
279-
mlir::Block *
280-
ConvertFIRToLLVMPattern::getBlockForAllocaInsert(mlir::Operation *op) const {
283+
// 2. An OpenMP or OpenACC Op with one or more regions holding executable code.
284+
// 3. A LLVMFuncOp
285+
// 4. The first ancestor that is one of the above.
286+
mlir::Block *ConvertFIRToLLVMPattern::getBlockForAllocaInsert(
287+
mlir::Operation *op, mlir::Region *parentRegion) const {
281288
if (auto iface = mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(op))
282289
return iface.getAllocaBlock();
290+
if (auto recipeIface = mlir::dyn_cast<mlir::accomp::RecipeInterface>(op))
291+
return recipeIface.getAllocaBlock(*parentRegion);
283292
if (auto llvmFuncOp = mlir::dyn_cast<mlir::LLVM::LLVMFuncOp>(op))
284293
return &llvmFuncOp.front();
285294

286-
return getBlockForAllocaInsert(op->getParentOp());
295+
return getBlockForAllocaInsert(op->getParentOp(), parentRegion);
287296
}
288297

289298
// Generate an alloca of size 1 for an object of type \p llvmObjectTy in the
@@ -297,16 +306,9 @@ mlir::Value ConvertFIRToLLVMPattern::genAllocaAndAddrCastWithType(
297306
mlir::ConversionPatternRewriter &rewriter) const {
298307
auto thisPt = rewriter.saveInsertionPoint();
299308
mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp();
300-
if (mlir::isa<mlir::omp::DeclareReductionOp>(parentOp) ||
301-
mlir::isa<mlir::omp::PrivateClauseOp>(parentOp)) {
302-
// DeclareReductionOp & PrivateClauseOp have multiple child regions. We want
303-
// to get the first block of whichever of those regions we are currently in
304-
mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent();
305-
rewriter.setInsertionPointToStart(&parentRegion->front());
306-
} else {
307-
mlir::Block *insertBlock = getBlockForAllocaInsert(parentOp);
308-
rewriter.setInsertionPointToStart(insertBlock);
309-
}
309+
mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent();
310+
mlir::Block *insertBlock = getBlockForAllocaInsert(parentOp, parentRegion);
311+
rewriter.setInsertionPointToStart(insertBlock);
310312
auto size = genI32Constant(loc, rewriter, 1);
311313
unsigned allocaAs = getAllocaAddressSpace(rewriter);
312314
unsigned programAs = getProgramAddressSpace(rewriter);

flang/test/Fir/alloc.fir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ func.func @allocmem_array_of_dynchar(%l: i32) -> !fir.heap<!fir.array<3x3x!fir.c
156156

157157
// CHECK-LABEL: define ptr @alloca_dynarray_of_nonchar(
158158
// CHECK-SAME: i64 %[[extent:.*]])
159-
// CHECK: %[[prod1:.*]] = mul i64 1, %[[extent]]
159+
// CHECK: %[[prod1:.*]] = mul i64 %[[extent]], 1
160160
// CHECK: alloca [3 x i32], i64 %[[prod1]]
161161
func.func @alloca_dynarray_of_nonchar(%e: index) -> !fir.ref<!fir.array<3x?xi32>> {
162162
%1 = fir.alloca !fir.array<3x?xi32>, %e
@@ -165,7 +165,7 @@ func.func @alloca_dynarray_of_nonchar(%e: index) -> !fir.ref<!fir.array<3x?xi32>
165165

166166
// CHECK-LABEL: define ptr @alloca_dynarray_of_nonchar2(
167167
// CHECK-SAME: i64 %[[extent:.*]])
168-
// CHECK: %[[prod1:.*]] = mul i64 1, %[[extent]]
168+
// CHECK: %[[prod1:.*]] = mul i64 %[[extent]], 1
169169
// CHECK: %[[prod2:.*]] = mul i64 %[[prod1]], %[[extent]]
170170
// CHECK: alloca i32, i64 %[[prod2]]
171171
func.func @alloca_dynarray_of_nonchar2(%e: index) -> !fir.ref<!fir.array<?x?xi32>> {
@@ -194,7 +194,7 @@ func.func @allocmem_dynarray_of_nonchar2(%e: index) -> !fir.heap<!fir.array<?x?x
194194

195195
// CHECK-LABEL: define ptr @alloca_dynarray_of_char(
196196
// CHECK-SAME: i64 %[[extent:.*]])
197-
// CHECK: %[[prod1:.*]] = mul i64 1, %[[extent]]
197+
// CHECK: %[[prod1:.*]] = mul i64 %[[extent]], 1
198198
// CHECK: alloca [3 x [10 x i16]], i64 %[[prod1]]
199199
func.func @alloca_dynarray_of_char(%e : index) -> !fir.ref<!fir.array<3x?x!fir.char<2,10>>> {
200200
%1 = fir.alloca !fir.array<3x?x!fir.char<2,10>>, %e
@@ -203,7 +203,7 @@ func.func @alloca_dynarray_of_char(%e : index) -> !fir.ref<!fir.array<3x?x!fir.c
203203

204204
// CHECK-LABEL: define ptr @alloca_dynarray_of_char2(
205205
// CHECK-SAME: i64 %[[extent:.*]])
206-
// CHECK: %[[prod1:.*]] = mul i64 1, %[[extent]]
206+
// CHECK: %[[prod1:.*]] = mul i64 %[[extent]], 1
207207
// CHECK: %[[prod2:.*]] = mul i64 %[[prod1]], %[[extent]]
208208
// CHECK: alloca [10 x i16], i64 %[[prod2]]
209209
func.func @alloca_dynarray_of_char2(%e : index) -> !fir.ref<!fir.array<?x?x!fir.char<2,10>>> {
@@ -334,10 +334,10 @@ func.func @allocmem_array_with_holes_dynchar(%arg0: index, %arg1: index) -> !fir
334334
}
335335

336336
// CHECK-LABEL: define void @alloca_unlimited_polymorphic_box
337-
// CHECK: %[[VAL_0:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1
338337
// CHECK: %[[VAL_1:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, i64 1
339-
// CHECK: %[[VAL_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1
338+
// CHECK: %[[VAL_0:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1
340339
// CHECK: %[[VAL_3:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, i64 1
340+
// CHECK: %[[VAL_2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, ptr, [1 x i64] }, i64 1
341341

342342
func.func @alloca_unlimited_polymorphic_box() {
343343
%0 = fir.alloca !fir.class<none>

flang/test/Fir/boxproc.fir

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
// RUN: tco %s | FileCheck %s
22

33
// CHECK-LABEL: define void @_QPtest_proc_dummy()
4-
// CHECK: %[[VAL_0:.*]] = alloca i32, i64 1, align 4
4+
// CHECK: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
55
// CHECK: %[[VAL_1:.*]] = alloca { ptr }, i64 1, align 8
6+
// CHECK: %[[VAL_0:.*]] = alloca i32, i64 1, align 4
67
// CHECK: %[[VAL_2:.*]] = getelementptr { ptr }, ptr %[[VAL_1]], i32 0, i32 0
78
// CHECK: store ptr %[[VAL_0]], ptr %[[VAL_2]], align 8
89
// CHECK: store i32 1, ptr %[[VAL_0]], align 4
9-
// CHECK: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
1010
// CHECK: call void @llvm.init.trampoline(ptr %[[VAL_3]], ptr @_QFtest_proc_dummyPtest_proc_dummy_a, ptr %[[VAL_1]])
1111
// CHECK: %[[VAL_6:.*]] = call ptr @llvm.adjust.trampoline(ptr %[[VAL_3]])
1212
// CHECK: call void @_QPtest_proc_dummy_other(ptr %[[VAL_6]])
@@ -61,9 +61,10 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
6161
}
6262

6363
// CHECK-LABEL: define void @_QPtest_proc_dummy_char()
64-
// CHECK: %[[VAL_0:.*]] = alloca [40 x i8], i64 1, align 1
65-
// CHECK: %[[VAL_1:.*]] = alloca [10 x i8], i64 1, align 1
64+
// CHECK: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
6665
// CHECK: %[[VAL_2:.*]] = alloca { { ptr, i64 } }, i64 1, align 8
66+
// CHECK: %[[VAL_1:.*]] = alloca [10 x i8], i64 1, align 1
67+
// CHECK: %[[VAL_0:.*]] = alloca [40 x i8], i64 1, align 1
6768
// CHECK: %[[VAL_3:.*]] = getelementptr { { ptr, i64 } }, ptr %[[VAL_2]], i32 0, i32 0
6869
// CHECK: %[[VAL_5:.*]] = insertvalue { ptr, i64 } undef, ptr %[[VAL_1]], 0
6970
// CHECK: %[[VAL_6:.*]] = insertvalue { ptr, i64 } %[[VAL_5]], i64 10, 1
@@ -75,7 +76,6 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
7576
// CHECK: %[[VAL_15:.*]] = icmp sgt i64 %[[VAL_13]], 0
7677
// CHECK: %[[VAL_18:.*]] = getelementptr [10 x [1 x i8]], ptr %[[VAL_1]], i32 0, i64 %[[VAL_11]]
7778
// CHECK: store [1 x i8] c" ", ptr %[[VAL_18]], align 1
78-
// CHECK: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
7979
// CHECK: call void @llvm.init.trampoline(ptr %[[VAL_20]], ptr @_QFtest_proc_dummy_charPgen_message, ptr %[[VAL_2]])
8080
// CHECK: %[[VAL_23:.*]] = call ptr @llvm.adjust.trampoline(ptr %[[VAL_20]])
8181
// CHECK: %[[VAL_25:.*]] = insertvalue { ptr, i64 } undef, ptr %[[VAL_23]], 0

0 commit comments

Comments
 (0)