Skip to content

Commit 28452ac

Browse files
authored
[mlir][OpenMP] delayed privatisation for TASK (#114785)
This uses essentially the same implementation as the one used for ParallelOp. Private variable allocation and deallocation go through shared functions to avoid code duplication. FIRSTPRIVATE variable copying uses duplicated code for now, because I anticipate the implementations diverging in the near future once I store data for firstprivate variables in the task description structure. After enabling delayed privatisation for TASK in flang, one more test in the Fujitsu test suite passes (I haven't looked into why).
1 parent f1f5220 commit 28452ac

File tree

3 files changed

+219
-85
lines changed

3 files changed

+219
-85
lines changed

mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Lines changed: 168 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
259259
checkInReduction(op, result);
260260
checkMergeable(op, result);
261261
checkPriority(op, result);
262-
checkPrivate(op, result);
263262
checkUntied(op, result);
264263
})
265264
.Case([&](omp::TaskgroupOp op) {
@@ -701,9 +700,9 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder,
701700

702701
/// Populates `privatizations` with privatization declarations used for the
703702
/// given op.
704-
/// TODO: generalise beyond ParallelOp
703+
template <class OP>
705704
static void collectPrivatizationDecls(
706-
omp::ParallelOp op, SmallVectorImpl<omp::PrivateClauseOp> &privatizations) {
705+
OP op, SmallVectorImpl<omp::PrivateClauseOp> &privatizations) {
707706
std::optional<ArrayAttr> attr = op.getPrivateSyms();
708707
if (!attr)
709708
return;
@@ -1252,6 +1251,79 @@ static LogicalResult allocAndInitializeReductionVars(
12521251
return success();
12531252
}
12541253

1254+
/// Allocates the delayed-privatisation variables of `opInst` by inlining the
1255+
/// `alloc` region of each entry in `privateDecls`; each yielded value is mapped to the matching entry of `privateBlockArgs` and appended to `llvmPrivateVars`.
1256+
/// Returns the block that followed the alloca block on entry, or an error if an `alloc` region cannot be inlined.
1257+
template <class OP>
1258+
static llvm::Expected<llvm::BasicBlock *>
1259+
allocatePrivateVars(OP opInst, llvm::IRBuilderBase &builder,
1260+
LLVM::ModuleTranslation &moduleTranslation,
1261+
MutableArrayRef<BlockArgument> privateBlockArgs,
1262+
MutableArrayRef<omp::PrivateClauseOp> privateDecls,
1263+
llvm::SmallVector<llvm::Value *> &llvmPrivateVars,
1264+
const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP) {
1265+
// Insert before the alloca block's terminator and remember its only successor.
1266+
llvm::BranchInst *allocaTerminator =
1267+
llvm::cast<llvm::BranchInst>(allocaIP.getBlock()->getTerminator());
1268+
builder.SetInsertPoint(allocaTerminator);
1269+
assert(allocaTerminator->getNumSuccessors() == 1 &&
1270+
"This is an unconditional branch created by OpenMPIRBuilder");
1271+
llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0);
1272+
1273+
// FIXME: Some of the allocation regions do more than just allocating.
1274+
// They read from their block argument (amongst other non-alloca things).
1275+
// When OpenMPIRBuilder outlines the OpenMP region into a different
1276+
// function it places the loads for live in-values (such as these block
1277+
// arguments) at the end of the entry block (because the entry block is
1278+
// assumed to contain only allocas). Therefore, if we put these complicated
1279+
// alloc blocks in the entry block, these will not dominate the availability
1280+
// of the live-in values they are using. Fix this by adding a latealloc
1281+
// block after the entry block to put these in (this also helps to avoid
1282+
// mixing non-alloca code with allocas).
1283+
// Alloc regions which do not use the block argument can still be placed in
1284+
// the entry block (therefore keeping the allocas together).
1285+
llvm::BasicBlock *privAllocBlock = nullptr;
1286+
if (!privateBlockArgs.empty())
1287+
privAllocBlock = splitBB(builder, true, "omp.private.latealloc");
1288+
for (unsigned i = 0; i < privateBlockArgs.size(); ++i) {
1289+
Region &allocRegion = privateDecls[i].getAllocRegion();
1290+
1291+
// Map the alloc region's mold argument to the translated original variable.
1292+
llvm::Value *nonPrivateVar =
1293+
moduleTranslation.lookupValue(opInst.getPrivateVars()[i]);
1294+
assert(nonPrivateVar);
1295+
moduleTranslation.mapValue(privateDecls[i].getAllocMoldArg(),
1296+
nonPrivateVar);
1297+
1298+
// Inline the alloc region: into the alloca block when the mold argument is unused, otherwise into the latealloc block.
1299+
SmallVector<llvm::Value *, 1> phis;
1300+
if (privateDecls[i].getAllocMoldArg().getUses().empty()) {
1301+
// TODO this should use
1302+
// allocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca() so it goes before
1303+
// the code for fetching the thread id. Not doing this for now to avoid
1304+
// test churn.
1305+
builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
1306+
} else {
1307+
builder.SetInsertPoint(privAllocBlock->getTerminator());
1308+
}
1309+
if (failed(inlineConvertOmpRegions(allocRegion, "omp.private.alloc",
1310+
builder, moduleTranslation, &phis)))
1311+
return llvm::createStringError(
1312+
"failed to inline `alloc` region of `omp.private`");
1313+
1314+
assert(phis.size() == 1 && "expected one allocation to be yielded");
1315+
1316+
moduleTranslation.mapValue(privateBlockArgs[i], phis[0]);
1317+
llvmPrivateVars.push_back(phis[0]);
1318+
1319+
// clear alloc region block argument mapping in case it needs to be
1320+
// re-created with a different source for another use of the same
1321+
// privatizer decl
1322+
moduleTranslation.forgetMapping(allocRegion);
1323+
}
1324+
return afterAllocas;
1325+
}
1326+
12551327
static LogicalResult
12561328
convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder,
12571329
LLVM::ModuleTranslation &moduleTranslation) {
@@ -1486,16 +1558,98 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
14861558
if (failed(checkImplementationStatus(*taskOp)))
14871559
return failure();
14881560

1489-
auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) {
1561+
// Collect delayed privatisation declarations
1562+
MutableArrayRef<BlockArgument> privateBlockArgs =
1563+
cast<omp::BlockArgOpenMPOpInterface>(*taskOp).getPrivateBlockArgs();
1564+
SmallVector<llvm::Value *> llvmPrivateVars;
1565+
SmallVector<omp::PrivateClauseOp> privateDecls;
1566+
llvmPrivateVars.reserve(privateBlockArgs.size());
1567+
privateDecls.reserve(privateBlockArgs.size());
1568+
collectPrivatizationDecls(taskOp, privateDecls);
1569+
1570+
auto bodyCB = [&](InsertPointTy allocaIP,
1571+
InsertPointTy codegenIP) -> llvm::Error {
14901572
// Save the alloca insertion point on ModuleTranslation stack for use in
14911573
// nested regions.
14921574
LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
14931575
moduleTranslation, allocaIP);
14941576

1577+
llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
1578+
taskOp, builder, moduleTranslation, privateBlockArgs, privateDecls,
1579+
llvmPrivateVars, allocaIP);
1580+
if (handleError(afterAllocas, *taskOp).failed())
1581+
return llvm::make_error<PreviouslyReportedError>();
1582+
1583+
// Apply copy region for firstprivate
1584+
bool needsFirstPrivate =
1585+
llvm::any_of(privateDecls, [](omp::PrivateClauseOp &privOp) {
1586+
return privOp.getDataSharingType() ==
1587+
omp::DataSharingClauseType::FirstPrivate;
1588+
});
1589+
if (needsFirstPrivate) {
1590+
// Find the end of the allocation blocks
1591+
assert(afterAllocas.get()->getSinglePredecessor());
1592+
builder.SetInsertPoint(
1593+
afterAllocas.get()->getSinglePredecessor()->getTerminator());
1594+
llvm::BasicBlock *copyBlock =
1595+
splitBB(builder, /*CreateBranch=*/true, "omp.private.copy");
1596+
builder.SetInsertPoint(copyBlock->getFirstNonPHIOrDbgOrAlloca());
1597+
}
1598+
for (unsigned i = 0; i < privateBlockArgs.size(); ++i) {
1599+
if (privateDecls[i].getDataSharingType() !=
1600+
omp::DataSharingClauseType::FirstPrivate)
1601+
continue;
1602+
1603+
// copyRegion implements `lhs = rhs`
1604+
Region &copyRegion = privateDecls[i].getCopyRegion();
1605+
1606+
// map copyRegion rhs arg
1607+
llvm::Value *nonPrivateVar =
1608+
moduleTranslation.lookupValue(taskOp.getPrivateVars()[i]);
1609+
assert(nonPrivateVar);
1610+
moduleTranslation.mapValue(privateDecls[i].getCopyMoldArg(),
1611+
nonPrivateVar);
1612+
1613+
// map copyRegion lhs arg
1614+
moduleTranslation.mapValue(privateDecls[i].getCopyPrivateArg(),
1615+
llvmPrivateVars[i]);
1616+
1617+
// in-place convert copy region
1618+
builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
1619+
if (failed(inlineConvertOmpRegions(copyRegion, "omp.private.copy",
1620+
builder, moduleTranslation)))
1621+
return llvm::createStringError(
1622+
"failed to inline `copy` region of an `omp.private` op in taskOp");
1623+
1624+
// ignore unused value yielded from copy region
1625+
1626+
// clear copy region block argument mapping in case it needs to be
1627+
// re-created with different source for reuse of the same privatizer decl
1628+
moduleTranslation.forgetMapping(copyRegion);
1629+
}
1630+
1631+
// translate the body of the task:
14951632
builder.restoreIP(codegenIP);
1496-
return convertOmpOpRegions(taskOp.getRegion(), "omp.task.region", builder,
1497-
moduleTranslation)
1498-
.takeError();
1633+
auto continuationBlockOrError = convertOmpOpRegions(
1634+
taskOp.getRegion(), "omp.task.region", builder, moduleTranslation);
1635+
if (failed(handleError(continuationBlockOrError, *taskOp)))
1636+
return llvm::make_error<PreviouslyReportedError>();
1637+
1638+
// private variable deallocation
1639+
SmallVector<Region *> privateCleanupRegions;
1640+
llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions),
1641+
[](omp::PrivateClauseOp privatizer) {
1642+
return &privatizer.getDeallocRegion();
1643+
});
1644+
1645+
builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator());
1646+
if (failed(inlineOmpRegionCleanup(
1647+
privateCleanupRegions, llvmPrivateVars, moduleTranslation, builder,
1648+
"omp.private.dealloc", /*shouldLoadCleanupRegionArg=*/false)))
1649+
return llvm::createStringError("failed to inline `dealloc` region of an "
1650+
"`omp.private` op in an omp.task");
1651+
1652+
return llvm::Error::success();
14991653
};
15001654

15011655
SmallVector<llvm::OpenMPIRBuilder::DependData> dds;
@@ -1740,65 +1894,11 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
17401894

17411895
auto bodyGenCB = [&](InsertPointTy allocaIP,
17421896
InsertPointTy codeGenIP) -> llvm::Error {
1743-
// Allocate private vars
1744-
llvm::BranchInst *allocaTerminator =
1745-
llvm::cast<llvm::BranchInst>(allocaIP.getBlock()->getTerminator());
1746-
builder.SetInsertPoint(allocaTerminator);
1747-
assert(allocaTerminator->getNumSuccessors() == 1 &&
1748-
"This is an unconditional branch created by OpenMPIRBuilder");
1749-
llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0);
1750-
1751-
// FIXME: Some of the allocation regions do more than just allocating.
1752-
// They read from their block argument (amongst other non-alloca things).
1753-
// When OpenMPIRBuilder outlines the parallel region into a different
1754-
// function it places the loads for live in-values (such as these block
1755-
// arguments) at the end of the entry block (because the entry block is
1756-
// assumed to contain only allocas). Therefore, if we put these complicated
1757-
// alloc blocks in the entry block, these will not dominate the availability
1758-
// of the live-in values they are using. Fix this by adding a latealloc
1759-
// block after the entry block to put these in (this also helps to avoid
1760-
// mixing non-alloca code with allocas).
1761-
// Alloc regions which do not use the block argument can still be placed in
1762-
// the entry block (therefore keeping the allocas together).
1763-
llvm::BasicBlock *privAllocBlock = nullptr;
1764-
if (!privateBlockArgs.empty())
1765-
privAllocBlock = splitBB(builder, true, "omp.private.latealloc");
1766-
for (unsigned i = 0; i < privateBlockArgs.size(); ++i) {
1767-
Region &allocRegion = privateDecls[i].getAllocRegion();
1768-
1769-
// map allocation region block argument
1770-
llvm::Value *nonPrivateVar =
1771-
moduleTranslation.lookupValue(opInst.getPrivateVars()[i]);
1772-
assert(nonPrivateVar);
1773-
moduleTranslation.mapValue(privateDecls[i].getAllocMoldArg(),
1774-
nonPrivateVar);
1775-
1776-
// in-place convert the private allocation region
1777-
SmallVector<llvm::Value *, 1> phis;
1778-
if (privateDecls[i].getAllocMoldArg().getUses().empty()) {
1779-
// TODO this should use
1780-
// allocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca() so it goes before
1781-
// the code for fetching the thread id. Not doing this for now to avoid
1782-
// test churn.
1783-
builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
1784-
} else {
1785-
builder.SetInsertPoint(privAllocBlock->getTerminator());
1786-
}
1787-
if (failed(inlineConvertOmpRegions(allocRegion, "omp.private.alloc",
1788-
builder, moduleTranslation, &phis)))
1789-
return llvm::createStringError(
1790-
"failed to inline `alloc` region of `omp.private`");
1791-
1792-
assert(phis.size() == 1 && "expected one allocation to be yielded");
1793-
1794-
moduleTranslation.mapValue(privateBlockArgs[i], phis[0]);
1795-
llvmPrivateVars.push_back(phis[0]);
1796-
1797-
// clear alloc region block argument mapping in case it needs to be
1798-
// re-created with a different source for another use of the same
1799-
// reduction decl
1800-
moduleTranslation.forgetMapping(allocRegion);
1801-
}
1897+
llvm::Expected<llvm::BasicBlock *> afterAllocas = allocatePrivateVars(
1898+
opInst, builder, moduleTranslation, privateBlockArgs, privateDecls,
1899+
llvmPrivateVars, allocaIP);
1900+
if (handleError(afterAllocas, *opInst).failed())
1901+
return llvm::make_error<PreviouslyReportedError>();
18021902

18031903
// Allocate reduction vars
18041904
DenseMap<Value, llvm::Value *> reductionVariableMap;
@@ -1824,9 +1924,9 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
18241924
});
18251925
if (needsFirstprivate) {
18261926
// Find the end of the allocation blocks
1827-
assert(afterAllocas->getSinglePredecessor());
1927+
assert(afterAllocas.get()->getSinglePredecessor());
18281928
builder.SetInsertPoint(
1829-
afterAllocas->getSinglePredecessor()->getTerminator());
1929+
afterAllocas.get()->getSinglePredecessor()->getTerminator());
18301930
llvm::BasicBlock *copyBlock =
18311931
splitBB(builder, /*CreateBranch=*/true, "omp.private.copy");
18321932
builder.SetInsertPoint(copyBlock->getFirstNonPHIOrDbgOrAlloca());

mlir/test/Target/LLVMIR/openmp-llvm.mlir

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2670,6 +2670,57 @@ llvm.func @par_task_(%arg0: !llvm.ptr {fir.bindc_name = "a"}) {
26702670
// CHECK: define internal void @[[parallel_outlined_fn]]
26712671
// -----
26722672

2673+
llvm.func @foo(!llvm.ptr) -> ()
2674+
llvm.func @destroy(!llvm.ptr) -> ()
2675+
2676+
// Firstprivate privatizer used by the task test below: `alloc` yields a fresh
// i32 alloca, `copy` loads an i32 through its first argument and stores it
// through its second, and `dealloc` calls @destroy on its argument.
omp.private {type = firstprivate} @privatizer : !llvm.ptr alloc {
2677+
^bb0(%arg0 : !llvm.ptr):
2678+
%0 = llvm.mlir.constant(1 : i64) : i64
2679+
%1 = llvm.alloca %0 x i32 : (i64) -> !llvm.ptr
2680+
omp.yield(%1 : !llvm.ptr)
2681+
} copy {
2682+
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
2683+
%0 = llvm.load %arg0 : !llvm.ptr -> i32
2684+
llvm.store %0, %arg1 : i32, !llvm.ptr
2685+
omp.yield(%arg1 : !llvm.ptr)
2686+
} dealloc {
2687+
^bb0(%arg0 : !llvm.ptr):
2688+
llvm.call @destroy(%arg0) : (!llvm.ptr) -> ()
2689+
omp.yield
2690+
}
2691+
2692+
// Task with a single delayed-private variable: %arg0 is privatised through
// @privatizer as %arg1, which is then passed to @foo inside the task region.
llvm.func @task(%arg0 : !llvm.ptr) {
2693+
omp.task private(@privatizer %arg0 -> %arg1 : !llvm.ptr) {
2694+
llvm.call @foo(%arg1) : (!llvm.ptr) -> ()
2695+
omp.terminator
2696+
}
2697+
llvm.return
2698+
}
2699+
// CHECK-LABEL: @task..omp_par
2700+
// CHECK: task.alloca:
2701+
// CHECK: %[[VAL_11:.*]] = load ptr, ptr %[[VAL_12:.*]], align 8
2702+
// CHECK: %[[VAL_13:.*]] = getelementptr { ptr }, ptr %[[VAL_11]], i32 0, i32 0
2703+
// CHECK: %[[VAL_14:.*]] = load ptr, ptr %[[VAL_13]], align 8
2704+
// CHECK: %[[VAL_15:.*]] = alloca i32, i64 1, align 4
2705+
// CHECK: br label %omp.private.latealloc
2706+
// CHECK: omp.private.latealloc: ; preds = %task.alloca
2707+
// CHECK: br label %omp.private.copy
2708+
// CHECK: omp.private.copy: ; preds = %omp.private.latealloc
2709+
// CHECK: %[[VAL_19:.*]] = load i32, ptr %[[VAL_14]], align 4
2710+
// CHECK: store i32 %[[VAL_19]], ptr %[[VAL_15]], align 4
2711+
// CHECK: br label %[[VAL_20:.*]]
2712+
// CHECK: task.body: ; preds = %omp.private.copy
2713+
// CHECK: br label %omp.task.region
2714+
// CHECK: omp.task.region: ; preds = %task.body
2715+
// CHECK: call void @foo(ptr %[[VAL_15]])
2716+
// CHECK: br label %omp.region.cont
2717+
// CHECK: omp.region.cont: ; preds = %omp.task.region
2718+
// CHECK: call void @destroy(ptr %[[VAL_15]])
2719+
// CHECK: br label %task.exit.exitStub
2720+
// CHECK: task.exit.exitStub: ; preds = %omp.region.cont
2721+
// CHECK: ret void
2722+
// -----
2723+
26732724
llvm.func @foo() -> ()
26742725

26752726
llvm.func @omp_taskgroup(%x: i32, %y: i32, %zaddr: !llvm.ptr) {

mlir/test/Target/LLVMIR/openmp-todo.mlir

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -469,23 +469,6 @@ llvm.func @task_priority(%x : i32) {
469469

470470
// -----
471471

472-
omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
473-
^bb0(%arg0: !llvm.ptr):
474-
%0 = llvm.mlir.constant(1 : i32) : i32
475-
%1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr
476-
omp.yield(%1 : !llvm.ptr)
477-
}
478-
llvm.func @task_private(%x : !llvm.ptr) {
479-
// expected-error@below {{not yet implemented: Unhandled clause privatization in omp.task operation}}
480-
// expected-error@below {{LLVM Translation failed for operation: omp.task}}
481-
omp.task private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
482-
omp.terminator
483-
}
484-
llvm.return
485-
}
486-
487-
// -----
488-
489472
llvm.func @task_untied() {
490473
// expected-error@below {{not yet implemented: Unhandled clause untied in omp.task operation}}
491474
// expected-error@below {{LLVM Translation failed for operation: omp.task}}

0 commit comments

Comments
 (0)