Skip to content

Commit c9eb1ff

Browse files
authored
[OpenACC][CIR] Implement 'async' lowering. (#136626)
Async acts just like num_workers/vector_length in that it gets a new variant per device_type and is lowered as an operand. However, it has one additional complication, in that it can have a variant that has no argument, which produces an attribute with the correct devicetype. Additionally, this syncronizes us with the implementation of flang, which prohibits multiple 'async' clauses per-device_type.
1 parent 616e8cc commit c9eb1ff

File tree

6 files changed

+221
-10
lines changed

6 files changed

+221
-10
lines changed

clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -97,11 +97,17 @@ class OpenACCClauseCIREmitter final
9797

9898
// Handle a clause affected by the 'device-type' to the point that they need
9999
// to have the attributes added in the correct/corresponding order, such as
100-
// 'num_workers' or 'vector_length' on a compute construct.
101-
mlir::ArrayAttr
102-
handleDeviceTypeAffectedClause(mlir::ArrayAttr existingDeviceTypes,
103-
mlir::Value argument,
104-
mlir::MutableOperandRange &argCollection) {
100+
// 'num_workers' or 'vector_length' on a compute construct. For cases where we
101+
// don't have an expression 'argument' that needs to be added to an operand
102+
// and only care about the 'device-type' list, we can use this with 'argument'
103+
// as 'std::nullopt'. If 'argument' is NOT 'std::nullopt' (that is, has a
104+
// value), argCollection must also be non-null. For cases where we don't have
105+
// an argument that needs to be added to an additional one (such as asyncOnly)
106+
// we can use this with 'argument' as std::nullopt.
107+
mlir::ArrayAttr handleDeviceTypeAffectedClause(
108+
mlir::ArrayAttr existingDeviceTypes,
109+
std::optional<mlir::Value> argument = std::nullopt,
110+
mlir::MutableOperandRange *argCollection = nullptr) {
105111
llvm::SmallVector<mlir::Attribute> deviceTypes;
106112

107113
// Collect the 'existing' device-type attributes so we can re-create them
@@ -120,13 +126,19 @@ class OpenACCClauseCIREmitter final
120126
lastDeviceTypeClause->getArchitectures()) {
121127
deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
122128
builder.getContext(), decodeDeviceType(arch.getIdentifierInfo())));
123-
argCollection.append(argument);
129+
if (argument) {
130+
assert(argCollection);
131+
argCollection->append(*argument);
132+
}
124133
}
125134
} else {
126135
// Else, we just add a single for 'none'.
127136
deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
128137
builder.getContext(), mlir::acc::DeviceType::None));
129-
argCollection.append(argument);
138+
if (argument) {
139+
assert(argCollection);
140+
argCollection->append(*argument);
141+
}
130142
}
131143

132144
return mlir::ArrayAttr::get(builder.getContext(), deviceTypes);
@@ -205,7 +217,7 @@ class OpenACCClauseCIREmitter final
205217
mlir::MutableOperandRange range = operation.getNumWorkersMutable();
206218
operation.setNumWorkersDeviceTypeAttr(handleDeviceTypeAffectedClause(
207219
operation.getNumWorkersDeviceTypeAttr(),
208-
createIntExpr(clause.getIntExpr()), range));
220+
createIntExpr(clause.getIntExpr()), &range));
209221
} else if constexpr (isOneOfTypes<OpTy, SerialOp>) {
210222
llvm_unreachable("num_workers not valid on serial");
211223
} else {
@@ -218,14 +230,30 @@ class OpenACCClauseCIREmitter final
218230
mlir::MutableOperandRange range = operation.getVectorLengthMutable();
219231
operation.setVectorLengthDeviceTypeAttr(handleDeviceTypeAffectedClause(
220232
operation.getVectorLengthDeviceTypeAttr(),
221-
createIntExpr(clause.getIntExpr()), range));
233+
createIntExpr(clause.getIntExpr()), &range));
222234
} else if constexpr (isOneOfTypes<OpTy, SerialOp>) {
223235
llvm_unreachable("vector_length not valid on serial");
224236
} else {
225237
return clauseNotImplemented(clause);
226238
}
227239
}
228240

241+
void VisitAsyncClause(const OpenACCAsyncClause &clause) {
242+
if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) {
243+
if (!clause.hasIntExpr()) {
244+
operation.setAsyncOnlyAttr(
245+
handleDeviceTypeAffectedClause(operation.getAsyncOnlyAttr()));
246+
} else {
247+
mlir::MutableOperandRange range = operation.getAsyncOperandsMutable();
248+
operation.setAsyncOperandsDeviceTypeAttr(handleDeviceTypeAffectedClause(
249+
operation.getAsyncOperandsDeviceTypeAttr(),
250+
createIntExpr(clause.getIntExpr()), &range));
251+
}
252+
} else {
253+
return clauseNotImplemented(clause);
254+
}
255+
}
256+
229257
void VisitSelfClause(const OpenACCSelfClause &clause) {
230258
if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) {
231259
if (clause.isEmptySelfClause()) {

clang/lib/Sema/SemaOpenACCClause.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ class SemaOpenACCClauseVisitor {
334334
}
335335

336336
// For 'tile' and 'collapse', only allow 1 per 'device_type'.
337-
// Also applies to num_worker, num_gangs, and vector_length.
337+
// Also applies to num_worker, num_gangs, vector_length, and async.
338338
template <typename TheClauseTy>
339339
bool DisallowSinceLastDeviceType(SemaOpenACC::OpenACCParsedClause &Clause) {
340340
auto LastDeviceTypeItr =
@@ -639,6 +639,9 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorLengthClause(
639639

640640
OpenACCClause *SemaOpenACCClauseVisitor::VisitAsyncClause(
641641
SemaOpenACC::OpenACCParsedClause &Clause) {
642+
if (DisallowSinceLastDeviceType<OpenACCAsyncClause>(Clause))
643+
return nullptr;
644+
642645
assert(Clause.getNumIntExprs() < 2 &&
643646
"Invalid number of expressions for Async");
644647
return OpenACCAsyncClause::Create(

clang/test/CIR/CodeGenOpenACC/kernels.c

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,5 +210,51 @@ void acc_kernels(int cond) {
210210
// CHECK-NEXT: acc.terminator
211211
// CHECK-NEXT: } loc
212212

213+
#pragma acc kernels async
214+
{}
215+
// CHECK-NEXT: acc.kernels {
216+
// CHECK-NEXT: acc.terminator
217+
// CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
218+
219+
#pragma acc kernels async(cond)
220+
{}
221+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
222+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
223+
// CHECK-NEXT: acc.kernels async(%[[CONV_CAST]] : si32) {
224+
// CHECK-NEXT: acc.terminator
225+
// CHECK-NEXT: } loc
226+
227+
#pragma acc kernels async device_type(nvidia, radeon) async
228+
{}
229+
// CHECK-NEXT: acc.kernels {
230+
// CHECK-NEXT: acc.terminator
231+
// CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]}
232+
233+
#pragma acc kernels async(3) device_type(nvidia, radeon) async(cond)
234+
{}
235+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
236+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
237+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
238+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
239+
// CHECK-NEXT: acc.kernels async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
240+
// CHECK-NEXT: acc.terminator
241+
// CHECK-NEXT: } loc
242+
243+
#pragma acc kernels async device_type(nvidia, radeon) async(cond)
244+
{}
245+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
246+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
247+
// CHECK-NEXT: acc.kernels async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
248+
// CHECK-NEXT: acc.terminator
249+
// CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
250+
251+
#pragma acc kernels async(3) device_type(nvidia, radeon) async
252+
{}
253+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
254+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
255+
// CHECK-NEXT: acc.kernels async(%[[THREE_CAST]] : si32) {
256+
// CHECK-NEXT: acc.terminator
257+
// CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]}
258+
213259
// CHECK-NEXT: cir.return
214260
}

clang/test/CIR/CodeGenOpenACC/parallel.c

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,5 +209,51 @@ void acc_parallel(int cond) {
209209
// CHECK-NEXT: acc.yield
210210
// CHECK-NEXT: } loc
211211

212+
#pragma acc parallel async
213+
{}
214+
// CHECK-NEXT: acc.parallel {
215+
// CHECK-NEXT: acc.yield
216+
// CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
217+
218+
#pragma acc parallel async(cond)
219+
{}
220+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
221+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
222+
// CHECK-NEXT: acc.parallel async(%[[CONV_CAST]] : si32) {
223+
// CHECK-NEXT: acc.yield
224+
// CHECK-NEXT: } loc
225+
226+
#pragma acc parallel async device_type(nvidia, radeon) async
227+
{}
228+
// CHECK-NEXT: acc.parallel {
229+
// CHECK-NEXT: acc.yield
230+
// CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]}
231+
232+
#pragma acc parallel async(3) device_type(nvidia, radeon) async(cond)
233+
{}
234+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
235+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
236+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
237+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
238+
// CHECK-NEXT: acc.parallel async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
239+
// CHECK-NEXT: acc.yield
240+
// CHECK-NEXT: } loc
241+
242+
#pragma acc parallel async device_type(nvidia, radeon) async(cond)
243+
{}
244+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
245+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
246+
// CHECK-NEXT: acc.parallel async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
247+
// CHECK-NEXT: acc.yield
248+
// CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
249+
250+
#pragma acc parallel async(3) device_type(nvidia, radeon) async
251+
{}
252+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
253+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
254+
// CHECK-NEXT: acc.parallel async(%[[THREE_CAST]] : si32) {
255+
// CHECK-NEXT: acc.yield
256+
// CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]}
257+
212258
// CHECK-NEXT: cir.return
213259
}

clang/test/CIR/CodeGenOpenACC/serial.c

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,5 +106,51 @@ void acc_serial(int cond) {
106106
// CHECK-NEXT: acc.yield
107107
// CHECK-NEXT: } loc
108108

109+
#pragma acc serial async
110+
{}
111+
// CHECK-NEXT: acc.serial {
112+
// CHECK-NEXT: acc.yield
113+
// CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
114+
115+
#pragma acc serial async(cond)
116+
{}
117+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
118+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
119+
// CHECK-NEXT: acc.serial async(%[[CONV_CAST]] : si32) {
120+
// CHECK-NEXT: acc.yield
121+
// CHECK-NEXT: } loc
122+
123+
#pragma acc serial async device_type(nvidia, radeon) async
124+
{}
125+
// CHECK-NEXT: acc.serial {
126+
// CHECK-NEXT: acc.yield
127+
// CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]}
128+
129+
#pragma acc serial async(3) device_type(nvidia, radeon) async(cond)
130+
{}
131+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
132+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
133+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
134+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
135+
// CHECK-NEXT: acc.serial async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
136+
// CHECK-NEXT: acc.yield
137+
// CHECK-NEXT: } loc
138+
139+
#pragma acc serial async device_type(nvidia, radeon) async(cond)
140+
{}
141+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
142+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
143+
// CHECK-NEXT: acc.serial async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
144+
// CHECK-NEXT: acc.yield
145+
// CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
146+
147+
#pragma acc serial async(3) device_type(nvidia, radeon) async
148+
{}
149+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
150+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
151+
// CHECK-NEXT: acc.serial async(%[[THREE_CAST]] : si32) {
152+
// CHECK-NEXT: acc.yield
153+
// CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]}
154+
109155
// CHECK-NEXT: cir.return
110156
}

clang/test/SemaOpenACC/compute-construct-async-clause.c

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,48 @@ void Test() {
2020
#pragma acc serial async(1, 2)
2121
while(1);
2222

23+
// expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'kernels' directive}}
24+
// expected-note@+1{{previous clause is here}}
25+
#pragma acc kernels async async
26+
while(1);
27+
28+
// expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'kernels' directive}}
29+
// expected-note@+1{{previous clause is here}}
30+
#pragma acc kernels async(1) async(2)
31+
while(1);
32+
33+
// expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'parallel' directive}}
34+
// expected-note@+1{{previous clause is here}}
35+
#pragma acc parallel async(1) async(2)
36+
while(1);
37+
38+
// expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'serial' directive}}
39+
// expected-note@+1{{previous clause is here}}
40+
#pragma acc serial async(1) async(2)
41+
while(1);
42+
43+
// expected-error@+3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'kernels' directive}}
44+
// expected-note@+2{{previous clause is here}}
45+
// expected-note@+1{{previous clause is here}}
46+
#pragma acc kernels async(1) device_type(*) async(1) async(2)
47+
while(1);
48+
// expected-error@+3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'parallel' directive}}
49+
// expected-note@+2{{previous clause is here}}
50+
// expected-note@+1{{previous clause is here}}
51+
#pragma acc parallel async device_type(*) async async
52+
while(1);
53+
// expected-error@+3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'serial' directive}}
54+
// expected-note@+2{{previous clause is here}}
55+
// expected-note@+1{{previous clause is here}}
56+
#pragma acc serial async(1) device_type(*) async async(2)
57+
while(1);
58+
59+
// expected-error@+3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'parallel' directive}}
60+
// expected-note@+2{{previous clause is here}}
61+
// expected-note@+1{{previous clause is here}}
62+
#pragma acc parallel device_type(*) async async
63+
while(1);
64+
2365
struct NotConvertible{} NC;
2466
// expected-error@+1{{OpenACC clause 'async' requires expression of integer type ('struct NotConvertible' invalid)}}
2567
#pragma acc parallel async(NC)

0 commit comments

Comments
 (0)