Skip to content

Commit 7481b46

Browse files
committed
[OpenMP] Use default grid value for static grid size
If the user did not provide any static clause to override the grid size, we assume the default grid size as the upper bound and use it to improve code generation through vendor-specific attributes.

Fixes: llvm#64816
Differential Revision: https://reviews.llvm.org/D158382
1 parent c5488c8 commit 7481b46

File tree

2 files changed

+122
-0
lines changed

2 files changed

+122
-0
lines changed

llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "llvm/Analysis/ScalarEvolution.h"
2424
#include "llvm/Analysis/TargetLibraryInfo.h"
2525
#include "llvm/Bitcode/BitcodeReader.h"
26+
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
2627
#include "llvm/IR/Attributes.h"
2728
#include "llvm/IR/CFG.h"
2829
#include "llvm/IR/CallingConv.h"
@@ -37,6 +38,7 @@
3738
#include "llvm/IR/Value.h"
3839
#include "llvm/MC/TargetRegistry.h"
3940
#include "llvm/Support/CommandLine.h"
41+
#include "llvm/Support/ErrorHandling.h"
4042
#include "llvm/Support/FileSystem.h"
4143
#include "llvm/Target/TargetMachine.h"
4244
#include "llvm/Target/TargetOptions.h"
@@ -4121,6 +4123,20 @@ void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc) {
41214123
Builder.CreateCall(Fn, {});
41224124
}
41234125

4126+
/// Return the GPU grid-value descriptor (default workgroup size, etc.) for
/// the architecture \p Kernel targets.
///
/// AMDGPU kernels are recognized by their calling convention and further
/// split by wavefront size, which is read from the function's
/// "target-features" attribute. Otherwise the module triple is consulted for
/// NVPTX. Any other architecture is a fatal error.
static const omp::GV &getGridValue(Function *Kernel) {
  const bool IsAMDKernel =
      Kernel->getCallingConv() == CallingConv::AMDGPU_KERNEL;
  if (IsAMDKernel) {
    StringRef FeatureStr =
        Kernel->getFnAttribute("target-features").getValueAsString();
    // Wave64 and Wave32 targets use different grid-value tables.
    return FeatureStr.count("+wavefrontsize64") != 0
               ? omp::getAMDGPUGridValues<64>()
               : omp::getAMDGPUGridValues<32>();
  }
  const Triple TT(Kernel->getParent()->getTargetTriple());
  if (TT.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}
4139+
41244140
void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
41254141
Function *OutlinedFn, int32_t NumTeams, int32_t NumThreads) {
41264142
if (Config.isTargetDevice()) {
@@ -4135,6 +4151,9 @@ void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
41354151
if (NumTeams > 0)
41364152
OutlinedFn->addFnAttr("omp_target_num_teams", std::to_string(NumTeams));
41374153

4154+
if (NumThreads == -1 && Config.isGPU())
4155+
NumThreads = getGridValue(OutlinedFn).GV_Default_WG_Size;
4156+
41384157
if (NumThreads > 0) {
41394158
if (OutlinedFn->getCallingConv() == CallingConv::AMDGPU_KERNEL) {
41404159
OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
// clang-format off
2+
// RUN: %libomptarget-compile-generic
3+
// RUN: env LIBOMPTARGET_INFO=16 \
4+
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
5+
6+
// UNSUPPORTED: nvptx64-nvidia-cuda
7+
// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
8+
// UNSUPPORTED: aarch64-unknown-linux-gnu
9+
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
10+
// UNSUPPORTED: x86_64-pc-linux-gnu
11+
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
12+
13+
// Opaque helper: optnone keeps this as a real, unoptimized call so the
// compiler cannot fold away the loop bodies (or constant-fold N) below.
__attribute__((optnone)) int optnone() {
  return 1;
}
14+
15+
// Each target region below is followed (at runtime, via LIBOMPTARGET_INFO)
// by a kernel-launch info line that FileCheck matches with the prefix used
// in the RUN lines above. The regions exercise how the compiler derives the
// launch bounds (thread count and MaxFlatWorkGroupSize) from the default
// grid size, thread_limit/num_threads clauses, and ompx_attribute.
int main() {
  // N is large and opaque (optnone factor) so trip counts cannot be folded.
  int N = optnone() * 4098 * 32;

  // No clauses: expect the target's default workgroup size as both the
  // launch width and the flat workgroup upper bound.
  // DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
#pragma omp target teams distribute parallel for simd
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
#pragma omp target teams distribute parallel for simd
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
#pragma omp target teams distribute parallel for simd
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
#pragma omp target
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // Non-constant thread_limit: runtime launches 42 threads but the static
  // bound stays at the architecture maximum (1024).
  // DEFAULT: 42 (MaxFlatWorkGroupSize: 1024
#pragma omp target thread_limit(optnone() * 42)
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // Explicit amdgpu_flat_work_group_size via ompx_attribute pins the bound.
  // DEFAULT: 42 (MaxFlatWorkGroupSize: 42
#pragma omp target thread_limit(optnone() * 42) ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // FIXME: Use the attribute value to imply a thread_limit
  // DEFAULT: {{(128|256)}} (MaxFlatWorkGroupSize: 42
#pragma omp target ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // Runtime-valued clauses on the inner construct: the static bound cannot
  // be tightened, so it remains the architecture maximum.
  // DEFAULT: MaxFlatWorkGroupSize: 1024
#pragma omp target
#pragma omp teams distribute parallel for num_threads(optnone() * 42)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: MaxFlatWorkGroupSize: 1024
#pragma omp target teams distribute parallel for thread_limit(optnone() * 42)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: MaxFlatWorkGroupSize: 1024
#pragma omp target teams distribute parallel for num_threads(optnone() * 42)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // Compile-time-constant clauses: the constant becomes both the launch
  // width and the flat workgroup bound.
  // DEFAULT: 9 (MaxFlatWorkGroupSize: 9
#pragma omp target
#pragma omp teams distribute parallel for num_threads(9)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: 4 (MaxFlatWorkGroupSize: 4
#pragma omp target thread_limit(4)
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: 4 (MaxFlatWorkGroupSize: 4
#pragma omp target
#pragma omp teams distribute parallel for thread_limit(4)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: 9 (MaxFlatWorkGroupSize: 9
#pragma omp target teams distribute parallel for num_threads(9)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: 4 (MaxFlatWorkGroupSize: 4
#pragma omp target teams distribute parallel for simd thread_limit(4)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
}
103+

0 commit comments

Comments
 (0)