Description
#include <iostream>
#include <vector>
#define N 4
// Minimal reproducer: every iteration of a host-parallel loop offloads a '+'
// reduction into its own element a[i]; each element is expected to end up
// equal to N once all iterations have finished.
int main()
{
std::vector<int> avec(N);
// Raw pointer into the vector's storage; a[i] is the reduction list item below.
int* a = avec.data();
// Host-side worksharing loop: iterations, and therefore the target regions
// they launch, may be executed concurrently by different host threads.
#pragma omp parallel for
for(int i = 0; i < N; i++)
{
// Reset the element before it is used as the reduction variable.
a[i] = 0;
// Target region with a '+' reduction on the single array element a[i];
// the inner loop adds 1 to it N times.
#pragma omp target teams distribute parallel for reduction(+:a[i])
for(int j =0 ;j< N; j++)
a[i] += 1;
}
// Print all N elements on one line, prefixed with "results:".
std::cout << "results:";
for(int i = 0; i < N; i++)
std::cout << " " << a[i];
std::cout << std::endl;
}
Compiling and running with: clang++ -fopenmp --offload-arch=sm_80 main.cpp && ./a.out
Expected output:
results: 4 4 4 4
but I get random failures, for example:
results: 0 4 0 0
results: 4 0 4 0
results: 4 0 4 0
Turning on debugging info
OMP_TARGET_OFFLOAD=mandatory OMP_NUM_THREADS=32 LIBOMPTARGET_DEBUG=1 ./a.out >& out && grep "Moving 4 bytes\|result" out
Libomptarget --> Moving 4 bytes (hst:0x00005572a0d9c27c) -> (tgt:0x00007f7112600000)
Libomptarget --> Moving 4 bytes (hst:0x00005572a0d9c278) -> (tgt:0x00007f7112600200)
Libomptarget --> Moving 4 bytes (hst:0x00005572a0d9c274) -> (tgt:0x00007f7112600400)
Libomptarget --> Moving 4 bytes (hst:0x00005572a0d9c270) -> (tgt:0x00007f7112600600)
Libomptarget --> Moving 4 bytes (tgt:0x00007f7112600000) -> (hst:0x00005572a0d9c27c)
Libomptarget --> Moving 4 bytes (tgt:0x00007f7112600400) -> (hst:0x00005572a0d9c274)
Libomptarget --> Moving 4 bytes (tgt:0x00007f7112600600) -> (hst:0x00005572a0d9c270)
Libomptarget --> Moving 4 bytes (tgt:0x00007f7112600200) -> (hst:0x00005572a0d9c278)
results: 4 0 4 0
The mapping and the transfers look correct to me, so the cause of the failure is unclear.
When setting OMP_NUM_THREADS=1, the test passes reliably.