Skip to content

Commit f8c968f

Browse files
committed
[TSan][OpenMP][Archer] Treat all reduction operations as atomic
This patch rebases https://reviews.llvm.org/D108046 to the new ThreadSanitizer runtime. The idea of the new ThreadSanitizer Annotation function is to promote all memory accesses to be treated and logged as they would be explicit atomic accesses. I used the performance benchmark from the initial fiber review (https://reviews.llvm.org/D54889#1343582). The TSan-specific changes of this PR increase the execution time from 8.37 to 8.52 seconds on my system, which is a 1.8% runtime increase. The current tests for this new Annotation are integrated into the tests for the OpenMP-specific tool Archer. The new Annotations are used in Archer to promote all memory accesses performed to implement an OpenMP reduction as being atomic accesses. With these changes, ThreadSanitizer+Archer successfully detect the race in openmp/tools/archer/tests/races/parallel-for-array-reduction-no-barrier.c. The challenge in this test is to detect the race between the memory access from the primary thread before the reduction (line 30), which is not synchronized with the OpenMP reduction (line 31). The OpenMP CodeGen generates three different code patterns for the reduction from which the OpenMP runtime chooses one implementation at runtime. The new analysis is only compatible with two of these code patterns, therefore we skip generation of the third code pattern, if the TSan flag is present during compilation. Under review as llvm#74631
1 parent 49e6e3b commit f8c968f

14 files changed

+333
-19
lines changed

clang/lib/CodeGen/CGOpenMPRuntime.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5019,13 +5019,16 @@ llvm::Function *CGOpenMPRuntime::emitReductionFunction(
50195019
Args.push_back(&RHSArg);
50205020
const auto &CGFI =
50215021
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
5022+
CodeGenFunction CGF(CGM);
50225023
std::string Name = getReductionFuncName(ReducerName);
50235024
auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
50245025
llvm::GlobalValue::InternalLinkage, Name,
50255026
&CGM.getModule());
5027+
if (CGF.SanOpts.has(SanitizerKind::Thread)) {
5028+
return Fn;
5029+
}
50265030
CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
50275031
Fn->setDoesNotRecurse();
5028-
CodeGenFunction CGF(CGM);
50295032
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
50305033

50315034
// Dst = (void*[n])(LHSArg);
@@ -5217,6 +5220,11 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
52175220
llvm::Function *ReductionFn = emitReductionFunction(
52185221
CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
52195222
Privates, LHSExprs, RHSExprs, ReductionOps);
5223+
llvm::Value *ReductionFnP = ReductionFn;
5224+
if (CGF.SanOpts.has(SanitizerKind::Thread)) {
5225+
ReductionFnP = llvm::ConstantPointerNull::get(
5226+
llvm::PointerType::get(ReductionFn->getFunctionType(), 0));
5227+
}
52205228

52215229
// 3. Create static kmp_critical_name lock = { 0 };
52225230
std::string Name = getName({"reduction"});
@@ -5235,8 +5243,8 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
52355243
CGF.Builder.getInt32(RHSExprs.size()), // i32 <n>
52365244
ReductionArrayTySize, // size_type sizeof(RedList)
52375245
RL, // void *RedList
5238-
ReductionFn, // void (*) (void *, void *) <reduce_func>
5239-
Lock // kmp_critical_name *&<lock>
5246+
ReductionFnP, // void (*) (void *, void *) <reduce_func>
5247+
Lock // kmp_critical_name *&<lock>
52405248
};
52415249
llvm::Value *Res = CGF.EmitRuntimeCall(
52425250
OMPBuilder.getOrCreateRuntimeFunction(

compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,16 @@ void INTERFACE_ATTRIBUTE AnnotateBenignRace(
266266
BenignRaceImpl(f, l, mem, 1, desc);
267267
}
268268

269+
void INTERFACE_ATTRIBUTE AnnotateAllAtomicBegin(char *f, int l) {
270+
SCOPED_ANNOTATION(AnnotateAllAtomicBegin);
271+
ThreadAtomicBegin(thr, pc);
272+
}
273+
274+
void INTERFACE_ATTRIBUTE AnnotateAllAtomicEnd(char *f, int l) {
275+
SCOPED_ANNOTATION(AnnotateAllAtomicEnd);
276+
ThreadAtomicEnd(thr);
277+
}
278+
269279
void INTERFACE_ATTRIBUTE AnnotateIgnoreReadsBegin(char *f, int l) {
270280
SCOPED_ANNOTATION(AnnotateIgnoreReadsBegin);
271281
ThreadIgnoreBegin(thr, pc);

compiler-rt/lib/tsan/rtl/tsan_rtl.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1053,6 +1053,21 @@ void ThreadIgnoreEnd(ThreadState *thr) {
10531053
}
10541054
}
10551055

1056+
void ThreadAtomicBegin(ThreadState* thr, uptr pc) {
1057+
thr->all_atomic++;
1058+
// CHECK_GT(thr->ignore_reads_and_writes, 0);
1059+
CHECK_EQ(thr->all_atomic, 1);
1060+
thr->fast_state.SetAtomicBit();
1061+
}
1062+
1063+
void ThreadAtomicEnd(ThreadState *thr) {
1064+
CHECK_GT(thr->all_atomic, 0);
1065+
thr->all_atomic--;
1066+
if (thr->all_atomic == 0) {
1067+
thr->fast_state.ClearAtomicBit();
1068+
}
1069+
}
1070+
10561071
#if !SANITIZER_GO
10571072
extern "C" SANITIZER_INTERFACE_ATTRIBUTE
10581073
uptr __tsan_testonly_shadow_stack_current_size() {

compiler-rt/lib/tsan/rtl/tsan_rtl.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ struct ThreadState {
182182
// for better performance.
183183
int ignore_reads_and_writes;
184184
int suppress_reports;
185+
int all_atomic;
185186
// Go does not support ignores.
186187
#if !SANITIZER_GO
187188
IgnoreSet mop_ignore_set;
@@ -550,6 +551,8 @@ void MemoryRangeImitateWrite(ThreadState *thr, uptr pc, uptr addr, uptr size);
550551
void MemoryRangeImitateWriteOrResetRange(ThreadState *thr, uptr pc, uptr addr,
551552
uptr size);
552553

554+
void ThreadAtomicBegin(ThreadState *thr, uptr pc);
555+
void ThreadAtomicEnd(ThreadState *thr);
553556
void ThreadIgnoreBegin(ThreadState *thr, uptr pc);
554557
void ThreadIgnoreEnd(ThreadState *thr);
555558
void ThreadIgnoreSyncBegin(ThreadState *thr, uptr pc);

compiler-rt/lib/tsan/rtl/tsan_shadow.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#ifndef TSAN_SHADOW_H
1010
#define TSAN_SHADOW_H
1111

12+
#include "sanitizer_common/sanitizer_common.h"
1213
#include "tsan_defs.h"
1314

1415
namespace __tsan {
@@ -21,8 +22,8 @@ class FastState {
2122
part_.unused0_ = 0;
2223
part_.sid_ = static_cast<u8>(kFreeSid);
2324
part_.epoch_ = static_cast<u16>(kEpochLast);
24-
part_.unused1_ = 0;
2525
part_.ignore_accesses_ = false;
26+
part_.all_atomic_ = false;
2627
}
2728

2829
void SetSid(Sid sid) { part_.sid_ = static_cast<u8>(sid); }
@@ -37,14 +38,18 @@ class FastState {
3738
void ClearIgnoreBit() { part_.ignore_accesses_ = 0; }
3839
bool GetIgnoreBit() const { return part_.ignore_accesses_; }
3940

41+
void SetAtomicBit() { part_.all_atomic_ = 1; }
42+
void ClearAtomicBit() { part_.all_atomic_ = 0; }
43+
bool GetAtomicBit() const { return part_.all_atomic_; }
44+
4045
private:
4146
friend class Shadow;
4247
struct Parts {
4348
u32 unused0_ : 8;
4449
u32 sid_ : 8;
4550
u32 epoch_ : kEpochBits;
46-
u32 unused1_ : 1;
4751
u32 ignore_accesses_ : 1;
52+
u32 all_atomic_ : 1;
4853
};
4954
union {
5055
Parts part_;

openmp/tools/archer/ompt-tsan.cpp

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,6 @@ void __attribute__((weak)) __tsan_flush_memory() {}
169169
// Thread Sanitizer is a tool that finds races in code.
170170
// See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations .
171171
// tsan detects these exact functions by name.
172-
extern "C" {
173172
static void (*AnnotateHappensAfter)(const char *, int, const volatile void *);
174173
static void (*AnnotateHappensBefore)(const char *, int, const volatile void *);
175174
static void (*AnnotateIgnoreWritesBegin)(const char *, int);
@@ -183,7 +182,8 @@ static void *(*__tsan_get_current_fiber)();
183182
static void *(*__tsan_create_fiber)(unsigned flags);
184183
static void (*__tsan_destroy_fiber)(void *fiber);
185184
static void (*__tsan_switch_to_fiber)(void *fiber, unsigned flags);
186-
}
185+
static void (*AnnotateReductionBegin)(const char *, int);
186+
static void (*AnnotateReductionEnd)(const char *, int);
187187

188188
// This marker is used to define a happens-before arc. The race detector will
189189
// infer an arc from the begin to the end when they share the same pointer
@@ -199,6 +199,10 @@ static void (*__tsan_switch_to_fiber)(void *fiber, unsigned flags);
199199
// Resume checking for racy writes.
200200
#define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
201201

202+
// Maps to either AnnotateAllAtomics or AnnotateIgnoreWrites
203+
#define TsanReductionBegin() AnnotateReductionBegin(__FILE__, __LINE__)
204+
#define TsanReductionEnd() AnnotateReductionEnd(__FILE__, __LINE__)
205+
202206
// We don't really delete the clock for now
203207
#define TsanDeleteClock(cv)
204208

@@ -841,7 +845,7 @@ static void ompt_tsan_sync_region(ompt_sync_region_t kind,
841845
// 2. execution of another task.
842846
// For the latter case we will re-enable tracking in task_switch.
843847
Data->InBarrier = true;
844-
TsanIgnoreWritesBegin();
848+
TsanReductionBegin();
845849
}
846850

847851
break;
@@ -874,7 +878,7 @@ static void ompt_tsan_sync_region(ompt_sync_region_t kind,
874878
if (hasReductionCallback < ompt_set_always) {
875879
// We want to track writes after the barrier again.
876880
Data->InBarrier = false;
877-
TsanIgnoreWritesEnd();
881+
TsanReductionEnd();
878882
}
879883

880884
char BarrierIndex = Data->BarrierIndex;
@@ -929,7 +933,7 @@ static void ompt_tsan_reduction(ompt_sync_region_t kind,
929933
case ompt_scope_begin:
930934
switch (kind) {
931935
case ompt_sync_region_reduction:
932-
TsanIgnoreWritesBegin();
936+
TsanReductionBegin();
933937
break;
934938
default:
935939
break;
@@ -938,7 +942,7 @@ static void ompt_tsan_reduction(ompt_sync_region_t kind,
938942
case ompt_scope_end:
939943
switch (kind) {
940944
case ompt_sync_region_reduction:
941-
TsanIgnoreWritesEnd();
945+
TsanReductionEnd();
942946
break;
943947
default:
944948
break;
@@ -1122,7 +1126,7 @@ static void ompt_tsan_task_schedule(ompt_data_t *first_task_data,
11221126
FromTask->InBarrier) {
11231127
// We want to ignore writes in the runtime code during barriers,
11241128
// but not when executing tasks with user code!
1125-
TsanIgnoreWritesEnd();
1129+
TsanReductionEnd();
11261130
}
11271131

11281132
// task completed execution
@@ -1164,7 +1168,7 @@ static void ompt_tsan_task_schedule(ompt_data_t *first_task_data,
11641168
// Legacy handling for missing reduction callback
11651169
if (hasReductionCallback < ompt_set_always && ToTask->InBarrier) {
11661170
// We re-enter runtime code which currently performs a barrier.
1167-
TsanIgnoreWritesBegin();
1171+
TsanReductionBegin();
11681172
}
11691173

11701174
// task suspended
@@ -1349,6 +1353,14 @@ static int ompt_tsan_initialize(ompt_function_lookup_t lookup, int device_num,
13491353
findTsanFunction(__tsan_destroy_fiber, (void (*)(void *)));
13501354
findTsanFunction(__tsan_get_current_fiber, (void *(*)()));
13511355
findTsanFunction(__tsan_switch_to_fiber, (void (*)(void *, unsigned int)));
1356+
findTsanFunctionName(AnnotateReductionBegin, AnnotateAllAtomicBegin, (void (*)(const char *, int)));
1357+
findTsanFunctionName(AnnotateReductionEnd, AnnotateAllAtomicEnd, (void (*)(const char *, int)));
1358+
if (!AnnotateReductionBegin) {
1359+
AnnotateReductionBegin = AnnotateIgnoreWritesBegin;
1360+
AnnotateReductionEnd = AnnotateIgnoreWritesEnd;
1361+
if (archer_flags->verbose)
1362+
std::cout << "Archer uses fallback solution for reductions: might miss some race" << std::endl;
1363+
}
13521364

13531365
SET_CALLBACK(thread_begin);
13541366
SET_CALLBACK(thread_end);
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* parallel-reduction.c -- Archer testcase
3+
*/
4+
5+
//===----------------------------------------------------------------------===//
6+
//
7+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8+
//
9+
// See tools/archer/LICENSE.txt for details.
10+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
// Number of threads is empirical: We need enough (>4) threads so that
15+
// the reduction is really performed hierarchically in the barrier!
16+
17+
// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run-race | FileCheck %s
18+
// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run-race | FileCheck %s
19+
20+
// REQUIRES: tsan
21+
#include <omp.h>
22+
#include <stdio.h>
23+
24+
int main(int argc, char *argv[]) {
25+
int var[10]={0,1,2,3,4,5,6,7,8,9};
26+
27+
#pragma omp parallel
28+
{
29+
#pragma omp masked
30+
var[5] = 23;
31+
#pragma omp for reduction(+ : var)
32+
for (int i = 0; i < 1000; i++)
33+
{ var[i%10]++; }
34+
}
35+
fprintf(stderr, "DONE\n");
36+
int error = (var[5] != 123);
37+
return error;
38+
}
39+
40+
// CHECK: ThreadSanitizer: data race
41+
// CHECK: DONE
42+
// CHECK: ThreadSanitizer: reported
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* parallel-reduction.c -- Archer testcase
3+
*/
4+
5+
//===----------------------------------------------------------------------===//
6+
//
7+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8+
//
9+
// See tools/archer/LICENSE.txt for details.
10+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
// Number of threads is empirical: We need enough (>4) threads so that
15+
// the reduction is really performed hierarchically in the barrier!
16+
17+
// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run-race | FileCheck %s
18+
// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run-race | FileCheck %s
19+
20+
// REQUIRES: tsan
21+
#include <omp.h>
22+
#include <stdio.h>
23+
24+
int main(int argc, char *argv[]) {
25+
int var[10]={0,1,2,3,4,5,6,7,8,9};
26+
27+
#pragma omp parallel
28+
{
29+
#pragma omp for reduction(+ : var) nowait
30+
for (int i = 0; i < 1000; i++)
31+
{ var[i%10]++; }
32+
#pragma omp masked
33+
var[5] += 23;
34+
}
35+
fprintf(stderr, "DONE\n");
36+
int error = (var[5] != 123);
37+
return error;
38+
}
39+
40+
// CHECK: ThreadSanitizer: data race
41+
// CHECK: DONE
42+
// CHECK: ThreadSanitizer: reported
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* parallel-reduction.c -- Archer testcase
3+
*/
4+
5+
//===----------------------------------------------------------------------===//
6+
//
7+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8+
//
9+
// See tools/archer/LICENSE.txt for details.
10+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
// Number of threads is empirical: We need enough (>4) threads so that
15+
// the reduction is really performed hierarchically in the barrier!
16+
17+
// RUN: env OMP_NUM_THREADS=3 %libarcher-compile-and-run-race | FileCheck %s
18+
// RUN: env OMP_NUM_THREADS=7 %libarcher-compile-and-run-race | FileCheck %s
19+
20+
// REQUIRES: tsan
21+
#include <omp.h>
22+
#include <stdio.h>
23+
24+
int main(int argc, char *argv[]) {
25+
int var = 0;
26+
27+
#pragma omp parallel
28+
{
29+
#pragma omp masked
30+
var = 23;
31+
#pragma omp for reduction(+ : var)
32+
for (int i = 0; i < 100; i++)
33+
{ var++; }
34+
}
35+
fprintf(stderr, "DONE\n");
36+
int error = (var != 123);
37+
return error;
38+
}
39+
40+
// CHECK: ThreadSanitizer: data race
41+
// CHECK: DONE
42+
// CHECK: ThreadSanitizer: reported

0 commit comments

Comments
 (0)