Skip to content

[Flang][OpenMP] Add support for schedule clause for GPU #81618

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -925,11 +925,15 @@ class OpenMPIRBuilder {
/// preheader of the loop.
/// \param LoopType Information about type of loop worksharing.
/// It corresponds to type of loop workshare OpenMP pragma.
/// \param ScheduleType Information about scheduling type.
/// \param ChunkSize Value of chunk size for static schedule.
///
/// \returns Point where to insert code after the workshare construct.
InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
InsertPointTy AllocaIP,
omp::WorksharingLoopType LoopType);
omp::WorksharingLoopType LoopType,
omp::OMPScheduleType ScheduleType,
Value *ChunkSize);

/// Modifies the canonical loop to be a statically-scheduled workshare loop.
///
Expand Down
49 changes: 37 additions & 12 deletions llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2728,7 +2728,8 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
static void createTargetLoopWorkshareCall(
OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn,
Value *ThreadChunkSize) {
Type *TripCountTy = TripCount->getType();
Module &M = OMPBuilder->M;
IRBuilder<> &Builder = OMPBuilder->Builder;
Expand All @@ -2751,9 +2752,21 @@ static void createTargetLoopWorkshareCall(

RealArgs.push_back(
Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
switch (LoopType) {
case WorksharingLoopType::DistributeForStaticLoop:
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
ThreadChunkSize, TripCountTy))
: RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
break;
case WorksharingLoopType::DistributeStaticLoop:
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
break;
case WorksharingLoopType::ForStaticLoop:
ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
ThreadChunkSize, TripCountTy))
: RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
break;
}

Builder.CreateCall(RTLFn, RealArgs);
Expand All @@ -2764,7 +2777,7 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
CanonicalLoopInfo *CLI, Value *Ident,
Function &OutlinedFn, Type *ParallelTaskPtr,
const SmallVector<Instruction *, 4> &ToBeDeleted,
WorksharingLoopType LoopType) {
WorksharingLoopType LoopType, Value *ChunkSize) {
IRBuilder<> &Builder = OMPIRBuilder->Builder;
BasicBlock *Preheader = CLI->getPreheader();
Value *TripCount = CLI->getTripCount();
Expand Down Expand Up @@ -2811,17 +2824,18 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,

createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
LoopBodyArg, ParallelTaskPtr, TripCount,
OutlinedFn);
OutlinedFn, ChunkSize);

for (auto &ToBeDeletedItem : ToBeDeleted)
ToBeDeletedItem->eraseFromParent();
CLI->invalidate();
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
InsertPointTy AllocaIP,
WorksharingLoopType LoopType) {
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
WorksharingLoopType LoopType, OMPScheduleType EffectiveScheduleType,
Value *ChunkSize) {

uint32_t SrcLocStrSize;
Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Expand All @@ -2833,6 +2847,16 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
// Instructions which need to be deleted at the end of code generation
SmallVector<Instruction *, 4> ToBeDeleted;

// TODO: Add support for dynamic scheduling
switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
case OMPScheduleType::BaseStatic:
case OMPScheduleType::BaseStaticChunked:
break;
default:
report_fatal_error(
"Unknown/unimplemented schedule kind for target workshare loop", false);
}

OI.OuterAllocaBB = AllocaIP.getBlock();

// Mark the body loop as region which needs to be extracted
Expand Down Expand Up @@ -2906,7 +2930,7 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
OI.PostOutlineCB = [=, ToBeDeletedVec =
std::move(ToBeDeleted)](Function &OutlinedFn) {
workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
ToBeDeletedVec, LoopType);
ToBeDeletedVec, LoopType, ChunkSize);
};
addOutlineInfo(std::move(OI));
return CLI->getAfterIP();
Expand All @@ -2918,11 +2942,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
bool HasSimdModifier, bool HasMonotonicModifier,
bool HasNonmonotonicModifier, bool HasOrderedClause,
WorksharingLoopType LoopType) {
if (Config.isTargetDevice())
return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
HasNonmonotonicModifier, HasOrderedClause);
if (Config.isTargetDevice())
return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType,
EffectiveScheduleType, ChunkSize);

bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
OMPScheduleType::ModifierOrdered;
Expand Down
18 changes: 18 additions & 0 deletions mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
}
llvm.return
}

// Worksharing loop with schedule(static, chunk=2) over iterations 0..9
// (inclusive). Each iteration stores the loop counter into element
// %loop_cnt of the 10 x i32 array pointed to by %arg0. Exercises the
// lowering path that forwards the chunk size to the device runtime call.
llvm.func @target_wsloop_schedule_static_chunked(%arg0: !llvm.ptr ){
  %loop_ub = llvm.mlir.constant(9 : i32) : i32
  %loop_lb = llvm.mlir.constant(0 : i32) : i32
  %loop_step = llvm.mlir.constant(1 : i32) : i32
  // Chunk-size operand consumed by the schedule(static = …) clause below.
  %chunk = llvm.mlir.constant(2 : i32) : i32
  omp.wsloop schedule(static = %chunk : i32) for (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
    %gep = llvm.getelementptr %arg0[0, %loop_cnt] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32>
    llvm.store %loop_cnt, %gep : i32, !llvm.ptr
    omp.yield
  }
  llvm.return
}
}

// CHECK: define void @[[FUNC0:.*]](ptr %[[ARG0:.*]])
Expand All @@ -45,3 +58,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0)

// CHECK: define internal void @[[LOOP_EMPTY_BODY_FN]](i32 %[[LOOP_CNT:.*]])

// CHECK: define void @[[FUNC_SCHEDULE_STATIC_WSLOOP:.*]](ptr %[[ARG1:.*]])
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), ptr @[[LOOP_BODY_SCHEDULE_STATIC_FN:.*]], ptr %[[SCHEDULE_LOOP_ARGS:.*]], i32 10, i32 %[[NUM_THREADS:.*]], i32 2)

// CHECK: define internal void @[[LOOP_BODY_SCHEDULE_STATIC_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
56 changes: 28 additions & 28 deletions openmp/libomptarget/DeviceRTL/src/Workshare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -683,38 +683,38 @@ template <typename Ty> class StaticLoopChunker {
Ty NumIters,
bool OneIterationPerThread) {
Ty KernelIteration = NumBlocks * BlockChunk;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need *ThreadChunk here too?

Let's say we have 5 blocks, and each block does a chunk of 3.
Each block has 11 threads and a chunk size of 2.
What I'd expect to work on in one iteration of the do loop below is:

Iteration   : 0      1      2    3    ...   20      21
Block/Thread: B0T0, B0T0, B0T1, B0T1, ..., B0T10, B0T10
Iteration   : 66     67     68   69    ...  86      87
Block/Thread: B1T0, B1T0, B1T1, B1T1, ..., B1T10, B1T10
...
Iteration   : 264    265   266  267    ...  284     285
Block/Thread: B4T0, B4T0, B4T1, B4T1, ..., B4T10, B4T10

So, 2 * 11 = 22 iterations for a block and 5 * 22 = 110 iterations for the kernel.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Current implementation matches clang chunking scheme.

Ty BlockIV = BId * BlockChunk;

// Start index in the chunked space.
Ty IV = BId * BlockChunk + TId;
ASSERT(IV >= 0, "Bad index");

ASSERT((BlockIV + TId * ThreadChunk) >= 0, "Bad index");
// Cover the entire iteration space, assumptions in the caller might allow
// to simplify this loop to a conditional.
do {

Ty BlockChunkLeft =
BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
Ty ThreadChunkLeft =
ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;

while (ThreadChunkLeft--) {

// Given the blocking it's hard to keep track of what to execute.
if (IV >= NumIters)
return;

// Execute the loop body.
LoopBody(IV, Arg);

if (OneIterationPerThread)
return;

++IV;
Ty ThreadIV = TId * ThreadChunk;
// Cover the block space
while (ThreadIV < BlockChunk) {
Ty ThreadCnt = 0;
// Cover the thread space
while ((ThreadCnt < ThreadChunk) &&
((ThreadIV + ThreadCnt) < BlockChunk)) {
// Index in the chunked space.
Ty IV = BlockIV + ThreadIV + ThreadCnt;

// Given the blocking it's hard to keep track of what to execute.
if (IV >= NumIters)
return;

// Execute the loop body.
LoopBody(IV, Arg);

if (OneIterationPerThread)
return;
++ThreadCnt;
Comment on lines +695 to +711
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might want to make this a for loop with a min condition:

Ty TripCnt = min(ThreadChunk, BlockChunk - ThreadIV, NumIters - ThreadIV - BlockIV);
for (Ty Cnt = 0; Cnt < TripCnt; ++Cnt) {
  Ty IV = BlockIV + ThreadIV + Cnt;
  LoopBody(IV, Arg)
  if (OneIterationPerThread) 
    return;  
}

};
ThreadIV += (NumThreads * ThreadChunk);
}

IV += KernelIteration;

} while (IV < NumIters);
BlockIV += KernelIteration;
} while (BlockIV < NumIters);
}

public:
Expand All @@ -731,8 +731,8 @@ template <typename Ty> class StaticLoopChunker {
// from the `omp` getter and not the mapping directly.
Ty TId = omp_get_thread_num();

// There are no blocks involved here.
Ty BlockChunk = 0;
// There is only one block for the whole iteration space.
Ty BlockChunk = NumIters;
Ty NumBlocks = 1;
Ty BId = 0;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
! Offloading test with a target region and chunks
! REQUIRES: flang
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
! UNSUPPORTED: x86_64-pc-linux-gnu
! UNSUPPORTED: x86_64-pc-linux-gnu-LTO

! RUN: %libomptarget-compile-fortran-generic
! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic

program main
! Offloading test: runs a target-region parallel loop with
! schedule(static,2) and 10 threads, recording for every iteration which
! team/thread executed it as A(i) = team*1000 + thread. The host then
! prints the array so FileCheck can verify the chunked distribution.
use omp_lib
integer :: A(100)
!$omp target map(from:A)
!$omp parallel do schedule(static,2) num_threads(10)
do index_ = 1, 100
   A(index_) = omp_get_team_num() * 1000 + omp_get_thread_num()
end do
!$omp end target
! Fix: dropped the nonstandard comma after the format specifier
! ("write(*,"(A)"), ..." is a legacy extension, rejected by strict
! compilers); the emitted text is unchanged.
write(*,"(A)") "omp target parallel for thread chunk size 2"
call printArray(A)

end program main

subroutine printArray(Array)
! Prints 100 entries encoded as team*1000 + thread, rendering each as
! "B<team>T<thread> " on a single line, followed by a blank line.
integer :: Array(*)
integer :: idx, team, thread
do idx = 1, 100
   team = Array(idx) / 1000
   thread = modulo(Array(idx), 1000)
   write(*, "(A, I0, A, I0, A)", advance="no") "B", team, "T", thread, " "
end do
write(*,'(/)')
end subroutine printArray

!CHECK: omp target parallel for thread chunk size 2

!CHECK-NEXT: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9