Skip to content

[Flang][OpenMP] Add support for schedule clause for GPU #81618

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -925,11 +925,15 @@ class OpenMPIRBuilder {
/// preheader of the loop.
/// \param LoopType Information about type of loop worksharing.
/// It corresponds to type of loop workshare OpenMP pragma.
/// \param ScheduleType Information about scheduling type.
/// \param ChunkSize Value of chunk size for static schedule.
///
/// \returns Point where to insert code after the workshare construct.
InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
InsertPointTy AllocaIP,
omp::WorksharingLoopType LoopType);
omp::WorksharingLoopType LoopType,
omp::OMPScheduleType ScheduleType,
Value *ChunkSize);

/// Modifies the canonical loop to be a statically-scheduled workshare loop.
///
Expand Down
49 changes: 37 additions & 12 deletions llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2728,7 +2728,8 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
static void createTargetLoopWorkshareCall(
OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn,
Value *ThreadChunkSize) {
Type *TripCountTy = TripCount->getType();
Module &M = OMPBuilder->M;
IRBuilder<> &Builder = OMPBuilder->Builder;
Expand All @@ -2751,9 +2752,21 @@ static void createTargetLoopWorkshareCall(

RealArgs.push_back(
Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
switch (LoopType) {
case WorksharingLoopType::DistributeForStaticLoop:
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
ThreadChunkSize, TripCountTy))
: RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
break;
case WorksharingLoopType::DistributeStaticLoop:
RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
break;
case WorksharingLoopType::ForStaticLoop:
ThreadChunkSize ? RealArgs.push_back(Builder.CreateZExtOrTrunc(
ThreadChunkSize, TripCountTy))
: RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
break;
}

Builder.CreateCall(RTLFn, RealArgs);
Expand All @@ -2764,7 +2777,7 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
CanonicalLoopInfo *CLI, Value *Ident,
Function &OutlinedFn, Type *ParallelTaskPtr,
const SmallVector<Instruction *, 4> &ToBeDeleted,
WorksharingLoopType LoopType) {
WorksharingLoopType LoopType, Value *ChunkSize) {
IRBuilder<> &Builder = OMPIRBuilder->Builder;
BasicBlock *Preheader = CLI->getPreheader();
Value *TripCount = CLI->getTripCount();
Expand Down Expand Up @@ -2811,17 +2824,18 @@ workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,

createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
LoopBodyArg, ParallelTaskPtr, TripCount,
OutlinedFn);
OutlinedFn, ChunkSize);

for (auto &ToBeDeletedItem : ToBeDeleted)
ToBeDeletedItem->eraseFromParent();
CLI->invalidate();
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
InsertPointTy AllocaIP,
WorksharingLoopType LoopType) {
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
WorksharingLoopType LoopType, OMPScheduleType EffectiveScheduleType,
Value *ChunkSize) {

uint32_t SrcLocStrSize;
Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Expand All @@ -2833,6 +2847,16 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
// Instructions which need to be deleted at the end of code generation
SmallVector<Instruction *, 4> ToBeDeleted;

// TODO: Add support for dynamic scheduling
switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
case OMPScheduleType::BaseStatic:
case OMPScheduleType::BaseStaticChunked:
break;
default:
report_fatal_error(
"Unknown/unimplemented schedule kind for target workshare loop", false);
}

OI.OuterAllocaBB = AllocaIP.getBlock();

// Mark the body loop as region which needs to be extracted
Expand Down Expand Up @@ -2906,7 +2930,7 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
OI.PostOutlineCB = [=, ToBeDeletedVec =
std::move(ToBeDeleted)](Function &OutlinedFn) {
workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
ToBeDeletedVec, LoopType);
ToBeDeletedVec, LoopType, ChunkSize);
};
addOutlineInfo(std::move(OI));
return CLI->getAfterIP();
Expand All @@ -2918,11 +2942,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
bool HasSimdModifier, bool HasMonotonicModifier,
bool HasNonmonotonicModifier, bool HasOrderedClause,
WorksharingLoopType LoopType) {
if (Config.isTargetDevice())
return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
HasNonmonotonicModifier, HasOrderedClause);
if (Config.isTargetDevice())
return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType,
EffectiveScheduleType, ChunkSize);

bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
OMPScheduleType::ModifierOrdered;
Expand Down
18 changes: 18 additions & 0 deletions mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
}
llvm.return
}

// Worksharing loop with schedule(static, chunk=2) over iterations 0..9
// (inclusive). Each iteration stores the loop counter into element
// %loop_cnt of the 10 x i32 array pointed to by %arg0. Exercises the
// lowering path that forwards the chunk size to the device runtime call.
llvm.func @target_wsloop_schedule_static_chunked(%arg0: !llvm.ptr ){
  %loop_ub = llvm.mlir.constant(9 : i32) : i32
  %loop_lb = llvm.mlir.constant(0 : i32) : i32
  %loop_step = llvm.mlir.constant(1 : i32) : i32
  // Chunk-size operand consumed by the schedule(static = …) clause below.
  %chunk = llvm.mlir.constant(2 : i32) : i32
  omp.wsloop schedule(static = %chunk : i32) for (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) {
    %gep = llvm.getelementptr %arg0[0, %loop_cnt] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32>
    llvm.store %loop_cnt, %gep : i32, !llvm.ptr
    omp.yield
  }
  llvm.return
}
}

// CHECK: define void @[[FUNC0:.*]](ptr %[[ARG0:.*]])
Expand All @@ -45,3 +58,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0)

// CHECK: define internal void @[[LOOP_EMPTY_BODY_FN]](i32 %[[LOOP_CNT:.*]])

// CHECK: define void @[[FUNC_SCHEDULE_STATIC_WSLOOP:.*]](ptr %[[ARG1:.*]])
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), ptr @[[LOOP_BODY_SCHEDULE_STATIC_FN:.*]], ptr %[[SCHEDULE_LOOP_ARGS:.*]], i32 10, i32 %[[NUM_THREADS:.*]], i32 2)

// CHECK: define internal void @[[LOOP_BODY_SCHEDULE_STATIC_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
56 changes: 28 additions & 28 deletions openmp/libomptarget/DeviceRTL/src/Workshare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -683,38 +683,38 @@ template <typename Ty> class StaticLoopChunker {
Ty NumIters,
bool OneIterationPerThread) {
Ty KernelIteration = NumBlocks * BlockChunk;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need *ThreadChunk here too?

Let's say we have 5 blocks, and each block does a chunk of 3.
Each block has 11 threads and a chunk size of 2.
What I'd expect to work on in one iteration of the do loop below is:

Iteration   : 0      1      2    3    ...   20      21
Block/Thread: B0T0, B0T0, B0T1, B0T1, ..., B0T10, B0T10
Iteration   : 66     67     68   69    ...  86      87
Block/Thread: B1T0, B1T0, B1T1, B1T1, ..., B1T10, B1T10
...
Iteration   : 264    265   266  267    ...  284     285
Block/Thread: B4T0, B4T0, B4T1, B4T1, ..., B4T10, B4T10

So, 2 * 11 = 22 iterations for a block and 5 * 22 = 110 iterations for the kernel.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Current implementation matches clang chunking scheme.

Ty BlockIV = BId * BlockChunk;

// Start index in the chunked space.
Ty IV = BId * BlockChunk + TId;
ASSERT(IV >= 0, "Bad index");

ASSERT((BlockIV + TId * ThreadChunk) >= 0, "Bad index");
// Cover the entire iteration space, assumptions in the caller might allow
// to simplify this loop to a conditional.
do {

Ty BlockChunkLeft =
BlockChunk >= TId * ThreadChunk ? BlockChunk - TId * ThreadChunk : 0;
Ty ThreadChunkLeft =
ThreadChunk <= BlockChunkLeft ? ThreadChunk : BlockChunkLeft;

while (ThreadChunkLeft--) {

// Given the blocking it's hard to keep track of what to execute.
if (IV >= NumIters)
return;

// Execute the loop body.
LoopBody(IV, Arg);

if (OneIterationPerThread)
return;

++IV;
Ty ThreadIV = TId * ThreadChunk;
// Cover the block space
while (ThreadIV < BlockChunk) {
Ty ThreadCnt = 0;
// Cover the thread space
while ((ThreadCnt < ThreadChunk) &&
((ThreadIV + ThreadCnt) < BlockChunk)) {
// Index in the chunked space.
Ty IV = BlockIV + ThreadIV + ThreadCnt;

// Given the blocking it's hard to keep track of what to execute.
if (IV >= NumIters)
return;

// Execute the loop body.
LoopBody(IV, Arg);

if (OneIterationPerThread)
return;
++ThreadCnt;
Comment on lines +695 to +711
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might want to make this a for loop with a min condition:

Ty TripCnt = min(ThreadChunk, BlockChunk - ThreadIV, NumIters - ThreadIV - BlockIV);
for (Ty Cnt = 0; Cnt < TripCnt; ++Cnt) {
  Ty IV = BlockIV + ThreadIV + Cnt;
  LoopBody(IV, Arg)
  if (OneIterationPerThread) 
    return;  
}

};
ThreadIV += (NumThreads * ThreadChunk);
}

IV += KernelIteration;

} while (IV < NumIters);
BlockIV += KernelIteration;
} while (BlockIV < NumIters);
}

public:
Expand All @@ -731,8 +731,8 @@ template <typename Ty> class StaticLoopChunker {
// from the `omp` getter and not the mapping directly.
Ty TId = omp_get_thread_num();

// There are no blocks involved here.
Ty BlockChunk = 0;
// There is only one block for the whole iteration space.
Ty BlockChunk = NumIters;
Ty NumBlocks = 1;
Ty BId = 0;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
! Offloading test with a target region and chunks
! REQUIRES: flang
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
! UNSUPPORTED: x86_64-pc-linux-gnu
! UNSUPPORTED: x86_64-pc-linux-gnu-LTO

! RUN: %libomptarget-compile-fortran-generic
! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic

program main
! Offloading test: runs a target-region parallel loop with
! schedule(static,2) and 10 threads, recording for every iteration which
! team/thread executed it as A(i) = team*1000 + thread. The host then
! prints the array so FileCheck can verify the chunked distribution.
use omp_lib
integer :: A(100)
!$omp target map(from:A)
!$omp parallel do schedule(static,2) num_threads(10)
do index_ = 1, 100
   A(index_) = omp_get_team_num() * 1000 + omp_get_thread_num()
end do
!$omp end target
! Fix: dropped the nonstandard comma after the format specifier
! ("write(*,"(A)"), ..." is a legacy extension, rejected by strict
! compilers); the emitted text is unchanged.
write(*,"(A)") "omp target parallel for thread chunk size 2"
call printArray(A)

end program main

subroutine printArray(Array)
! Prints 100 entries encoded as team*1000 + thread, rendering each as
! "B<team>T<thread> " on a single line, followed by a blank line.
integer :: Array(*)
integer :: idx, team, thread
do idx = 1, 100
   team = Array(idx) / 1000
   thread = modulo(Array(idx), 1000)
   write(*, "(A, I0, A, I0, A)", advance="no") "B", team, "T", thread, " "
end do
write(*,'(/)')
end subroutine printArray

!CHECK: omp target parallel for thread chunk size 2

!CHECK-NEXT: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9
!CHECK-SAME: B0T0 B0T0 B0T1 B0T1 B0T2 B0T2 B0T3 B0T3 B0T4 B0T4
!CHECK-SAME: B0T5 B0T5 B0T6 B0T6 B0T7 B0T7 B0T8 B0T8 B0T9 B0T9