-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[Flang][OpenMP] Fix copyin allocatable lowering to MLIR #122097
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-flang-openmp @llvm/pr-subscribers-flang-fir-hlfir Author: Kaviya Rajendiran (kaviya2510) ChangesFixes #113191 Issue: [flang][OpenMP] Runtime segfault when an allocatable variable is used with copyin Rootcause: The value of the threadprivate variable is not being copied from the primary thread to the other threads within a parallel region. As a result it tries to access a null pointer inside a parallel region which causes segfault. Fix: When allocatables used with copyin clause need to ensure that, on entry to any parallel region each thread’s copy of a variable will acquire the allocation status of the primary thread, before copying the value of a threadprivate variable of the primary thread to the threadprivate variable of each other member of the team. Full diff: https://github.com/llvm/llvm-project/pull/122097.diff 2 Files Affected:
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 37f51d74d23f8f..8ccb96cbc035df 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -1298,10 +1298,15 @@ class FirConverter : public Fortran::lower::AbstractConverter {
} else if (isAllocatable &&
(flags.test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate) ||
flags.test(Fortran::semantics::Symbol::Flag::OmpCopyIn))) {
- // For firstprivate and copyin allocatable variables, RHS must be copied
+ // For firstprivate allocatable variables, RHS must be copied
// only when LHS is allocated.
+
+ // For copyin allocatable variables, RHS must be copied to lhs
+ // only when rhs is allocated.
+ hlfir::Entity entity =
+ flags.test(Fortran::semantics::Symbol::Flag::OmpCopyIn) ? rhs : lhs;
hlfir::Entity temp =
- hlfir::derefPointersAndAllocatables(loc, *builder, lhs);
+ hlfir::derefPointersAndAllocatables(loc, *builder, entity);
mlir::Value addr = hlfir::genVariableRawAddress(loc, *builder, temp);
mlir::Value isAllocated = builder->genIsNotNullAddr(loc, addr);
builder->genIfThen(loc, isAllocated)
diff --git a/flang/test/Lower/OpenMP/copyin.f90 b/flang/test/Lower/OpenMP/copyin.f90
index f3d147c10668f4..f884f6a2a3ba03 100644
--- a/flang/test/Lower/OpenMP/copyin.f90
+++ b/flang/test/Lower/OpenMP/copyin.f90
@@ -395,12 +395,19 @@ subroutine pointer()
! CHECK: %[[VAL_4:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatableEp"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_5]]#0 realloc : !fir.box<!fir.heap<!fir.array<?xi32>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! CHECK: %[[VAL_7:.*]] = fir.box_addr %[[VAL_6]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.heap<!fir.array<?xi32>>) -> i64
+! CHECK: %[[C0_I64:.*]] = arith.constant 0 : i64
+! CHECK: %[[VAL_9:.*]] = arith.cmpi ne, %[[VAL_8]], %c0_i64 : i64
+! CHECK: fir.if %9 {
+! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_5]]#0 realloc : !fir.box<!fir.heap<!fir.array<?xi32>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! CHECK: }
! CHECK: omp.barrier
-! CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-! CHECK: %[[VAL_8:.*]] = fir.box_addr %[[VAL_7]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
-! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
-! CHECK: fir.call @_QPsub8(%[[VAL_9]]) fastmath<contract> : (!fir.ref<!fir.array<?xi32>>) -> ()
+! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! CHECK: %[[VAL_12:.*]] = fir.box_addr %[[VAL_11]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+! CHECK: fir.call @_QPsub8(%[[VAL_13]]) fastmath<contract> : (!fir.ref<!fir.array<?xi32>>) -> ()
! CHECK: omp.terminator
! CHECK: }
! CHECK: return
@@ -422,7 +429,7 @@ subroutine allocatable()
! CHECK: omp.parallel {
! CHECK: %[[VAL_4:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref<!fir.box<!fir.heap<i32>>> -> !fir.ref<!fir.box<!fir.heap<i32>>>
! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatable2Ea"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
-! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>>
! CHECK: %[[VAL_7:.*]] = fir.box_addr %[[VAL_6]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.heap<i32>) -> i64
! CHECK: %[[VAL_9:.*]] = arith.constant 0 : i64
@@ -448,3 +455,48 @@ subroutine allocatable2()
a = 1
!$omp end parallel
end subroutine
+
+! CHECK: func.func @_QPallocatable3() {
+! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFallocatable3Ea) : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatable3Ea"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
+! CHECK: %[[VAL_2:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref<!fir.box<!fir.heap<i32>>> -> !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatable3Ea"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
+! CHECK: %[[VAL_4:.*]] = fir.allocmem i32 {fir.must_be_heap = true, uniq_name = "_QFallocatable3Ea.alloc"}
+! CHECK: %[[VAL_5:.*]] = fir.embox %[[VAL_4]] : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
+! CHECK: fir.store %[[VAL_5]] to %[[VAL_3]]#1 : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: %[[C10_I32:.*]] = arith.constant 10 : i32
+! CHECK: hlfir.assign %[[C10_I32]] to %[[VAL_3]]#0 realloc : i32, !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: omp.parallel {
+! CHECK: %[[VAL_6:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref<!fir.box<!fir.heap<i32>>> -> !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatable3Ea"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
+! CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: %[[VAL_9:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
+! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (!fir.heap<i32>) -> i64
+! CHECK: %[[C10_I64:.*]] = arith.constant 0 : i64
+! CHECK: %[[VAL_11:.*]] = arith.cmpi ne, %[[VAL_10]], %[[C10_I64]] : i64
+! CHECK: fir.if %[[VAL_11]] {
+! CHECK: %[[VAL_16:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: %[[VAL_17:.*]] = fir.box_addr %[[VAL_16]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
+! CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.heap<i32>
+! CHECK: hlfir.assign %[[VAL_18]] to %[[VAL_7]]#0 realloc : i32, !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: }
+! CHECK: omp.barrier
+! CHECK: %[[VAL_12:.*]] = fir.load %7#0 : !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: %[[VAL_13:.*]] = fir.box_addr %[[VAL_12]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
+! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_13]] : !fir.heap<i32>
+! CHECK: %[[C1_I32:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_15:.*]]= arith.addi %[[VAL_14]], %[[C1_I32]] : i32
+! CHECK: hlfir.assign %[[VAL_15]]to %[[VAL_7]]#0 realloc : i32, !fir.ref<!fir.box<!fir.heap<i32>>>
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: return
+! CHECK: }
+subroutine allocatable3()
+ integer, allocatable, save :: a
+ !$omp threadprivate(a)
+ allocate(a)
+ a = 10
+ !$omp parallel copyin(a)
+ a = a + 1
+ !$omp end parallel
+end subroutine
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The patch makes sense for that direction (original item allocated, thread copy unallocated), but I think the logic is probably broken in the other direction too (original item unallocated, copy allocated).
The requirement for copyin in OpenMP spec says that "If the original list item does not
have the POINTER attribute but has the allocation status of unallocated, each copy will have the same status."
So the copy should be unallocated in the else case if it allocated.
Example:
subroutine allocatable4()
use omp_lib, only: omp_get_thread_num
integer, allocatable, save :: a
!$omp threadprivate(a)
!$omp parallel
allocate(a)
!$omp end parallel
deallocate(a)
!$omp parallel copyin(a)
print *, omp_get_thread_num(), allocated(a)
!$omp end parallel
end subroutine
call allocatable4()
end
This should print (in any order):
0 F
3 F
1 F
2 F
But I think it will print T
with flang outside of the main thread after this patch (and would crash before the patch).
flang/lib/Lower/Bridge.cpp
Outdated
@@ -1298,10 +1298,15 @@ class FirConverter : public Fortran::lower::AbstractConverter { | |||
} else if (isAllocatable && | |||
(flags.test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate) || | |||
flags.test(Fortran::semantics::Symbol::Flag::OmpCopyIn))) { | |||
// For firstprivate and copyin allocatable variables, RHS must be copied | |||
// For firstprivate allocatable variables, RHS must be copied |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the firstprivate case could also use the RHS allocation status too for simplicity (the LHS allocation status was just copied by the code generated by createHostAssociateVarClone
, so it does not really matter whether RHS/LHS is used here).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have added a separate check for copyin clause. So I think this change isn't necessary.
Thanks for the comments @jeanPerier. |
b558774
to
827d1ff
Compare
Hi @jeanPerier ,I have addressed your review comments in the latest commit. Please take a look it and reply me back if you have any suggestions. |
…ted or allocated status from the original list item
827d1ff
to
bc6750d
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for addressing my comments, LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks.
Thanks for the review @tblah @jeanPerier @luporl . |
Fixes #113191
Issue: [flang][OpenMP] Runtime segfault when an allocatable variable is used with copyin
Rootcause: The value of the threadprivate variable is not being copied from the primary thread to the other threads within a parallel region. As a result it tries to access a null pointer inside a parallel region which causes segfault.
Fix: When allocatables used with copyin clause need to ensure that, on entry to any parallel region each thread’s copy of a variable will acquire the allocation status of the primary thread, before copying the value of a threadprivate variable of the primary thread to the threadprivate variable of each other member of the team.