[flang][cuda] Convert global allocation for pinned variable (#106807)

clementval · web-flow · commit dfc21acdfa0e · 2024-09-03T14:27:16.000-07:00
ALLOCATE/DEALLOCATE statements for module allocatable variables with the
pinned attribute can be lowered to the standard runtime calls and need no
further action, since each of these variables has a single (non-duplicated)
descriptor that lives on the host.
diff --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp
@@ -33,17 +33,25 @@ using namespace Fortran::runtime::cuda;
 namespace {
 
 template <typename OpTy>
-static bool isBoxGlobal(OpTy op) {
+static bool needDoubleDescriptor(OpTy op) {
   if (auto declareOp =
           mlir::dyn_cast_or_null<fir::DeclareOp>(op.getBox().getDefiningOp())) {
     if (mlir::isa_and_nonnull<fir::AddrOfOp>(
-            declareOp.getMemref().getDefiningOp()))
+            declareOp.getMemref().getDefiningOp())) {
+      if (declareOp.getDataAttr() &&
+          *declareOp.getDataAttr() == cuf::DataAttribute::Pinned)
+        return false;
       return true;
+    }
   } else if (auto declareOp = mlir::dyn_cast_or_null<hlfir::DeclareOp>(
                  op.getBox().getDefiningOp())) {
     if (mlir::isa_and_nonnull<fir::AddrOfOp>(
-            declareOp.getMemref().getDefiningOp()))
+            declareOp.getMemref().getDefiningOp())) {
+      if (declareOp.getDataAttr() &&
+          *declareOp.getDataAttr() == cuf::DataAttribute::Pinned)
+        return false;
       return true;
+    }
   }
   return false;
 }
@@ -100,7 +108,7 @@ struct CufAllocateOpConversion
 
     // TODO: Allocation of module variable will need more work as the descriptor
     // will be duplicated and needs to be synced after allocation.
-    if (isBoxGlobal(op))
+    if (needDoubleDescriptor(op))
       return mlir::failure();
 
     // Allocation for local descriptor falls back on the standard runtime
@@ -125,7 +133,7 @@ struct CufDeallocateOpConversion
                   mlir::PatternRewriter &rewriter) const override {
     // TODO: Allocation of module variable will need more work as the descriptor
     // will be duplicated and needs to be synced after allocation.
-    if (isBoxGlobal(op))
+    if (needDoubleDescriptor(op))
       return mlir::failure();
 
     // Deallocation for local descriptor falls back on the standard runtime
@@ -274,9 +282,9 @@ class CufOpConversion : public fir::impl::CufOpConversionBase<CufOpConversion> {
       return true;
     });
     target.addDynamicallyLegalOp<cuf::AllocateOp>(
-        [](::cuf::AllocateOp op) { return isBoxGlobal(op); });
+        [](::cuf::AllocateOp op) { return needDoubleDescriptor(op); });
     target.addDynamicallyLegalOp<cuf::DeallocateOp>(
-        [](::cuf::DeallocateOp op) { return isBoxGlobal(op); });
+        [](::cuf::DeallocateOp op) { return needDoubleDescriptor(op); });
     target.addLegalDialect<fir::FIROpsDialect>();
     patterns.insert<CufAllocOpConversion>(ctx, &*dl, &typeConverter);
     patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion,
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -68,6 +68,31 @@ func.func @_QPsub4() attributes {cuf.proc_attr = #cuf.cuda_proc<device>} {
 // CHECK: fir.alloca
 // CHECK-NOT: cuf.free
 
+fir.global @_QMglobalsEa_pinned {data_attr = #cuf.cuda<pinned>} : !fir.box<!fir.heap<!fir.array<?xf32>>> {
+  %0 = fir.zero_bits !fir.heap<!fir.array<?xf32>>
+  %c0 = arith.constant 0 : index
+  %1 = fir.shape %c0 : (index) -> !fir.shape<1>
+  %2 = fir.embox %0(%1) {allocator_idx = 1 : i32} : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
+  fir.has_value %2 : !fir.box<!fir.heap<!fir.array<?xf32>>>
 }
 
+func.func @_QPsub5() {
+  %4 = fir.address_of(@_QMglobalsEa_pinned) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+  %5:2 = hlfir.declare %4 {data_attr = #cuf.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMglobalsEa_pinned"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+  %c1 = arith.constant 1 : index
+  %c10_i32 = arith.constant 10 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %6 = fir.convert %5#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
+  %7 = fir.convert %c1 : (index) -> i64
+  %8 = fir.convert %c10_i32 : (i32) -> i64
+  %9 = fir.call @_FortranAAllocatableSetBounds(%6, %c0_i32, %7, %8) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> none
+  %10 = cuf.allocate %5#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<pinned>} -> i32
+  %11 = cuf.deallocate %5#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<pinned>} -> i32
+  return
+}
+
+// CHECK-LABEL: func.func @_QPsub5()
+// CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
+} // end of module