llvm · victor-eds · Aug 9, 2024 · Jul 29, 2024 · Aug 5, 2024 · Aug 5, 2024
diff --git a/mlir/include/mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h b/mlir/include/mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h
@@ -0,0 +1,18 @@
+//===- AttrToLLVMConverter.h - SPIR-V attributes conversion to LLVM - C++ -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_
+#define MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_
+
+#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
+
+namespace mlir {
+unsigned storageClassToAddressSpace(spirv::ClientAPI clientAPI,
+                                    spirv::StorageClass storageClass);
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
@@ -53,6 +53,7 @@ add_subdirectory(SCFToGPU)
 add_subdirectory(SCFToOpenMP)
 add_subdirectory(SCFToSPIRV)
 add_subdirectory(ShapeToStandard)
+add_subdirectory(SPIRVCommon)
 add_subdirectory(SPIRVToLLVM)
 add_subdirectory(TensorToLinalg)
 add_subdirectory(TensorToSPIRV)

diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -25,29 +25,58 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
   Location loc = gpuFuncOp.getLoc();
 
   SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
-  workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
-  for (const auto [idx, attribution] :
-       llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
-    auto type = dyn_cast<MemRefType>(attribution.getType());
-    assert(type && type.hasStaticShape() && "unexpected type in attribution");
-
-    uint64_t numElements = type.getNumElements();
-
-    auto elementType =
-        cast<Type>(typeConverter->convertType(type.getElementType()));
-    auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
-    std::string name =
-        std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx));
-    uint64_t alignment = 0;
-    if (auto alignAttr =
-            dyn_cast_or_null<IntegerAttr>(gpuFuncOp.getWorkgroupAttributionAttr(
-                idx, LLVM::LLVMDialect::getAlignAttrName())))
-      alignment = alignAttr.getInt();
-    auto globalOp = rewriter.create<LLVM::GlobalOp>(
-        gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
-        LLVM::Linkage::Internal, name, /*value=*/Attribute(), alignment,
-        workgroupAddrSpace);
-    workgroupBuffers.push_back(globalOp);
+  if (encodeWorkgroupAttributionsAsArguments) {
+    ArrayRef<BlockArgument> workgroupAttributions =
+        gpuFuncOp.getWorkgroupAttributions();
+    std::size_t numAttributions = workgroupAttributions.size();
+
+    // Insert all arguments at the end.
+    unsigned index = gpuFuncOp.getNumArguments();
+    SmallVector<unsigned> argIndices(numAttributions, index);
+
+    // New arguments will simply be `llvm.ptr` with the correct address space
+    Type workgroupPtrType =
+        rewriter.getType<LLVM::LLVMPointerType>(workgroupAddrSpace);
+    SmallVector<Type> argTypes(numAttributions, workgroupPtrType);
+
+    // No argument attributes will be added
+    DictionaryAttr emptyDict = rewriter.getDictionaryAttr({});
+    SmallVector<DictionaryAttr> argAttrs(numAttributions, emptyDict);
+
+    // Location match function location
+    SmallVector<Location> argLocs(numAttributions, gpuFuncOp.getLoc());
+
+    // Perform signature modification
+    rewriter.modifyOpInPlace(
+        gpuFuncOp, [gpuFuncOp, &argIndices, &argTypes, &argAttrs, &argLocs]() {
+          static_cast<FunctionOpInterface>(gpuFuncOp).insertArguments(
+              argIndices, argTypes, argAttrs, argLocs);
+        });
+  } else {
+    workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
+    for (const auto [idx, attribution] :
+         llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
+      auto type = dyn_cast<MemRefType>(attribution.getType());
+      assert(type && type.hasStaticShape() && "unexpected type in attribution");
+
+      uint64_t numElements = type.getNumElements();
+
+      auto elementType =
+          cast<Type>(typeConverter->convertType(type.getElementType()));
+      auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
+      std::string name =
+          std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx));
+      uint64_t alignment = 0;
+      if (auto alignAttr = dyn_cast_or_null<IntegerAttr>(
+              gpuFuncOp.getWorkgroupAttributionAttr(
+                  idx, LLVM::LLVMDialect::getAlignAttrName())))
+        alignment = alignAttr.getInt();
+      auto globalOp = rewriter.create<LLVM::GlobalOp>(
+          gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
+          LLVM::Linkage::Internal, name, /*value=*/Attribute(), alignment,
+          workgroupAddrSpace);
+      workgroupBuffers.push_back(globalOp);
+    }
   }
 
   // Remap proper input types.
@@ -101,16 +130,20 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
   // attribute. The former is necessary for further translation while the
   // latter is expected by gpu.launch_func.
   if (gpuFuncOp.isKernel()) {
-    attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
+    if (kernelAttributeName)
+      attributes.emplace_back(*kernelAttributeName, rewriter.getUnitAttr());
     // Set the dialect-specific block size attribute if there is one.
     if (kernelBlockSizeAttributeName.has_value() && knownBlockSize) {
       attributes.emplace_back(kernelBlockSizeAttributeName.value(),
                               knownBlockSize);
     }
   }
+  LLVM::CConv callingConvention = gpuFuncOp.isKernel()
+                                      ? kernelCallingConvention
+                                      : nonKernelCallingConvention;
   auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
       gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
-      LLVM::Linkage::External, /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C,
+      LLVM::Linkage::External, /*dsoLocal=*/false, callingConvention,
       /*comdat=*/nullptr, attributes);
 
   {
@@ -125,24 +158,49 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
     rewriter.setInsertionPointToStart(&gpuFuncOp.front());
     unsigned numProperArguments = gpuFuncOp.getNumArguments();
 
-    for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) {
-      auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
-                                                global.getAddrSpace());
-      Value address = rewriter.create<LLVM::AddressOfOp>(
-          loc, ptrType, global.getSymNameAttr());
-      Value memory =
-          rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getType(), address,
-                                       ArrayRef<LLVM::GEPArg>{0, 0});
-
-      // Build a memref descriptor pointing to the buffer to plug with the
-      // existing memref infrastructure. This may use more registers than
-      // otherwise necessary given that memref sizes are fixed, but we can try
-      // and canonicalize that away later.
-      Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx];
-      auto type = cast<MemRefType>(attribution.getType());
-      auto descr = MemRefDescriptor::fromStaticShape(
-          rewriter, loc, *getTypeConverter(), type, memory);
-      signatureConversion.remapInput(numProperArguments + idx, descr);
+    if (encodeWorkgroupAttributionsAsArguments) {
+      unsigned numAttributions = gpuFuncOp.getNumWorkgroupAttributions();
+      assert(numProperArguments >= numAttributions &&
+             "Expecting attributions to be encoded as arguments already");
+
+      // Arguments encoding workgroup attributions will be in positions
+      // [numProperArguments, numProperArguments+numAttributions)
+      ArrayRef<BlockArgument> attributionArguments =
+          gpuFuncOp.getArguments().slice(numProperArguments - numAttributions,
+                                         numAttributions);
+      for (auto [idx, vals] : llvm::enumerate(llvm::zip_equal(
+               gpuFuncOp.getWorkgroupAttributions(), attributionArguments))) {
+        auto [attribution, arg] = vals;
+        auto type = cast<MemRefType>(attribution.getType());
+
+        // Arguments are of llvm.ptr type and attributions are of memref type:
+        // we need to wrap them in memref descriptors.
+        Value descr = MemRefDescriptor::fromStaticShape(
+            rewriter, loc, *getTypeConverter(), type, arg);
+
+        // And remap the arguments
+        signatureConversion.remapInput(numProperArguments + idx, descr);
+      }
+    } else {
+      for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) {
+        auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
+                                                  global.getAddrSpace());
+        Value address = rewriter.create<LLVM::AddressOfOp>(
+            loc, ptrType, global.getSymNameAttr());
+        Value memory =
+            rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getType(),
+                                         address, ArrayRef<LLVM::GEPArg>{0, 0});
+
+        // Build a memref descriptor pointing to the buffer to plug with the
+        // existing memref infrastructure. This may use more registers than
+        // otherwise necessary given that memref sizes are fixed, but we can try
+        // and canonicalize that away later.
+        Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx];
+        auto type = cast<MemRefType>(attribution.getType());
+        auto descr = MemRefDescriptor::fromStaticShape(
+            rewriter, loc, *getTypeConverter(), type, memory);
+        signatureConversion.remapInput(numProperArguments + idx, descr);
+      }
     }
 
     // Rewrite private memory attributions to alloca'ed buffers.

diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -35,16 +35,39 @@ struct GPUDynamicSharedMemoryOpLowering
   unsigned alignmentBit;
 };
 
+struct GPUFuncOpLoweringOptions {
+  /// The address space to use for `alloca`s in private memory.
+  unsigned allocaAddrSpace;
+  /// The address space to use declaring workgroup memory.
+  unsigned workgroupAddrSpace;
+
+  /// The attribute name to use instead of `gpu.kernel`.
+  std::optional<StringAttr> kernelAttributeName = std::nullopt;
+  /// The attribute name to to set block size
+  std::optional<StringAttr> kernelBlockSizeAttributeName = std::nullopt;
+
+  /// The calling convention to use for kernel functions
+  LLVM::CConv kernelCallingConvention = LLVM::CConv::C;
+  /// The calling convention to use for non-kernel functions
+  LLVM::CConv nonKernelCallingConvention = LLVM::CConv::C;
+
+  /// Whether to encode workgroup attributions as additional arguments instead
+  /// of a global variable.
+  bool encodeWorkgroupAttributionsAsArguments = false;
+};
+
 struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
-  GPUFuncOpLowering(
-      const LLVMTypeConverter &converter, unsigned allocaAddrSpace,
-      unsigned workgroupAddrSpace, StringAttr kernelAttributeName,
-      std::optional<StringAttr> kernelBlockSizeAttributeName = std::nullopt)
+  GPUFuncOpLowering(const LLVMTypeConverter &converter,
+                    const GPUFuncOpLoweringOptions &options)
       : ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
-        allocaAddrSpace(allocaAddrSpace),
-        workgroupAddrSpace(workgroupAddrSpace),
-        kernelAttributeName(kernelAttributeName),
-        kernelBlockSizeAttributeName(kernelBlockSizeAttributeName) {}
+        allocaAddrSpace(options.allocaAddrSpace),
+        workgroupAddrSpace(options.workgroupAddrSpace),
+        kernelAttributeName(options.kernelAttributeName),
+        kernelBlockSizeAttributeName(options.kernelBlockSizeAttributeName),
+        kernelCallingConvention(options.kernelCallingConvention),
+        nonKernelCallingConvention(options.nonKernelCallingConvention),
+        encodeWorkgroupAttributionsAsArguments(
+            options.encodeWorkgroupAttributionsAsArguments) {}
 
   LogicalResult
   matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
@@ -57,10 +80,18 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
   unsigned workgroupAddrSpace;
 
   /// The attribute name to use instead of `gpu.kernel`.
-  StringAttr kernelAttributeName;
-
+  std::optional<StringAttr> kernelAttributeName;
   /// The attribute name to to set block size
   std::optional<StringAttr> kernelBlockSizeAttributeName;
+
+  /// The calling convention to use for kernel functions
+  LLVM::CConv kernelCallingConvention;
+  /// The calling convention to use for non-kernel functions
+  LLVM::CConv nonKernelCallingConvention;
+
+  /// Whether to encode workgroup attributions as additional arguments instead
+  /// of a global variable.
+  bool encodeWorkgroupAttributionsAsArguments;
 };
 
 /// The lowering of gpu.printf to a call to HIP hostcalls

diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/CMakeLists.txt b/mlir/lib/Conversion/GPUToLLVMSPV/CMakeLists.txt
@@ -6,7 +6,9 @@ add_mlir_conversion_library(MLIRGPUToLLVMSPV
 
   LINK_LIBS PUBLIC
   MLIRGPUDialect
+  MLIRGPUToGPURuntimeTransforms
   MLIRLLVMCommonConversion
   MLIRLLVMDialect
+  MLIRSPIRVAttrToLLVMConversion
   MLIRSPIRVDialect
 )
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -8,15 +8,18 @@
 
 #include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h"
 
+#include "../GPUCommon/GPUOpsLowering.h"
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
 #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
 #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Matchers.h"
@@ -321,8 +324,8 @@ struct GPUToLLVMSPVConversionPass final
     LLVMConversionTarget target(*context);
 
     target.addIllegalOp<gpu::BarrierOp, gpu::BlockDimOp, gpu::BlockIdOp,
-                        gpu::GlobalIdOp, gpu::GridDimOp, gpu::ShuffleOp,
-                        gpu::ThreadIdOp>();
+                        gpu::GPUFuncOp, gpu::GlobalIdOp, gpu::GridDimOp,
+                        gpu::ReturnOp, gpu::ShuffleOp, gpu::ThreadIdOp>();
 
     populateGpuToLLVMSPVConversionPatterns(converter, patterns);
 
@@ -340,11 +343,27 @@ struct GPUToLLVMSPVConversionPass final
 namespace mlir {
 void populateGpuToLLVMSPVConversionPatterns(LLVMTypeConverter &typeConverter,
                                             RewritePatternSet &patterns) {
-  patterns.add<GPUBarrierConversion, GPUShuffleConversion,
+  patterns.add<GPUBarrierConversion, GPUReturnOpLowering, GPUShuffleConversion,
                LaunchConfigOpConversion<gpu::BlockIdOp>,
                LaunchConfigOpConversion<gpu::GridDimOp>,
                LaunchConfigOpConversion<gpu::BlockDimOp>,
                LaunchConfigOpConversion<gpu::ThreadIdOp>,
                LaunchConfigOpConversion<gpu::GlobalIdOp>>(typeConverter);
+  constexpr spirv::ClientAPI clientAPI = spirv::ClientAPI::OpenCL;
+  MLIRContext *context = &typeConverter.getContext();
+  unsigned privateAddressSpace =
+      storageClassToAddressSpace(clientAPI, spirv::StorageClass::Function);
+  unsigned localAddressSpace =
+      storageClassToAddressSpace(clientAPI, spirv::StorageClass::Workgroup);
+  OperationName llvmFuncOpName(LLVM::LLVMFuncOp::getOperationName(), context);
+  StringAttr kernelBlockSizeAttributeName =
+      LLVM::LLVMFuncOp::getReqdWorkGroupSizeAttrName(llvmFuncOpName);
+  patterns.add<GPUFuncOpLowering>(
+      typeConverter,
+      GPUFuncOpLoweringOptions{
+          privateAddressSpace, localAddressSpace,
+          /*kernelAttributeName=*/std::nullopt, kernelBlockSizeAttributeName,
+          LLVM::CConv::SPIR_KERNEL, LLVM::CConv::SPIR_FUNC,
+          /*encodeWorkgroupAttributionsAsArguments=*/true});
 }
 } // namespace mlir
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -365,13 +365,15 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
   // attributions since NVVM models it as `alloca`s in the default
   // memory space and does not support `alloca`s with addrspace(5).
   patterns.add<GPUFuncOpLowering>(
-      converter, /*allocaAddrSpace=*/0,
-      /*workgroupAddrSpace=*/
-      static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
-      StringAttr::get(&converter.getContext(),
-                      NVVM::NVVMDialect::getKernelFuncAttrName()),
-      StringAttr::get(&converter.getContext(),
-                      NVVM::NVVMDialect::getMaxntidAttrName()));
+      converter,
+      GPUFuncOpLoweringOptions{
+          /*allocaAddrSpace=*/0,
+          /*workgroupAddrSpace=*/
+          static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
+          StringAttr::get(&converter.getContext(),
+                          NVVM::NVVMDialect::getKernelFuncAttrName()),
+          StringAttr::get(&converter.getContext(),
+                          NVVM::NVVMDialect::getMaxntidAttrName())});
 
   populateOpPatterns<arith::RemFOp>(converter, patterns, "__nv_fmodf",
                                     "__nv_fmod");

diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -372,10 +372,11 @@ void mlir::populateGpuToROCDLConversionPatterns(
   patterns.add<GPUReturnOpLowering>(converter);
   patterns.add<GPUFuncOpLowering>(
       converter,
-      /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
-      /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
-      rocdlDialect->getKernelAttrHelper().getName(),
-      rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName());
+      GPUFuncOpLoweringOptions{
+          /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
+          /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
+          rocdlDialect->getKernelAttrHelper().getName(),
+          rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
   if (Runtime::HIP == runtime) {
     patterns.add<GPUPrintfOpToHIPLowering>(converter);
   } else if (Runtime::OpenCL == runtime) {