Skip to content

[MLIR][GPU-LLVM] Convert gpu.func to llvm.func #101664

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions mlir/include/mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
//===- AttrToLLVMConverter.h - SPIR-V attributes conversion to LLVM - C++ -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_
#define MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_

#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"

namespace mlir {
/// Maps the SPIR-V `storageClass` to an unsigned address space number for the
/// given `clientAPI`.
///
/// NOTE(review): only the declaration is visible in this header; the exact
/// storage-class-to-address-space mapping, and the behavior for combinations
/// the implementation does not handle, should be confirmed against the
/// definition.
unsigned storageClassToAddressSpace(spirv::ClientAPI clientAPI,
spirv::StorageClass storageClass);
} // namespace mlir

#endif // MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_
1 change: 1 addition & 0 deletions mlir/lib/Conversion/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ add_subdirectory(SCFToGPU)
add_subdirectory(SCFToOpenMP)
add_subdirectory(SCFToSPIRV)
add_subdirectory(ShapeToStandard)
add_subdirectory(SPIRVCommon)
add_subdirectory(SPIRVToLLVM)
add_subdirectory(TensorToLinalg)
add_subdirectory(TensorToSPIRV)
Expand Down
144 changes: 101 additions & 43 deletions mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,29 +25,58 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
Location loc = gpuFuncOp.getLoc();

SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
for (const auto [idx, attribution] :
llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
auto type = dyn_cast<MemRefType>(attribution.getType());
assert(type && type.hasStaticShape() && "unexpected type in attribution");

uint64_t numElements = type.getNumElements();

auto elementType =
cast<Type>(typeConverter->convertType(type.getElementType()));
auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
std::string name =
std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx));
uint64_t alignment = 0;
if (auto alignAttr =
dyn_cast_or_null<IntegerAttr>(gpuFuncOp.getWorkgroupAttributionAttr(
idx, LLVM::LLVMDialect::getAlignAttrName())))
alignment = alignAttr.getInt();
auto globalOp = rewriter.create<LLVM::GlobalOp>(
gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
LLVM::Linkage::Internal, name, /*value=*/Attribute(), alignment,
workgroupAddrSpace);
workgroupBuffers.push_back(globalOp);
if (encodeWorkgroupAttributionsAsArguments) {
ArrayRef<BlockArgument> workgroupAttributions =
gpuFuncOp.getWorkgroupAttributions();
std::size_t numAttributions = workgroupAttributions.size();

// Insert all arguments at the end.
unsigned index = gpuFuncOp.getNumArguments();
SmallVector<unsigned> argIndices(numAttributions, index);

// New arguments will simply be `llvm.ptr` with the correct address space
Type workgroupPtrType =
rewriter.getType<LLVM::LLVMPointerType>(workgroupAddrSpace);
SmallVector<Type> argTypes(numAttributions, workgroupPtrType);

// No argument attributes will be added
DictionaryAttr emptyDict = rewriter.getDictionaryAttr({});
SmallVector<DictionaryAttr> argAttrs(numAttributions, emptyDict);

// Location match function location
SmallVector<Location> argLocs(numAttributions, gpuFuncOp.getLoc());

// Perform signature modification
rewriter.modifyOpInPlace(
gpuFuncOp, [gpuFuncOp, &argIndices, &argTypes, &argAttrs, &argLocs]() {
static_cast<FunctionOpInterface>(gpuFuncOp).insertArguments(
argIndices, argTypes, argAttrs, argLocs);
});
} else {
workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
for (const auto [idx, attribution] :
llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
auto type = dyn_cast<MemRefType>(attribution.getType());
assert(type && type.hasStaticShape() && "unexpected type in attribution");

uint64_t numElements = type.getNumElements();

auto elementType =
cast<Type>(typeConverter->convertType(type.getElementType()));
auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
std::string name =
std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx));
uint64_t alignment = 0;
if (auto alignAttr = dyn_cast_or_null<IntegerAttr>(
gpuFuncOp.getWorkgroupAttributionAttr(
idx, LLVM::LLVMDialect::getAlignAttrName())))
alignment = alignAttr.getInt();
auto globalOp = rewriter.create<LLVM::GlobalOp>(
gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
LLVM::Linkage::Internal, name, /*value=*/Attribute(), alignment,
workgroupAddrSpace);
workgroupBuffers.push_back(globalOp);
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Original code

}

// Remap proper input types.
Expand Down Expand Up @@ -101,16 +130,20 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
// attribute. The former is necessary for further translation while the
// latter is expected by gpu.launch_func.
if (gpuFuncOp.isKernel()) {
attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
if (kernelAttributeName)
attributes.emplace_back(*kernelAttributeName, rewriter.getUnitAttr());
// Set the dialect-specific block size attribute if there is one.
if (kernelBlockSizeAttributeName.has_value() && knownBlockSize) {
attributes.emplace_back(kernelBlockSizeAttributeName.value(),
knownBlockSize);
}
}
LLVM::CConv callingConvention = gpuFuncOp.isKernel()
? kernelCallingConvention
: nonKernelCallingConvention;
auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
LLVM::Linkage::External, /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C,
LLVM::Linkage::External, /*dsoLocal=*/false, callingConvention,
/*comdat=*/nullptr, attributes);

{
Expand All @@ -125,24 +158,49 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
rewriter.setInsertionPointToStart(&gpuFuncOp.front());
unsigned numProperArguments = gpuFuncOp.getNumArguments();

for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) {
auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
global.getAddrSpace());
Value address = rewriter.create<LLVM::AddressOfOp>(
loc, ptrType, global.getSymNameAttr());
Value memory =
rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getType(), address,
ArrayRef<LLVM::GEPArg>{0, 0});

// Build a memref descriptor pointing to the buffer to plug with the
// existing memref infrastructure. This may use more registers than
// otherwise necessary given that memref sizes are fixed, but we can try
// and canonicalize that away later.
Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx];
auto type = cast<MemRefType>(attribution.getType());
auto descr = MemRefDescriptor::fromStaticShape(
rewriter, loc, *getTypeConverter(), type, memory);
signatureConversion.remapInput(numProperArguments + idx, descr);
if (encodeWorkgroupAttributionsAsArguments) {
unsigned numAttributions = gpuFuncOp.getNumWorkgroupAttributions();
assert(numProperArguments >= numAttributions &&
"Expecting attributions to be encoded as arguments already");

// The pointer arguments encoding workgroup attributions were appended to the
// signature, so they are in positions
// [numProperArguments-numAttributions, numProperArguments)
ArrayRef<BlockArgument> attributionArguments =
gpuFuncOp.getArguments().slice(numProperArguments - numAttributions,
numAttributions);
for (auto [idx, vals] : llvm::enumerate(llvm::zip_equal(
gpuFuncOp.getWorkgroupAttributions(), attributionArguments))) {
auto [attribution, arg] = vals;
auto type = cast<MemRefType>(attribution.getType());

// Arguments are of llvm.ptr type and attributions are of memref type:
// we need to wrap them in memref descriptors.
Value descr = MemRefDescriptor::fromStaticShape(
rewriter, loc, *getTypeConverter(), type, arg);

// And remap the arguments
signatureConversion.remapInput(numProperArguments + idx, descr);
}
} else {
for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) {
auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
global.getAddrSpace());
Value address = rewriter.create<LLVM::AddressOfOp>(
loc, ptrType, global.getSymNameAttr());
Value memory =
rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getType(),
address, ArrayRef<LLVM::GEPArg>{0, 0});

// Build a memref descriptor pointing to the buffer to plug with the
// existing memref infrastructure. This may use more registers than
// otherwise necessary given that memref sizes are fixed, but we can try
// and canonicalize that away later.
Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx];
auto type = cast<MemRefType>(attribution.getType());
auto descr = MemRefDescriptor::fromStaticShape(
rewriter, loc, *getTypeConverter(), type, memory);
signatureConversion.remapInput(numProperArguments + idx, descr);
}
Comment on lines +208 to +226
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Original code

}

// Rewrite private memory attributions to alloca'ed buffers.
Expand Down
51 changes: 41 additions & 10 deletions mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,39 @@ struct GPUDynamicSharedMemoryOpLowering
unsigned alignmentBit;
};

/// Options configuring how `GPUFuncOpLowering` converts `gpu.func` into
/// `llvm.func`.
///
/// This is a plain aggregate that callers brace-initialize positionally, so
/// the order of the fields is part of the interface. The two address spaces
/// have no default and must always be provided.
struct GPUFuncOpLoweringOptions {
/// The address space to use for `alloca`s in private memory.
unsigned allocaAddrSpace;
/// The address space to use when declaring workgroup memory.
unsigned workgroupAddrSpace;

/// The attribute name to use instead of `gpu.kernel`. When unset, no kernel
/// marker attribute is attached to the lowered kernel function.
std::optional<StringAttr> kernelAttributeName = std::nullopt;
/// The attribute name to use to set the block size. It is only attached to
/// kernels, and only when the known block size is available.
std::optional<StringAttr> kernelBlockSizeAttributeName = std::nullopt;

/// The calling convention to use for kernel functions.
LLVM::CConv kernelCallingConvention = LLVM::CConv::C;
/// The calling convention to use for non-kernel functions.
LLVM::CConv nonKernelCallingConvention = LLVM::CConv::C;

/// Whether to encode workgroup attributions as additional `llvm.ptr`
/// arguments (in `workgroupAddrSpace`) appended to the function signature
/// instead of as global variables.
bool encodeWorkgroupAttributionsAsArguments = false;
};
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was getting out of hand. Cleaner this way.


struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
GPUFuncOpLowering(
const LLVMTypeConverter &converter, unsigned allocaAddrSpace,
unsigned workgroupAddrSpace, StringAttr kernelAttributeName,
std::optional<StringAttr> kernelBlockSizeAttributeName = std::nullopt)
GPUFuncOpLowering(const LLVMTypeConverter &converter,
const GPUFuncOpLoweringOptions &options)
: ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
allocaAddrSpace(allocaAddrSpace),
workgroupAddrSpace(workgroupAddrSpace),
kernelAttributeName(kernelAttributeName),
kernelBlockSizeAttributeName(kernelBlockSizeAttributeName) {}
allocaAddrSpace(options.allocaAddrSpace),
workgroupAddrSpace(options.workgroupAddrSpace),
kernelAttributeName(options.kernelAttributeName),
kernelBlockSizeAttributeName(options.kernelBlockSizeAttributeName),
kernelCallingConvention(options.kernelCallingConvention),
nonKernelCallingConvention(options.nonKernelCallingConvention),
encodeWorkgroupAttributionsAsArguments(
options.encodeWorkgroupAttributionsAsArguments) {}

LogicalResult
matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
Expand All @@ -57,10 +80,18 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
unsigned workgroupAddrSpace;

/// The attribute name to use instead of `gpu.kernel`.
StringAttr kernelAttributeName;

std::optional<StringAttr> kernelAttributeName;
/// The attribute name to use to set the block size
std::optional<StringAttr> kernelBlockSizeAttributeName;

/// The calling convention to use for kernel functions
LLVM::CConv kernelCallingConvention;
/// The calling convention to use for non-kernel functions
LLVM::CConv nonKernelCallingConvention;

/// Whether to encode workgroup attributions as additional arguments instead
/// of a global variable.
bool encodeWorkgroupAttributionsAsArguments;
};

/// The lowering of gpu.printf to a call to HIP hostcalls
Expand Down
2 changes: 2 additions & 0 deletions mlir/lib/Conversion/GPUToLLVMSPV/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ add_mlir_conversion_library(MLIRGPUToLLVMSPV

LINK_LIBS PUBLIC
MLIRGPUDialect
MLIRGPUToGPURuntimeTransforms
MLIRLLVMCommonConversion
MLIRLLVMDialect
MLIRSPIRVAttrToLLVMConversion
MLIRSPIRVDialect
)
25 changes: 22 additions & 3 deletions mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,18 @@

#include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Matchers.h"
Expand Down Expand Up @@ -321,8 +324,8 @@ struct GPUToLLVMSPVConversionPass final
LLVMConversionTarget target(*context);

target.addIllegalOp<gpu::BarrierOp, gpu::BlockDimOp, gpu::BlockIdOp,
gpu::GlobalIdOp, gpu::GridDimOp, gpu::ShuffleOp,
gpu::ThreadIdOp>();
gpu::GPUFuncOp, gpu::GlobalIdOp, gpu::GridDimOp,
gpu::ReturnOp, gpu::ShuffleOp, gpu::ThreadIdOp>();

populateGpuToLLVMSPVConversionPatterns(converter, patterns);

Expand All @@ -340,11 +343,27 @@ struct GPUToLLVMSPVConversionPass final
namespace mlir {
void populateGpuToLLVMSPVConversionPatterns(LLVMTypeConverter &typeConverter,
RewritePatternSet &patterns) {
patterns.add<GPUBarrierConversion, GPUShuffleConversion,
patterns.add<GPUBarrierConversion, GPUReturnOpLowering, GPUShuffleConversion,
LaunchConfigOpConversion<gpu::BlockIdOp>,
LaunchConfigOpConversion<gpu::GridDimOp>,
LaunchConfigOpConversion<gpu::BlockDimOp>,
LaunchConfigOpConversion<gpu::ThreadIdOp>,
LaunchConfigOpConversion<gpu::GlobalIdOp>>(typeConverter);
constexpr spirv::ClientAPI clientAPI = spirv::ClientAPI::OpenCL;
MLIRContext *context = &typeConverter.getContext();
unsigned privateAddressSpace =
storageClassToAddressSpace(clientAPI, spirv::StorageClass::Function);
unsigned localAddressSpace =
storageClassToAddressSpace(clientAPI, spirv::StorageClass::Workgroup);
OperationName llvmFuncOpName(LLVM::LLVMFuncOp::getOperationName(), context);
StringAttr kernelBlockSizeAttributeName =
LLVM::LLVMFuncOp::getReqdWorkGroupSizeAttrName(llvmFuncOpName);
Comment on lines +358 to +360
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've always thought this should be a static member... Is there a better way to do this? I didn't wanna add the static member function to the LLVM dialect, so I went with this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This cannot be static, as an attribute requires the context present in the operation.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know, I was wondering if at least the string name should be

patterns.add<GPUFuncOpLowering>(
typeConverter,
GPUFuncOpLoweringOptions{
privateAddressSpace, localAddressSpace,
/*kernelAttributeName=*/std::nullopt, kernelBlockSizeAttributeName,
LLVM::CConv::SPIR_KERNEL, LLVM::CConv::SPIR_FUNC,
/*encodeWorkgroupAttributionsAsArguments=*/true});
}
} // namespace mlir
16 changes: 9 additions & 7 deletions mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,13 +365,15 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
// attributions since NVVM models it as `alloca`s in the default
// memory space and does not support `alloca`s with addrspace(5).
patterns.add<GPUFuncOpLowering>(
converter, /*allocaAddrSpace=*/0,
/*workgroupAddrSpace=*/
static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
StringAttr::get(&converter.getContext(),
NVVM::NVVMDialect::getKernelFuncAttrName()),
StringAttr::get(&converter.getContext(),
NVVM::NVVMDialect::getMaxntidAttrName()));
converter,
GPUFuncOpLoweringOptions{
/*allocaAddrSpace=*/0,
/*workgroupAddrSpace=*/
static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
StringAttr::get(&converter.getContext(),
NVVM::NVVMDialect::getKernelFuncAttrName()),
StringAttr::get(&converter.getContext(),
NVVM::NVVMDialect::getMaxntidAttrName())});

populateOpPatterns<arith::RemFOp>(converter, patterns, "__nv_fmodf",
"__nv_fmod");
Expand Down
9 changes: 5 additions & 4 deletions mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -372,10 +372,11 @@ void mlir::populateGpuToROCDLConversionPatterns(
patterns.add<GPUReturnOpLowering>(converter);
patterns.add<GPUFuncOpLowering>(
converter,
/*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
/*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
rocdlDialect->getKernelAttrHelper().getName(),
rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName());
GPUFuncOpLoweringOptions{
/*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
/*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
rocdlDialect->getKernelAttrHelper().getName(),
rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
if (Runtime::HIP == runtime) {
patterns.add<GPUPrintfOpToHIPLowering>(converter);
} else if (Runtime::OpenCL == runtime) {
Expand Down
Loading
Loading