[MLIR] Fixes NVGPU Integration Test Passes Ordering #69934

Merged: 1 commit, Oct 24, 2023
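This patch reworks mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp so the NVGPU integration tests get a corrected pass ordering: the single monolithic buildLowerToNVVMPassPipeline is split into buildCommonPassPipeline, buildGpuPassPipeline, and buildHostPostPipeline; the separate host/kernel index-bitwidth and bare-pointer-calling-convention options collapse into a single index-bitwidth option; and the default cubin-format switches from "isa" to "bin".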
mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp: 282 changes (55 additions, 227 deletions)
@@ -28,6 +28,8 @@
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassOptions.h"
@@ -39,27 +41,11 @@ using namespace mlir;
 namespace {
 struct TestLowerToNVVMOptions
     : public PassPipelineOptions<TestLowerToNVVMOptions> {
-  PassOptions::Option<int64_t> hostIndexBitWidth{
-      *this, "host-index-bitwidth",
+  PassOptions::Option<int64_t> indexBitWidth{
+      *this, "index-bitwidth",
       llvm::cl::desc("Bitwidth of the index type for the host (warning this "
                      "should be 64 until the GPU layering is fixed)"),
       llvm::cl::init(64)};
-  PassOptions::Option<bool> hostUseBarePtrCallConv{
-      *this, "host-bare-ptr-calling-convention",
-      llvm::cl::desc(
-          "Whether to use the bareptr calling convention on the host (warning "
-          "this should be false until the GPU layering is fixed)"),
-      llvm::cl::init(false)};
-  PassOptions::Option<int64_t> kernelIndexBitWidth{
-      *this, "kernel-index-bitwidth",
-      llvm::cl::desc("Bitwidth of the index type for the GPU kernels"),
-      llvm::cl::init(64)};
-  PassOptions::Option<bool> kernelUseBarePtrCallConv{
-      *this, "kernel-bare-ptr-calling-convention",
-      llvm::cl::desc(
-          "Whether to use the bareptr calling convention on the kernel "
-          "(warning this should be false until the GPU layering is fixed)"),
-      llvm::cl::init(false)};
   PassOptions::Option<std::string> cubinTriple{
       *this, "cubin-triple",
       llvm::cl::desc("Triple to use to serialize to cubin."),
@@ -74,175 +60,78 @@ struct TestLowerToNVVMOptions
   PassOptions::Option<std::string> cubinFormat{
       *this, "cubin-format",
       llvm::cl::desc("Compilation format to use to serialize to cubin."),
-      llvm::cl::init("isa")};
+      llvm::cl::init("bin")};
   PassOptions::Option<int> optLevel{
       *this, "opt-level",
       llvm::cl::desc("Optimization level for NVVM compilation"),
       llvm::cl::init(2)};
 };
 
+//===----------------------------------------------------------------------===//
+// Common pipeline
+//===----------------------------------------------------------------------===//
+void buildCommonPassPipeline(OpPassManager &pm,
+                             const TestLowerToNVVMOptions &options) {
+  pm.addPass(createConvertNVGPUToNVVMPass());
+  pm.addPass(createGpuKernelOutliningPass());
+  pm.addPass(createConvertLinalgToLoopsPass());
+  pm.addPass(createConvertVectorToSCFPass());
+  pm.addPass(createConvertSCFToCFPass());
+  pm.addPass(createConvertNVVMToLLVMPass());
+  pm.addPass(createConvertVectorToLLVMPass());
+  pm.addPass(createConvertMathToLLVMPass());
+  pm.addPass(createFinalizeMemRefToLLVMConversionPass());
+  pm.addPass(createConvertFuncToLLVMPass());
+  pm.addPass(memref::createExpandStridedMetadataPass());
+
+  GpuNVVMAttachTargetOptions nvvmTargetOptions;
+  nvvmTargetOptions.triple = options.cubinTriple;
+  nvvmTargetOptions.chip = options.cubinChip;
+  nvvmTargetOptions.features = options.cubinFeatures;
+  nvvmTargetOptions.optLevel = options.optLevel;
+  pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
+  pm.addPass(createLowerAffinePass());
+  pm.addPass(createArithToLLVMConversionPass());
+  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
+  convertIndexToLLVMPassOpt.indexBitwidth = options.indexBitWidth;
+  pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createCSEPass());
+}
+
+//===----------------------------------------------------------------------===//
+// GPUModule-specific stuff.
+//===----------------------------------------------------------------------===//
+void buildGpuPassPipeline(OpPassManager &pm,
+                          const TestLowerToNVVMOptions &options) {
+  pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
+  pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
+}
 
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertVectorToSCFPass());
-  // Convert SCF to CF (always needed).
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
-  // Convert Math to LLVM (always needed).
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertMathToLLVMPass());
-  // Expand complicated MemRef operations before lowering them.
-  pm.addNestedPass<gpu::GPUModuleOp>(memref::createExpandStridedMetadataPass());
-  // The expansion may create affine expressions. Get rid of them.
-  pm.addNestedPass<gpu::GPUModuleOp>(createLowerAffinePass());
-
-  // Convert MemRef to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  FinalizeMemRefToLLVMConversionPassOptions
-      finalizeMemRefToLLVMConversionPassOptions;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  finalizeMemRefToLLVMConversionPassOptions.indexBitwidth =
-      options.kernelIndexBitWidth;
-  finalizeMemRefToLLVMConversionPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(createFinalizeMemRefToLLVMConversionPass(
-      finalizeMemRefToLLVMConversionPassOptions));
-
-  // Convert Func to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertFuncToLLVMPassOptions.indexBitwidth = options.kernelIndexBitWidth;
-  convertFuncToLLVMPassOptions.useBarePtrCallConv =
-      options.kernelUseBarePtrCallConv;
-  convertFuncToLLVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions));
-
-  // TODO: C++20 designated initializers.
-  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertIndexToLLVMPassOpt.indexBitwidth = options.kernelIndexBitWidth;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
-
-  // TODO: C++20 designated initializers.
-  // The following pass is inconsistent.
-  // TODO: fix inconsistence.
-  ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
-  convertGpuOpsToNVVMOpsOptions.useBarePtrCallConv =
-      options.kernelUseBarePtrCallConv;
-  convertGpuOpsToNVVMOpsOptions.indexBitwidth = options.kernelIndexBitWidth;
-  convertGpuOpsToNVVMOpsOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));
-
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
-
-  // Convert vector to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
-  convertVectorToLLVMPassOptions.reassociateFPReductions = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
-
-  // This pass is needed for PTX building
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());
+//===----------------------------------------------------------------------===//
+// Host Post-GPU pipeline
+//===----------------------------------------------------------------------===//
+void buildHostPostPipeline(OpPassManager &pm,
+                           const TestLowerToNVVMOptions &options) {
+  pm.addPass(createGpuToLLVMConversionPass());
+
   // Sprinkle some cleanups.
+  GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
+  gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
+  pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
 
   // Finally we can reconcile unrealized casts.
-  pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
+  pm.addPass(createReconcileUnrealizedCastsPass());
 }

 void buildLowerToNVVMPassPipeline(OpPassManager &pm,
                                   const TestLowerToNVVMOptions &options) {
-  // Start with a cleanup pass.
-  pm.addPass(createCanonicalizerPass());
-  pm.addPass(createCSEPass());
-
   //===----------------------------------------------------------------------===//
-  // NVGPU lowers device code as well as host code to the driver, so must run
-  // before outlining.
+  // Common pipeline
   //===----------------------------------------------------------------------===//
-  // TODO: C++20 designated initializers.
-  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
-  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));

-  //===----------------------------------------------------------------------===//
-  // Host-specific stuff.
-  //===----------------------------------------------------------------------===//
-  // Important, must be run at the top-level.
-  pm.addPass(createGpuKernelOutliningPass());
-
-  // Important, all host passes must be run at the func level so that host
-  // conversions can remain with 64 bit indices without polluting the GPU
-  // kernel that may have 32 bit indices.
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  pm.addNestedPass<func::FuncOp>(createConvertVectorToSCFPass());
-  // Convert SCF to CF (always needed).
-  pm.addNestedPass<func::FuncOp>(createConvertSCFToCFPass());
-  // Convert Math to LLVM (always needed).
-  pm.addNestedPass<func::FuncOp>(createConvertMathToLLVMPass());
-  // Expand complicated MemRef operations before lowering them.
-  pm.addNestedPass<func::FuncOp>(memref::createExpandStridedMetadataPass());
-  // The expansion may create affine expressions. Get rid of them.
-  pm.addNestedPass<func::FuncOp>(createLowerAffinePass());
-
-  // Convert MemRef to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  FinalizeMemRefToLLVMConversionPassOptions
-      finalizeMemRefToLLVMConversionPassOptions;
-  finalizeMemRefToLLVMConversionPassOptions.useAlignedAlloc = true;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  finalizeMemRefToLLVMConversionPassOptions.indexBitwidth =
-      options.hostIndexBitWidth;
-  finalizeMemRefToLLVMConversionPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<func::FuncOp>(createFinalizeMemRefToLLVMConversionPass(
-      finalizeMemRefToLLVMConversionPassOptions));
-
-  // Convert Func to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertFuncToLLVMPassOptions.indexBitwidth = options.hostIndexBitWidth;
-  convertFuncToLLVMPassOptions.useBarePtrCallConv =
-      options.hostUseBarePtrCallConv;
-  convertFuncToLLVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions));
-
-  // TODO: C++20 designated initializers.
-  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertIndexToLLVMPassOpt.indexBitwidth = options.hostIndexBitWidth;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
-
-  pm.addNestedPass<func::FuncOp>(createArithToLLVMConversionPass());
-
-  // Sprinkle some cleanups.
-  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
-  pm.addNestedPass<func::FuncOp>(createCSEPass());
+  buildCommonPassPipeline(pm, options);

   //===----------------------------------------------------------------------===//
   // GPUModule-specific stuff.
@@ -252,68 +141,7 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
   //===----------------------------------------------------------------------===//
   // Host post-GPUModule-specific stuff.
   //===----------------------------------------------------------------------===//
-  // Attach an NVVM target to all the GPU modules with the provided target
-  // options.
-  // TODO: C++20 designated initializers.
-  GpuNVVMAttachTargetOptions nvvmTargetOptions;
-  nvvmTargetOptions.triple = options.cubinTriple;
-  nvvmTargetOptions.chip = options.cubinChip;
-  nvvmTargetOptions.features = options.cubinFeatures;
-  nvvmTargetOptions.optLevel = options.optLevel;
-  pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
-
-  // Convert GPU to LLVM.
-  // TODO: C++20 designated initializers.
-  GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
-  // Note: hostBarePtrCallConv must be false for now otherwise
-  // gpu::HostRegister is ill-defined: it wants unranked memrefs but can't
-  // lower the to bare ptr.
-  gpuToLLVMConversionOptions.hostBarePtrCallConv =
-      options.hostUseBarePtrCallConv;
-  gpuToLLVMConversionOptions.kernelBarePtrCallConv =
-      options.kernelUseBarePtrCallConv;
-  gpuToLLVMConversionOptions.useOpaquePointers = true;
-
-  // TODO: something useful here.
-  // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
-  pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
-
-  // Serialize all GPU modules to binaries.
-  GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
-  gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
-  pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
-
-  // Convert vector to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
-  convertVectorToLLVMPassOptions.reassociateFPReductions = true;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
-
-  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
-  pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
-
-  // Convert Func to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions2;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  convertFuncToLLVMPassOptions2.indexBitwidth = options.hostIndexBitWidth;
-  convertFuncToLLVMPassOptions2.useBarePtrCallConv =
-      options.hostUseBarePtrCallConv;
-  convertFuncToLLVMPassOptions2.useOpaquePointers = true;
-  pm.addPass(createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions2));
-
-  // Sprinkle some cleanups.
-  pm.addPass(createCanonicalizerPass());
-  pm.addPass(createCSEPass());
-
-  // Finally we can reconcile unrealized casts.
-  pm.addPass(createReconcileUnrealizedCastsPass());
+  buildHostPostPipeline(pm, options);
 }
 } // namespace
 
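The hook that exposes this pipeline to mlir-opt is outside the shown hunks. As a minimal sketch, assuming the standard PassPipelineRegistration idiom from mlir/Pass/PassRegistry.h and a pipeline flag named test-lower-to-nvvm (both assumptions, not visible in this diff), the registration at the bottom of the file would look roughly like:

// Sketch only: assumes the usual MLIR test-pipeline registration idiom,
// not copied from this diff. PassPipelineRegistration parses the
// TestLowerToNVVMOptions declared above from the command line and hands
// them to buildLowerToNVVMPassPipeline.
#include "mlir/Pass/PassRegistry.h"

namespace mlir {
namespace test {
void registerTestLowerToNVVM() {
  PassPipelineRegistration<TestLowerToNVVMOptions>(
      "test-lower-to-nvvm", // assumed pipeline flag name
      "Lower host and GPU code to NVVM/LLVM and serialize GPU modules.",
      buildLowerToNVVMPassPipeline);
}
} // namespace test
} // namespace mlir

With such a hook registered, an integration test would drive the reordered pipeline with something like mlir-opt --test-lower-to-nvvm="cubin-format=bin index-bitwidth=64" input.mlir, where the option keys correspond to the fields of TestLowerToNVVMOptions above.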