 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Linalg/Passes.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassOptions.h"
@@ -39,27 +41,11 @@ using namespace mlir;
 namespace {
 struct TestLowerToNVVMOptions
     : public PassPipelineOptions<TestLowerToNVVMOptions> {
-  PassOptions::Option<int64_t> hostIndexBitWidth{
-      *this, "host-index-bitwidth",
+  PassOptions::Option<int64_t> indexBitWidth{
+      *this, "index-bitwidth",
       llvm::cl::desc("Bitwidth of the index type for the host (warning this "
                      "should be 64 until the GPU layering is fixed)"),
       llvm::cl::init(64)};
-  PassOptions::Option<bool> hostUseBarePtrCallConv{
-      *this, "host-bare-ptr-calling-convention",
-      llvm::cl::desc(
-          "Whether to use the bareptr calling convention on the host (warning "
-          "this should be false until the GPU layering is fixed)"),
-      llvm::cl::init(false)};
-  PassOptions::Option<int64_t> kernelIndexBitWidth{
-      *this, "kernel-index-bitwidth",
-      llvm::cl::desc("Bitwidth of the index type for the GPU kernels"),
-      llvm::cl::init(64)};
-  PassOptions::Option<bool> kernelUseBarePtrCallConv{
-      *this, "kernel-bare-ptr-calling-convention",
-      llvm::cl::desc(
-          "Whether to use the bareptr calling convention on the kernel "
-          "(warning this should be false until the GPU layering is fixed)"),
-      llvm::cl::init(false)};
   PassOptions::Option<std::string> cubinTriple{
       *this, "cubin-triple",
       llvm::cl::desc("Triple to use to serialize to cubin."),
@@ -74,175 +60,78 @@ struct TestLowerToNVVMOptions
   PassOptions::Option<std::string> cubinFormat{
       *this, "cubin-format",
       llvm::cl::desc("Compilation format to use to serialize to cubin."),
-      llvm::cl::init("isa")};
+      llvm::cl::init("bin")};
   PassOptions::Option<int> optLevel{
       *this, "opt-level",
       llvm::cl::desc("Optimization level for NVVM compilation"),
       llvm::cl::init(2)};
 };
 
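For illustration only, a minimal sketch (not part of the patch) of how the trimmed option surface might be driven programmatically. Only TestLowerToNVVMOptions and buildLowerToNVVMPassPipeline (added further down) come from this file; the wrapper function and its use are assumptions.

// Hypothetical driver: populate the simplified options and run the pipeline
// assembled by buildLowerToNVVMPassPipeline below.
static mlir::LogicalResult runTestLowerToNVVM(mlir::ModuleOp module) {
  TestLowerToNVVMOptions options;
  // A single bit width now covers host and kernel code.
  options.indexBitWidth = 64;
  // "bin" is the new default compilation target handed to gpu-module-to-binary.
  options.cubinFormat = "bin";
  mlir::PassManager pm(module.getContext());
  buildLowerToNVVMPassPipeline(pm, options);
  return pm.run(module);
}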
70
+ // ===----------------------------------------------------------------------===//
71
+ // Common pipeline
72
+ // ===----------------------------------------------------------------------===//
73
+ void buildCommonPassPipeline (OpPassManager &pm,
74
+ const TestLowerToNVVMOptions &options) {
75
+ pm.addPass (createConvertNVGPUToNVVMPass ());
76
+ pm.addPass (createGpuKernelOutliningPass ());
77
+ pm.addPass (createConvertLinalgToLoopsPass ());
78
+ pm.addPass (createConvertVectorToSCFPass ());
79
+ pm.addPass (createConvertSCFToCFPass ());
80
+ pm.addPass (createConvertNVVMToLLVMPass ());
81
+ pm.addPass (createConvertVectorToLLVMPass ());
82
+ pm.addPass (createConvertMathToLLVMPass ());
83
+ pm.addPass (createFinalizeMemRefToLLVMConversionPass ());
84
+ pm.addPass (createConvertFuncToLLVMPass ());
85
+ pm.addPass (memref::createExpandStridedMetadataPass ());
86
+
87
+ GpuNVVMAttachTargetOptions nvvmTargetOptions;
88
+ nvvmTargetOptions.triple = options.cubinTriple ;
89
+ nvvmTargetOptions.chip = options.cubinChip ;
90
+ nvvmTargetOptions.features = options.cubinFeatures ;
91
+ nvvmTargetOptions.optLevel = options.optLevel ;
92
+ pm.addPass (createGpuNVVMAttachTarget (nvvmTargetOptions));
93
+ pm.addPass (createLowerAffinePass ());
94
+ pm.addPass (createArithToLLVMConversionPass ());
95
+ ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
96
+ convertIndexToLLVMPassOpt.indexBitwidth = options.indexBitWidth ;
97
+ pm.addPass (createConvertIndexToLLVMPass (convertIndexToLLVMPassOpt));
98
+ pm.addPass (createCanonicalizerPass ());
99
+ pm.addPass (createCSEPass ());
100
+ }
101
+
 //===----------------------------------------------------------------------===//
 // GPUModule-specific stuff.
 //===----------------------------------------------------------------------===//
 void buildGpuPassPipeline(OpPassManager &pm,
                           const TestLowerToNVVMOptions &options) {
   pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
+  pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
+}
 
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertVectorToSCFPass());
-  // Convert SCF to CF (always needed).
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
-  // Convert Math to LLVM (always needed).
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertMathToLLVMPass());
-  // Expand complicated MemRef operations before lowering them.
-  pm.addNestedPass<gpu::GPUModuleOp>(memref::createExpandStridedMetadataPass());
-  // The expansion may create affine expressions. Get rid of them.
-  pm.addNestedPass<gpu::GPUModuleOp>(createLowerAffinePass());
-
-  // Convert MemRef to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  FinalizeMemRefToLLVMConversionPassOptions
-      finalizeMemRefToLLVMConversionPassOptions;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  finalizeMemRefToLLVMConversionPassOptions.indexBitwidth =
-      options.kernelIndexBitWidth;
-  finalizeMemRefToLLVMConversionPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(createFinalizeMemRefToLLVMConversionPass(
-      finalizeMemRefToLLVMConversionPassOptions));
-
-  // Convert Func to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertFuncToLLVMPassOptions.indexBitwidth = options.kernelIndexBitWidth;
-  convertFuncToLLVMPassOptions.useBarePtrCallConv =
-      options.kernelUseBarePtrCallConv;
-  convertFuncToLLVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions));
-
-  // TODO: C++20 designated initializers.
-  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertIndexToLLVMPassOpt.indexBitwidth = options.kernelIndexBitWidth;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
-
-  // TODO: C++20 designated initializers.
-  // The following pass is inconsistent.
-  // TODO: fix inconsistence.
-  ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
-  convertGpuOpsToNVVMOpsOptions.useBarePtrCallConv =
-      options.kernelUseBarePtrCallConv;
-  convertGpuOpsToNVVMOpsOptions.indexBitwidth = options.kernelIndexBitWidth;
-  convertGpuOpsToNVVMOpsOptions.useOpaquePointers = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));
-
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
-
-  // Convert vector to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
-  convertVectorToLLVMPassOptions.reassociateFPReductions = true;
-  pm.addNestedPass<gpu::GPUModuleOp>(
-      createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
-
-  // This pass is needed for PTX building
-  pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());
+//===----------------------------------------------------------------------===//
+// Host Post-GPU pipeline
+//===----------------------------------------------------------------------===//
+void buildHostPostPipeline(OpPassManager &pm,
+                           const TestLowerToNVVMOptions &options) {
+  pm.addPass(createGpuToLLVMConversionPass());
 
-  // Sprinkle some cleanups.
+  GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
+  gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
+  pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
-
-  // Finally we can reconcile unrealized casts.
-  pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
+  pm.addPass(createReconcileUnrealizedCastsPass());
 }
 
 void buildLowerToNVVMPassPipeline(OpPassManager &pm,
                                   const TestLowerToNVVMOptions &options) {
-  // Start with a cleanup pass.
-  pm.addPass(createCanonicalizerPass());
-  pm.addPass(createCSEPass());
-
   //===----------------------------------------------------------------------===//
-  // NVGPU lowers device code as well as host code to the driver, so must run
-  // before outlining.
+  // Common pipeline
   //===----------------------------------------------------------------------===//
-  // TODO: C++20 designated initializers.
-  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
-  convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
-
-  //===----------------------------------------------------------------------===//
-  // Host-specific stuff.
-  //===----------------------------------------------------------------------===//
-  // Important, must be run at the top-level.
-  pm.addPass(createGpuKernelOutliningPass());
-
-  // Important, all host passes must be run at the func level so that host
-  // conversions can remain with 64 bit indices without polluting the GPU
-  // kernel that may have 32 bit indices.
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  pm.addNestedPass<func::FuncOp>(createConvertVectorToSCFPass());
-  // Convert SCF to CF (always needed).
-  pm.addNestedPass<func::FuncOp>(createConvertSCFToCFPass());
-  // Convert Math to LLVM (always needed).
-  pm.addNestedPass<func::FuncOp>(createConvertMathToLLVMPass());
-  // Expand complicated MemRef operations before lowering them.
-  pm.addNestedPass<func::FuncOp>(memref::createExpandStridedMetadataPass());
-  // The expansion may create affine expressions. Get rid of them.
-  pm.addNestedPass<func::FuncOp>(createLowerAffinePass());
-
-  // Convert MemRef to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  FinalizeMemRefToLLVMConversionPassOptions
-      finalizeMemRefToLLVMConversionPassOptions;
-  finalizeMemRefToLLVMConversionPassOptions.useAlignedAlloc = true;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  finalizeMemRefToLLVMConversionPassOptions.indexBitwidth =
-      options.hostIndexBitWidth;
-  finalizeMemRefToLLVMConversionPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<func::FuncOp>(createFinalizeMemRefToLLVMConversionPass(
-      finalizeMemRefToLLVMConversionPassOptions));
-
-  // Convert Func to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertFuncToLLVMPassOptions.indexBitwidth = options.hostIndexBitWidth;
-  convertFuncToLLVMPassOptions.useBarePtrCallConv =
-      options.hostUseBarePtrCallConv;
-  convertFuncToLLVMPassOptions.useOpaquePointers = true;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions));
-
-  // TODO: C++20 designated initializers.
-  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertIndexToLLVMPassOpt.indexBitwidth = options.hostIndexBitWidth;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
-
-  pm.addNestedPass<func::FuncOp>(createArithToLLVMConversionPass());
-
-  // Sprinkle some cleanups.
-  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
-  pm.addNestedPass<func::FuncOp>(createCSEPass());
+  buildCommonPassPipeline(pm, options);
 
   //===----------------------------------------------------------------------===//
   // GPUModule-specific stuff.
@@ -252,68 +141,7 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
   //===----------------------------------------------------------------------===//
   // Host post-GPUModule-specific stuff.
   //===----------------------------------------------------------------------===//
-  // Attach an NVVM target to all the GPU modules with the provided target
-  // options.
-  // TODO: C++20 designated initializers.
-  GpuNVVMAttachTargetOptions nvvmTargetOptions;
-  nvvmTargetOptions.triple = options.cubinTriple;
-  nvvmTargetOptions.chip = options.cubinChip;
-  nvvmTargetOptions.features = options.cubinFeatures;
-  nvvmTargetOptions.optLevel = options.optLevel;
-  pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
-
-  // Convert GPU to LLVM.
-  // TODO: C++20 designated initializers.
-  GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
-  // Note: hostBarePtrCallConv must be false for now otherwise
-  // gpu::HostRegister is ill-defined: it wants unranked memrefs but can't
-  // lower the to bare ptr.
-  gpuToLLVMConversionOptions.hostBarePtrCallConv =
-      options.hostUseBarePtrCallConv;
-  gpuToLLVMConversionOptions.kernelBarePtrCallConv =
-      options.kernelUseBarePtrCallConv;
-  gpuToLLVMConversionOptions.useOpaquePointers = true;
-
-  // TODO: something useful here.
-  // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
-  pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
-
-  // Serialize all GPU modules to binaries.
-  GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
-  gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
-  pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
-
-  // Convert vector to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
-  convertVectorToLLVMPassOptions.reassociateFPReductions = true;
-  pm.addNestedPass<func::FuncOp>(
-      createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
-
-  ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  // TODO: fix GPU layering.
-  convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
-  pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
-
-  // Convert Func to LLVM (always needed).
-  // TODO: C++20 designated initializers.
-  ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions2;
-  // Must be 64b on the host, things don't compose properly around
-  // gpu::LaunchOp and gpu::HostRegisterOp.
-  convertFuncToLLVMPassOptions2.indexBitwidth = options.hostIndexBitWidth;
-  convertFuncToLLVMPassOptions2.useBarePtrCallConv =
-      options.hostUseBarePtrCallConv;
-  convertFuncToLLVMPassOptions2.useOpaquePointers = true;
-  pm.addPass(createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions2));
-
-  // Sprinkle some cleanups.
-  pm.addPass(createCanonicalizerPass());
-  pm.addPass(createCSEPass());
-
-  // Finally we can reconcile unrealized casts.
-  pm.addPass(createReconcileUnrealizedCastsPass());
+  buildHostPostPipeline(pm, options);
 }
 } // namespace
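If this file follows the usual mlir-opt test pattern, the rebuilt pipeline would be exposed through a PassPipelineRegistration so the options above become command-line flags. A sketch under that assumption; the registration name, description, and wrapper function are not shown in this diff and are assumptions:

// Hypothetical registration: makes buildLowerToNVVMPassPipeline available as a
// textual pipeline whose flags mirror TestLowerToNVVMOptions
// (index-bitwidth, cubin-triple, cubin-chip, cubin-features, cubin-format, opt-level).
void registerTestLowerToNVVM() {
  PassPipelineRegistration<TestLowerToNVVMOptions>(
      "test-lower-to-nvvm",
      "Lowers host and GPU code to NVVM/LLVM and serializes GPU modules.",
      buildLowerToNVVMPassPipeline);
}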