[MLIR] Fixes NVGPU Integration Test Passes Ordering #69934
Conversation
The `test-lower-to-nvvm` pipeline, designed for the NVGPU dialect within GPU kernels, plays an important role in compiling integration tests. This PR restructures the passes and cleans up the code.
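For context, here is a minimal sketch of how a pipeline like this is typically exposed to mlir-opt through MLIR's PassPipelineRegistration. The registration function name and the description string are assumptions for illustration; the options struct and the pipeline builder are the ones in the diff below.

```cpp
#include "mlir/Pass/PassRegistry.h" // PassPipelineRegistration

// Hypothetical registration hook (the function name is illustrative). It
// wires TestLowerToNVVMOptions and buildLowerToNVVMPassPipeline from the
// patch below into a --test-lower-to-nvvm pipeline flag on mlir-opt.
void registerTestLowerToNVVMPipeline() {
  mlir::PassPipelineRegistration<TestLowerToNVVMOptions>(
      "test-lower-to-nvvm",
      "Lowers GPU/NVGPU code to NVVM and serializes GPU modules, for "
      "integration tests.",
      [](mlir::OpPassManager &pm, const TestLowerToNVVMOptions &options) {
        buildLowerToNVVMPassPipeline(pm, options);
      });
}
```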
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu Author: Guray Ozen (grypp). Full diff: https://github.com/llvm/llvm-project/pull/69934.diff (1 file affected):
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index b5af22f23a77cbc..c4cc0d5ae38d9be 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -28,6 +28,8 @@
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Linalg/Passes.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassOptions.h"
@@ -39,27 +41,11 @@ using namespace mlir;
namespace {
struct TestLowerToNVVMOptions
: public PassPipelineOptions<TestLowerToNVVMOptions> {
- PassOptions::Option<int64_t> hostIndexBitWidth{
- *this, "host-index-bitwidth",
+ PassOptions::Option<int64_t> indexBitWidth{
+ *this, "index-bitwidth",
llvm::cl::desc("Bitwidth of the index type for the host (warning this "
"should be 64 until the GPU layering is fixed)"),
llvm::cl::init(64)};
- PassOptions::Option<bool> hostUseBarePtrCallConv{
- *this, "host-bare-ptr-calling-convention",
- llvm::cl::desc(
- "Whether to use the bareptr calling convention on the host (warning "
- "this should be false until the GPU layering is fixed)"),
- llvm::cl::init(false)};
- PassOptions::Option<int64_t> kernelIndexBitWidth{
- *this, "kernel-index-bitwidth",
- llvm::cl::desc("Bitwidth of the index type for the GPU kernels"),
- llvm::cl::init(64)};
- PassOptions::Option<bool> kernelUseBarePtrCallConv{
- *this, "kernel-bare-ptr-calling-convention",
- llvm::cl::desc(
- "Whether to use the bareptr calling convention on the kernel "
- "(warning this should be false until the GPU layering is fixed)"),
- llvm::cl::init(false)};
PassOptions::Option<std::string> cubinTriple{
*this, "cubin-triple",
llvm::cl::desc("Triple to use to serialize to cubin."),
@@ -74,175 +60,78 @@ struct TestLowerToNVVMOptions
PassOptions::Option<std::string> cubinFormat{
*this, "cubin-format",
llvm::cl::desc("Compilation format to use to serialize to cubin."),
- llvm::cl::init("isa")};
+ llvm::cl::init("bin")};
PassOptions::Option<int> optLevel{
*this, "opt-level",
llvm::cl::desc("Optimization level for NVVM compilation"),
llvm::cl::init(2)};
};
+//===----------------------------------------------------------------------===//
+// Common pipeline
+//===----------------------------------------------------------------------===//
+void buildCommonPassPipeline(OpPassManager &pm,
+ const TestLowerToNVVMOptions &options) {
+ pm.addPass(createConvertNVGPUToNVVMPass());
+ pm.addPass(createGpuKernelOutliningPass());
+ pm.addPass(createConvertLinalgToLoopsPass());
+ pm.addPass(createConvertVectorToSCFPass());
+ pm.addPass(createConvertSCFToCFPass());
+ pm.addPass(createConvertNVVMToLLVMPass());
+ pm.addPass(createConvertVectorToLLVMPass());
+ pm.addPass(createConvertMathToLLVMPass());
+ pm.addPass(createFinalizeMemRefToLLVMConversionPass());
+ pm.addPass(createConvertFuncToLLVMPass());
+ pm.addPass(memref::createExpandStridedMetadataPass());
+
+ GpuNVVMAttachTargetOptions nvvmTargetOptions;
+ nvvmTargetOptions.triple = options.cubinTriple;
+ nvvmTargetOptions.chip = options.cubinChip;
+ nvvmTargetOptions.features = options.cubinFeatures;
+ nvvmTargetOptions.optLevel = options.optLevel;
+ pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
+ pm.addPass(createLowerAffinePass());
+ pm.addPass(createArithToLLVMConversionPass());
+ ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
+ convertIndexToLLVMPassOpt.indexBitwidth = options.indexBitWidth;
+ pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
+ pm.addPass(createCanonicalizerPass());
+ pm.addPass(createCSEPass());
+}
+
//===----------------------------------------------------------------------===//
// GPUModule-specific stuff.
//===----------------------------------------------------------------------===//
void buildGpuPassPipeline(OpPassManager &pm,
const TestLowerToNVVMOptions &options) {
pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
+ pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
+ pm.addNestedPass<gpu::GPUModuleOp>(createCanonicalizerPass());
+ pm.addNestedPass<gpu::GPUModuleOp>(createCSEPass());
+ pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
+}
- pm.addNestedPass<gpu::GPUModuleOp>(createConvertVectorToSCFPass());
- // Convert SCF to CF (always needed).
- pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
- // Convert Math to LLVM (always needed).
- pm.addNestedPass<gpu::GPUModuleOp>(createConvertMathToLLVMPass());
- // Expand complicated MemRef operations before lowering them.
- pm.addNestedPass<gpu::GPUModuleOp>(memref::createExpandStridedMetadataPass());
- // The expansion may create affine expressions. Get rid of them.
- pm.addNestedPass<gpu::GPUModuleOp>(createLowerAffinePass());
-
- // Convert MemRef to LLVM (always needed).
- // TODO: C++20 designated initializers.
- FinalizeMemRefToLLVMConversionPassOptions
- finalizeMemRefToLLVMConversionPassOptions;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- finalizeMemRefToLLVMConversionPassOptions.indexBitwidth =
- options.kernelIndexBitWidth;
- finalizeMemRefToLLVMConversionPassOptions.useOpaquePointers = true;
- pm.addNestedPass<gpu::GPUModuleOp>(createFinalizeMemRefToLLVMConversionPass(
- finalizeMemRefToLLVMConversionPassOptions));
-
- // Convert Func to LLVM (always needed).
- // TODO: C++20 designated initializers.
- ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- convertFuncToLLVMPassOptions.indexBitwidth = options.kernelIndexBitWidth;
- convertFuncToLLVMPassOptions.useBarePtrCallConv =
- options.kernelUseBarePtrCallConv;
- convertFuncToLLVMPassOptions.useOpaquePointers = true;
- pm.addNestedPass<gpu::GPUModuleOp>(
- createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions));
-
- // TODO: C++20 designated initializers.
- ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- convertIndexToLLVMPassOpt.indexBitwidth = options.kernelIndexBitWidth;
- pm.addNestedPass<gpu::GPUModuleOp>(
- createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
-
- // TODO: C++20 designated initializers.
- // The following pass is inconsistent.
- // TODO: fix inconsistence.
- ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
- convertGpuOpsToNVVMOpsOptions.useBarePtrCallConv =
- options.kernelUseBarePtrCallConv;
- convertGpuOpsToNVVMOpsOptions.indexBitwidth = options.kernelIndexBitWidth;
- convertGpuOpsToNVVMOpsOptions.useOpaquePointers = true;
- pm.addNestedPass<gpu::GPUModuleOp>(
- createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));
-
- pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
-
- // Convert vector to LLVM (always needed).
- // TODO: C++20 designated initializers.
- ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
- convertVectorToLLVMPassOptions.reassociateFPReductions = true;
- pm.addNestedPass<gpu::GPUModuleOp>(
- createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
-
- // This pass is needed for PTX building
- pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());
+//===----------------------------------------------------------------------===//
+// Host Post-GPU pipeline
+//===----------------------------------------------------------------------===//
+void buildHostPostPipeline(OpPassManager &pm,
+ const TestLowerToNVVMOptions &options) {
+ pm.addPass(createGpuToLLVMConversionPass());
- // Sprinkle some cleanups.
+ GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
+ gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
+ pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
pm.addPass(createCanonicalizerPass());
pm.addPass(createCSEPass());
-
- // Finally we can reconcile unrealized casts.
- pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());
+ pm.addPass(createReconcileUnrealizedCastsPass());
}
void buildLowerToNVVMPassPipeline(OpPassManager &pm,
const TestLowerToNVVMOptions &options) {
- // Start with a cleanup pass.
- pm.addPass(createCanonicalizerPass());
- pm.addPass(createCSEPass());
-
//===----------------------------------------------------------------------===//
- // NVGPU lowers device code as well as host code to the driver, so must run
- // before outlining.
+ // Common pipeline
//===----------------------------------------------------------------------===//
- // TODO: C++20 designated initializers.
- ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
- convertNVGPUToNVVMPassOptions.useOpaquePointers = true;
- pm.addNestedPass<func::FuncOp>(
- createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
-
- //===----------------------------------------------------------------------===//
- // Host-specific stuff.
- //===----------------------------------------------------------------------===//
- // Important, must be run at the top-level.
- pm.addPass(createGpuKernelOutliningPass());
-
- // Important, all host passes must be run at the func level so that host
- // conversions can remain with 64 bit indices without polluting the GPU
- // kernel that may have 32 bit indices.
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- pm.addNestedPass<func::FuncOp>(createConvertVectorToSCFPass());
- // Convert SCF to CF (always needed).
- pm.addNestedPass<func::FuncOp>(createConvertSCFToCFPass());
- // Convert Math to LLVM (always needed).
- pm.addNestedPass<func::FuncOp>(createConvertMathToLLVMPass());
- // Expand complicated MemRef operations before lowering them.
- pm.addNestedPass<func::FuncOp>(memref::createExpandStridedMetadataPass());
- // The expansion may create affine expressions. Get rid of them.
- pm.addNestedPass<func::FuncOp>(createLowerAffinePass());
-
- // Convert MemRef to LLVM (always needed).
- // TODO: C++20 designated initializers.
- FinalizeMemRefToLLVMConversionPassOptions
- finalizeMemRefToLLVMConversionPassOptions;
- finalizeMemRefToLLVMConversionPassOptions.useAlignedAlloc = true;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- finalizeMemRefToLLVMConversionPassOptions.indexBitwidth =
- options.hostIndexBitWidth;
- finalizeMemRefToLLVMConversionPassOptions.useOpaquePointers = true;
- pm.addNestedPass<func::FuncOp>(createFinalizeMemRefToLLVMConversionPass(
- finalizeMemRefToLLVMConversionPassOptions));
-
- // Convert Func to LLVM (always needed).
- // TODO: C++20 designated initializers.
- ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- convertFuncToLLVMPassOptions.indexBitwidth = options.hostIndexBitWidth;
- convertFuncToLLVMPassOptions.useBarePtrCallConv =
- options.hostUseBarePtrCallConv;
- convertFuncToLLVMPassOptions.useOpaquePointers = true;
- pm.addNestedPass<func::FuncOp>(
- createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions));
-
- // TODO: C++20 designated initializers.
- ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- convertIndexToLLVMPassOpt.indexBitwidth = options.hostIndexBitWidth;
- pm.addNestedPass<func::FuncOp>(
- createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt));
-
- pm.addNestedPass<func::FuncOp>(createArithToLLVMConversionPass());
-
- // Sprinkle some cleanups.
- pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
- pm.addNestedPass<func::FuncOp>(createCSEPass());
+ buildCommonPassPipeline(pm, options);
//===----------------------------------------------------------------------===//
// GPUModule-specific stuff.
@@ -252,68 +141,7 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
//===----------------------------------------------------------------------===//
// Host post-GPUModule-specific stuff.
//===----------------------------------------------------------------------===//
- // Attach an NVVM target to all the GPU modules with the provided target
- // options.
- // TODO: C++20 designated initializers.
- GpuNVVMAttachTargetOptions nvvmTargetOptions;
- nvvmTargetOptions.triple = options.cubinTriple;
- nvvmTargetOptions.chip = options.cubinChip;
- nvvmTargetOptions.features = options.cubinFeatures;
- nvvmTargetOptions.optLevel = options.optLevel;
- pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
-
- // Convert GPU to LLVM.
- // TODO: C++20 designated initializers.
- GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
- // Note: hostBarePtrCallConv must be false for now otherwise
- // gpu::HostRegister is ill-defined: it wants unranked memrefs but can't
- // lower the to bare ptr.
- gpuToLLVMConversionOptions.hostBarePtrCallConv =
- options.hostUseBarePtrCallConv;
- gpuToLLVMConversionOptions.kernelBarePtrCallConv =
- options.kernelUseBarePtrCallConv;
- gpuToLLVMConversionOptions.useOpaquePointers = true;
-
- // TODO: something useful here.
- // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
- pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));
-
- // Serialize all GPU modules to binaries.
- GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
- gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
- pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
-
- // Convert vector to LLVM (always needed).
- // TODO: C++20 designated initializers.
- ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;
- convertVectorToLLVMPassOptions.reassociateFPReductions = true;
- pm.addNestedPass<func::FuncOp>(
- createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions));
-
- ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- // TODO: fix GPU layering.
- convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
- pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));
-
- // Convert Func to LLVM (always needed).
- // TODO: C++20 designated initializers.
- ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions2;
- // Must be 64b on the host, things don't compose properly around
- // gpu::LaunchOp and gpu::HostRegisterOp.
- convertFuncToLLVMPassOptions2.indexBitwidth = options.hostIndexBitWidth;
- convertFuncToLLVMPassOptions2.useBarePtrCallConv =
- options.hostUseBarePtrCallConv;
- convertFuncToLLVMPassOptions2.useOpaquePointers = true;
- pm.addPass(createConvertFuncToLLVMPass(convertFuncToLLVMPassOptions2));
-
- // Sprinkle some cleanups.
- pm.addPass(createCanonicalizerPass());
- pm.addPass(createCSEPass());
-
- // Finally we can reconcile unrealized casts.
- pm.addPass(createReconcileUnrealizedCastsPass());
+ buildHostPostPipeline(pm, options);
}
} // namespace
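As a usage note, here is a hypothetical test invocation of the restructured pipeline. The flag names follow the options struct above, while the sm_90 chip value is an assumption for illustration; cubin-format=bin is the new default from this patch.

```cpp
// Hypothetical RUN line for an integration test driving this pipeline
// (flag values are illustrative assumptions):
// RUN: mlir-opt %s --test-lower-to-nvvm="cubin-chip=sm_90 cubin-format=bin"
```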
Shall the title be "Refactor and cleanup the NVGPU Pass Pipeline for Integration Tests (NFC)"? There is no "fix" here (or I'd ask for a test).
It is also a fix that reorders the passes. I mentioned the test in the PR.
Does it not show up on any test? Do you have one that is sensitive to the ordering, by any chance? Edit: I see that the description mentions it is needed for #69913. LG then.
Also, should the title be "Fixes NVGPU Integration Test Passes Ordering"?
In the previous order, some 'unrealized_conversion_cast' ops were left in the IR; the current order resolves this. To be honest, I didn't check the root cause deeply, but the current order looked OK and solved the problem.
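To illustrate why the ordering matters, here is a minimal sketch of the assumed failure mode (my assumption, not a verified root cause): reconcile-unrealized-casts only cancels matching cast pairs that already exist, so it must run after every conversion pass that can materialize them.

```cpp
// Minimal sketch, assuming the same includes and using-declarations as
// TestLowerToNVVM.cpp above. The passes are real; the failure mode
// described here is an assumption.
void sketchReconcileOrdering(OpPassManager &pm) {
  // Too early: the conversion below may still materialize
  // builtin.unrealized_conversion_cast ops that nothing cleans up.
  // pm.addPass(createReconcileUnrealizedCastsPass());
  pm.addPass(createConvertFuncToLLVMPass()); // may emit fresh casts
  // Correct, as in this patch: reconcile once, at the very end.
  pm.addPass(createReconcileUnrealizedCastsPass());
}
```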
This broke the build @grypp: https://lab.llvm.org/buildbot/#/builders/61/builds/50847
Strange; I submitted #70052, which solved the issue on my local machine. I am looking into it.
The latest build has 2 failures instead of 4:
llvm#69934 broke the build of some integration tests. Apparently, these tests rely on two flags: kernel-bare-ptr-calling-convention and host-bare-ptr-calling-convention. So this PR brings them back.
#69934 broke integration tests that rely on the kernel-bare-ptr-calling-convention and host-bare-ptr-calling-convention flags. This PR brings these flags back. It also removes the kernel-index-bitwidth flag, since the kernel index size depends on the host; separating the host (64-bit) and kernel (32-bit) index bitwidths is not viable.
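For reference, a minimal sketch of the restored options, mirroring the PassOptions pattern that this PR had removed (the exact description strings are assumptions):

```cpp
// Restored flags, following the same pattern as the removed declarations
// in the diff above (description wording is assumed):
PassOptions::Option<bool> hostUseBarePtrCallConv{
    *this, "host-bare-ptr-calling-convention",
    llvm::cl::desc("Whether to use the bare-pointer calling convention on "
                   "the host"),
    llvm::cl::init(false)};
PassOptions::Option<bool> kernelUseBarePtrCallConv{
    *this, "kernel-bare-ptr-calling-convention",
    llvm::cl::desc("Whether to use the bare-pointer calling convention on "
                   "the kernel"),
    llvm::cl::init(false)};
```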
#70113 fixed the problems. The build is green again.
…#70028) PR #69913 added a GEMM test (128x128x128 F32 += F16 * F16) with an if-statement. This PR adds the same test using predicates in PTX. Predicate support is enabled using _BasicPtxBuilderInterface_ `(nvgpu.opcode ..., predicate = %pred)`. The predicate condition is computed in `Step 2. [GPU] Elect fastest thread in CTA`, inspired by cutlass. It is as follows:

```
lane_predicate = nvvm.elect.sync
warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0)
warp_idx_in_warp_group = warp_idx % 4
predicate = (lane_predicate & warp_idx_in_warp_group)
```

Depends on #70027 #69934 #69935 #69584
The `test-lower-to-nvvm` pipeline, designed for the NVGPU dialect within GPU kernels, plays an important role in compiling integration tests. This PR restructures the passes and cleans up the code. It also fixes the pass ordering; this fix is needed for #69913.