llvm
diff --git a/‎mlir/docs/Dialects/GPU.md
Lines changed: 44 additions & 0 deletions b/‎mlir/docs/Dialects/GPU.md
Lines changed: 44 additions & 0 deletions
diff --git a/‎mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h
Lines changed: 56 additions & 0 deletions b/‎mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h
Lines changed: 56 additions & 0 deletions
diff --git a/‎mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
Lines changed: 15 additions & 63 deletions b/‎mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
Lines changed: 15 additions & 63 deletions
diff --git a/‎mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/‎mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/‎mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/‎mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
Lines changed: 3 additions & 3 deletions b/‎mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
Lines changed: 3 additions & 3 deletions
diff --git a/‎mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/‎mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/‎mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/‎mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/‎mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/‎mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
Lines changed: 2 additions & 2 deletions b/‎mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir
Lines changed: 2 additions & 2 deletions
diff --git a/‎mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/GPU/CUDA/all-reduce-maxsi.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/‎mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/GPU/CUDA/all-reduce-minsi.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/‎mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/‎mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/‎mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/‎mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
Lines changed: 1 addition & 1 deletion
diff --git a/‎mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
Lines changed: 1 addition & 1 deletion b/‎mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
Lines changed: 1 addition & 1 deletion
@@ -60,6 +60,50 @@ mlir-translate example-nvvm.mlir        \
   -o example.ll
 ```
 
+### Default NVVM Compilation Pipeline: gpu-lower-to-nvvm-pipeline
+
+The `gpu-lower-to-nvvm-pipeline` compilation pipeline serves as the default way
+for NVVM target compilation within MLIR. This pipeline operates by lowering
+primary dialects (arith, memref, scf, vector, gpu, and nvgpu) to NVVM target. It
+begins by lowering GPU code region(s) to the specified NVVM compilation target
+and subsequently handles the host code.
+
+This pipeline specifically requires explicitly parallel IR and doesn't do GPU
+parallelization. To enable parallelism, necessary transformations must be
+applied before utilizing this pipeline.
+
+It's designed to provide a generic solution for NVVM targets, generating NVVM
+and LLVM dialect code compatible with `mlir-cpu-runner` or the execution engine.
+
+#### Example:
+
+Here's a snippet illustrating the use of primary dialects, including arith,
+within GPU code execution:
+
+```
+func.func @main() {
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    gpu.launch 
+        blocks(%0, %1, %2) in (%3 = %c1, %4 = %c1, %5 = %c1) 
+        threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) { 
+        gpu.printf "Hello from %d\n" %6 : index
+        gpu.terminator
+    }
+    return
+}
+```
+
+The `gpu-lower-to-nvvm-pipeline` pipeline compiles this input code to NVVM
+format as shown below. It provides customization options such as the SM
+capability, PTX version, and optimization level. Once compiled, the resulting
+IR is ready for execution using `mlir-cpu-runner`. Alternatively, it can be
+translated into LLVM IR, expanding its utility within the system.
+
+```
+mlir-opt example.mlir -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_90a cubin-features=+ptx80 opt-level=3"
+```
+
 ### Module serialization
 Attributes implementing the GPU Target Attribute Interface handle the
 serialization process and are called Target attributes. These attributes can be
 
@@ -9,9 +9,65 @@
 #ifndef MLIR_DIALECT_GPU_PIPELINES_PASSES_H_
 #define MLIR_DIALECT_GPU_PIPELINES_PASSES_H_
 
+#include "mlir/Pass/PassOptions.h"
+
 namespace mlir {
 namespace gpu {
+
+/// Options for the gpu to nvvm pipeline.
+struct GPUToNVVMPipelineOptions
+    : public PassPipelineOptions<GPUToNVVMPipelineOptions> {
+  PassOptions::Option<int64_t> indexBitWidth{
+      *this, "index-bitwidth",
+      llvm::cl::desc("Bitwidth of the index type for the host (warning this "
+                     "should be 64 until the GPU layering is fixed)"),
+      llvm::cl::init(64)};
+  PassOptions::Option<std::string> cubinTriple{
+      *this, "cubin-triple",
+      llvm::cl::desc("Triple to use to serialize to cubin."),
+      llvm::cl::init("nvptx64-nvidia-cuda")};
+  PassOptions::Option<std::string> cubinChip{
+      *this, "cubin-chip", llvm::cl::desc("Chip to use to serialize to cubin."),
+      llvm::cl::init("sm_50")};
+  PassOptions::Option<std::string> cubinFeatures{
+      *this, "cubin-features",
+      llvm::cl::desc("Features to use to serialize to cubin."),
+      llvm::cl::init("+ptx60")};
+  PassOptions::Option<std::string> cubinFormat{
+      *this, "cubin-format",
+      llvm::cl::desc("Compilation format to use to serialize to cubin."),
+      llvm::cl::init("fatbin")};
+  PassOptions::Option<int> optLevel{
+      *this, "opt-level",
+      llvm::cl::desc("Optimization level for NVVM compilation"),
+      llvm::cl::init(2)};
+  PassOptions::Option<bool> kernelUseBarePtrCallConv{
+      *this, "kernel-bare-ptr-calling-convention",
+      llvm::cl::desc(
+          "Whether to use the bareptr calling convention on the kernel "
+          "(warning this should be false until the GPU layering is fixed)"),
+      llvm::cl::init(false)};
+  PassOptions::Option<bool> hostUseBarePtrCallConv{
+      *this, "host-bare-ptr-calling-convention",
+      llvm::cl::desc(
+          "Whether to use the bareptr calling convention on the host (warning "
+          "this should be false until the GPU layering is fixed)"),
+      llvm::cl::init(false)};
+};
+
+//===----------------------------------------------------------------------===//
+// Building and Registering.
+//===----------------------------------------------------------------------===//
+
+/// Adds the GPU to NVVM pipeline to the given pass manager. Transforms main
+/// dialects into NVVM targets. Begins with GPU code regions, then handles host
+/// code.
+void buildLowerToNVVMPassPipeline(OpPassManager &pm,
+                                  const GPUToNVVMPipelineOptions &options);
+
+/// Register all pipelines for the `gpu` dialect.
 void registerGPUToNVVMPipeline();
+
 } // namespace gpu
 } // namespace mlir
 
 
@@ -40,54 +40,14 @@ using namespace mlir;
 
 #if MLIR_CUDA_CONVERSIONS_ENABLED
 namespace {
-struct GPUToNVVMPipelineOptions
-    : public PassPipelineOptions<GPUToNVVMPipelineOptions> {
-  PassOptions::Option<int64_t> indexBitWidth{
-      *this, "index-bitwidth",
-      llvm::cl::desc("Bitwidth of the index type for the host (warning this "
-                     "should be 64 until the GPU layering is fixed)"),
-      llvm::cl::init(64)};
-  PassOptions::Option<std::string> cubinTriple{
-      *this, "cubin-triple",
-      llvm::cl::desc("Triple to use to serialize to cubin."),
-      llvm::cl::init("nvptx64-nvidia-cuda")};
-  PassOptions::Option<std::string> cubinChip{
-      *this, "cubin-chip", llvm::cl::desc("Chip to use to serialize to cubin."),
-      llvm::cl::init("sm_50")};
-  PassOptions::Option<std::string> cubinFeatures{
-      *this, "cubin-features",
-      llvm::cl::desc("Features to use to serialize to cubin."),
-      llvm::cl::init("+ptx60")};
-  PassOptions::Option<std::string> cubinFormat{
-      *this, "cubin-format",
-      llvm::cl::desc("Compilation format to use to serialize to cubin."),
-      llvm::cl::init("fatbin")};
-  PassOptions::Option<int> optLevel{
-      *this, "opt-level",
-      llvm::cl::desc("Optimization level for NVVM compilation"),
-      llvm::cl::init(2)};
-  PassOptions::Option<bool> kernelUseBarePtrCallConv{
-      *this, "kernel-bare-ptr-calling-convention",
-      llvm::cl::desc(
-          "Whether to use the bareptr calling convention on the kernel "
-          "(warning this should be false until the GPU layering is fixed)"),
-      llvm::cl::init(false)};
-  PassOptions::Option<bool> hostUseBarePtrCallConv{
-      *this, "host-bare-ptr-calling-convention",
-      llvm::cl::desc(
-          "Whether to use the bareptr calling convention on the host (warning "
-          "this should be false until the GPU layering is fixed)"),
-      llvm::cl::init(false)};
-};
 
 //===----------------------------------------------------------------------===//
 // Common pipeline
 //===----------------------------------------------------------------------===//
-void buildCommonPassPipeline(OpPassManager &pm,
-                             const GPUToNVVMPipelineOptions &options) {
+void buildCommonPassPipeline(
+    OpPassManager &pm, const mlir::gpu::GPUToNVVMPipelineOptions &options) {
   pm.addPass(createConvertNVGPUToNVVMPass());
   pm.addPass(createGpuKernelOutliningPass());
-  pm.addPass(createConvertLinalgToLoopsPass());
   pm.addPass(createConvertVectorToSCFPass());
   pm.addPass(createConvertSCFToCFPass());
   pm.addPass(createConvertNVVMToLLVMPass());
@@ -114,7 +74,7 @@ void buildCommonPassPipeline(OpPassManager &pm,
 // GPUModule-specific stuff.
 //===----------------------------------------------------------------------===//
 void buildGpuPassPipeline(OpPassManager &pm,
-                          const GPUToNVVMPipelineOptions &options) {
+                          const mlir::gpu::GPUToNVVMPipelineOptions &options) {
   pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
   ConvertGpuOpsToNVVMOpsOptions opt;
   opt.useBarePtrCallConv = options.kernelUseBarePtrCallConv;
@@ -129,7 +89,7 @@ void buildGpuPassPipeline(OpPassManager &pm,
 // Host Post-GPU pipeline
 //===----------------------------------------------------------------------===//
 void buildHostPostPipeline(OpPassManager &pm,
-                           const GPUToNVVMPipelineOptions &options) {
+                           const mlir::gpu::GPUToNVVMPipelineOptions &options) {
   GpuToLLVMConversionPassOptions opt;
   opt.hostBarePtrCallConv = options.hostUseBarePtrCallConv;
   opt.kernelBarePtrCallConv = options.kernelUseBarePtrCallConv;
@@ -143,36 +103,28 @@ void buildHostPostPipeline(OpPassManager &pm,
   pm.addPass(createReconcileUnrealizedCastsPass());
 }
 
-void buildLowerToNVVMPassPipeline(OpPassManager &pm,
-                                  const GPUToNVVMPipelineOptions &options) {
-  //===----------------------------------------------------------------------===//
-  // Common pipeline
-  //===----------------------------------------------------------------------===//
+} // namespace
+
+void mlir::gpu::buildLowerToNVVMPassPipeline(
+    OpPassManager &pm, const GPUToNVVMPipelineOptions &options) {
+  // Common pipelines
   buildCommonPassPipeline(pm, options);
 
-  //===----------------------------------------------------------------------===//
-  // GPUModule-specific stuff.
-  //===----------------------------------------------------------------------===//
+  // GPUModule-specific stuff
   buildGpuPassPipeline(pm, options);
 
-  //===----------------------------------------------------------------------===//
-  // Host post-GPUModule-specific stuff.
-  //===----------------------------------------------------------------------===//
+  // Host post-GPUModule-specific stuff
   buildHostPostPipeline(pm, options);
 }
-} // namespace
 
-namespace mlir {
-namespace gpu {
-void registerGPUToNVVMPipeline() {
+void mlir::gpu::registerGPUToNVVMPipeline() {
   PassPipelineRegistration<GPUToNVVMPipelineOptions>(
-      "gpu-lower-to-nvvm",
-      "The default pipeline lowers main dialects (arith, linalg, memref, scf, "
+      "gpu-lower-to-nvvm-pipeline",
+      "The default pipeline lowers main dialects (arith, memref, scf, "
       "vector, gpu, and nvgpu) to NVVM. It starts by lowering GPU code to the "
       "specified compilation target (default is fatbin) then lowers the host "
       "code.",
       buildLowerToNVVMPassPipeline);
 }
-} // namespace gpu
-} // namespace mlir
+
 #endif // MLIR_CUDA_CONVERSIONS_ENABLED
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN:  | mlir-opt -gpu-lower-to-nvvm -debug-only=serialize-to-isa \
+// RUN:  | mlir-opt -gpu-lower-to-nvvm-pipeline -debug-only=serialize-to-isa \
 // RUN:  2>&1 | FileCheck %s
 
 // CHECK: Generated by LLVM NVPTX Back-End
 
@@ -4,7 +4,7 @@
 // RUN: mlir-opt \
 // RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
 // RUN: %s \
-// RUN: | mlir-opt --gpu-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt --gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_c_runner_utils \
 
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize |\
 // RUN: mlir-opt -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if |\
 // RUN: mlir-opt -lower-affine -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm \
-// RUN:  -convert-arith-to-llvm -gpu-lower-to-nvvm | \
+// RUN:  -convert-arith-to-llvm -gpu-lower-to-nvvm-pipeline | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_cuda_runtime \
 // RUN:   -shared-libs=%mlir_c_runner_utils \
 
@@ -2,7 +2,7 @@
 // everything on the same thread.
 // RUN: mlir-opt %s -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN:  -gpu-lower-to-nvvm | \
+// RUN:  -gpu-lower-to-nvvm-pipeline | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_cuda_runtime \
 // RUN:   -shared-libs=%mlir_c_runner_utils \
@@ -13,7 +13,7 @@
 // RUN: mlir-opt %s  -test-vector-warp-distribute="hoist-uniform distribute-transfer-write" \
 // RUN:   -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN:  -gpu-lower-to-nvvm | \
+// RUN:  -gpu-lower-to-nvvm-pipeline | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_cuda_runtime \
 // RUN:   -shared-libs=%mlir_c_runner_utils \
@@ -23,7 +23,7 @@
 // RUN: mlir-opt %s  -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" \
 // RUN:   -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
 // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
-// RUN:  -gpu-lower-to-nvvm | \
+// RUN:  -gpu-lower-to-nvvm-pipeline | \
 // RUN: mlir-cpu-runner -e main -entry-point-result=void \
 // RUN:   -shared-libs=%mlir_cuda_runtime \
 // RUN:   -shared-libs=%mlir_c_runner_utils \
 
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s \
 // RUN:  -transform-interpreter \
 // RUN:  -test-transform-dialect-erase-schedule \
-// RUN:  -gpu-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%gpu_compilation_format" \
+// RUN:  -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 
@@ -11,7 +11,7 @@
 // RUN: mlir-opt %s \
 // RUN:   -transform-interpreter \
 // RUN:   -test-transform-dialect-erase-schedule \
-// RUN:   -gpu-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%gpu_compilation_format" \
+// RUN:   -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 
@@ -3,7 +3,7 @@
 // Similar to the wmma-matmul-f32 but but with the memref bare pointer lowering convention.
 // This test also uses gpu.memcpy operations (instead of gpu.host_register).
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --entry-point-result=void \
 
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
@@ -8,7 +8,7 @@
 
 // Same as above but with the memref bare pointer lowering convention.
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm="kernel-bare-ptr-calling-convention=1 cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="kernel-bare-ptr-calling-convention=1 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -gpu-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \