intel · akroviakov · Jan 21, 2025 · Jan 13, 2025 · Jan 16, 2025 · Jan 20, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,5 +1,5 @@
 ################################################################################
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2025 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -45,7 +45,8 @@ option(GC_ENABLE_TEST_DNNL_API "Build the dnnl tests" ${GC_ENABLE_DNNL_API})
 option(GC_ENABLE_TEST_MLIR "Build the mlir tests" ON)
 option(GC_ENABLE_TOOLS "Build the tools" ON)
 option(GC_ENABLE_OPT "Build gc-opt" ${GC_ENABLE_TOOLS})
-option(GC_ENABLE_IMEX "Enable Intel® Extension for MLIR" OFF)
+option(GC_ENABLE_IMEX "Enable Intel® Extension for MLIR (implicitly enables GPU compilation)" OFF)
+option(GC_ENABLE_GPU "Enable GPU runtime and tools components" OFF)
 option(GC_ENABLE_BINDINGS_PYTHON "Enable Graph Complier Python Binding" ON)
 option(GC_DEV_LINK_LLVM_DYLIB "Link dynamic libraries of LLVM and MLIR. For developers only. Do not use it in packing the library." OFF)
 option(GC_ENABLE_RUNTIME_NAIVE_BRGEMM "Use naive BRGEMM as runtime backend for debug purpose." OFF)
@@ -55,6 +56,10 @@ if(GC_ENABLE_LEGACY)
   add_subdirectory(legacy/core)
 endif()
 
+if (GC_ENABLE_GPU)
+  set(GC_ENABLE_GPU ON)  # normalize any truthy spelling to the literal ON, as done for GC_ENABLE_IMEX below
+endif()
+
 if (GC_ENABLE_IMEX)
   # normalize the value for lit config
   set(GC_ENABLE_IMEX ON)
@@ -70,6 +75,9 @@ endif()
 ############################## Targets #########################################
 # All common options, includes etc. are added to this interface target.
 add_library(GcInterface INTERFACE)
+if (GC_ENABLE_GPU)  # expose GC_USE_GPU to every target that links GcInterface
+  target_compile_definitions(GcInterface INTERFACE GC_USE_GPU)
+endif()
 target_compile_features(GcInterface INTERFACE cxx_std_17)
 target_include_directories(GcInterface INTERFACE
   $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>

diff --git a/README.md b/README.md
@@ -76,5 +76,6 @@ Graph Compiler supports the following build-time options.
 | GC_ENABLE_TEST            | **ON**, OFF                            | Controls building the tests                                     |
 | GC_DEV_LINK_LLVM_DYLIB    | ON, **OFF**                            | Controls dynamic link LLVM/MLIR libraries, mainly for developer |
 | GC_ENABLE_BINDINGS_PYTHON | **ON**, OFF                            | Controls building the Python API                                |
-| GC_ENABLE_IMEX            | ON, **OFF**                            | Whether to enable the GPU components                            |
+| GC_ENABLE_IMEX            | ON, **OFF**                            | Whether to enable the IMEX components                           |
+| GC_ENABLE_GPU             | ON, **OFF**                            | Whether to enable the GPU runtime and tools components          |
 
diff --git a/include/gc/Conversion/Passes.h b/include/gc/Conversion/Passes.h
@@ -10,6 +10,7 @@
 #define GC_CONVERSION_PASSES_H
 
 #include "gc/Conversion/XeVMToLLVM/XeVMToLLVM.h"
+#include "mlir/Pass/Pass.h"
 
 namespace mlir {
 

diff --git a/include/gc/Dialect/LLVMIR/XeVMOps.td b/include/gc/Dialect/LLVMIR/XeVMOps.td
@@ -221,6 +221,76 @@ def XeVM_BlockPrefetch2dOp : XeVM_Op<"blockprefetch2d">,
   let hasVerifier = 1;
 }
 
+def XeVM_MatrixElemType : AnyTypeOf<[AnyI8, AnyI16, AnyI32, F32, F16, BF16]>;
+
+/// Precision of a `xevm.dpas` matrix operand (type of the op's `pa`/`pb` attributes).
+def XeVM_PrecisionTypeAttr : I32EnumAttr<"PrecisionType",
+  "XeVM precision type",
+  [
+    I32EnumAttrCase<"UNUSED", 0,  "unused">,
+    I32EnumAttrCase<"U8",     1,  "u8">,
+    I32EnumAttrCase<"U4",     2,  "u4">,
+    I32EnumAttrCase<"U2",     3,  "u2">,
+    I32EnumAttrCase<"S8",     4,  "i8">,
+    I32EnumAttrCase<"S4",     5,  "i4">,
+    I32EnumAttrCase<"S2",     6,  "i2">,
+    I32EnumAttrCase<"BF8",    7,  "bf8">,
+    I32EnumAttrCase<"TF32",   8,  "tf32">,
+    I32EnumAttrCase<"BF16",   9,  "bf16">,
+    I32EnumAttrCase<"FP16",   10, "f16">
+  ]> {
+  let cppNamespace = "::mlir::xevm";
+}
+
+def XeVM_DPASOp : XeVM_Op<"dpas">,
+  Results<(outs FixedVectorOf<[XeVM_MatrixElemType]>:$d)>,
+  Arguments<(ins
+    FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$c,
+    FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$a,
+    FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$b,
+    XeVM_PrecisionTypeAttr:$pa,
+    XeVM_PrecisionTypeAttr:$pb,
+    I32Attr:$rc
+  )> {
+
+  let summary = "Matrix multiply-add";
+
+  let description = [{
+    The `xevm.dpas` operation is a matrix multiplication plus accumulation:
+
+      D = C + A x B
+
+      where the A, B, C input matrices and the result D have shapes:
+        D : MxN
+        C : MxN
+        A : MxK
+        B : KxN
+
+        Shape restrictions:
+        M : must be 1, 2, 4, or 8
+        N : fixed execution size, must be 16
+        K : systolic_depth * OPS_PER_CHAN
+            OPS_PER_CHAN
+              1 : for TF32
+              2 : for 16-bit precision (BF, HF)
+              4 : for 8-bit precision (FP8, UB, B)
+              8 : for less-than-8-bit precision (U4/S4, U2/S2).
+
+            If systolic_depth is 8, K would be 8, 16, 32, or 64 (based on OPS_PER_CHAN).
+    $a, $b, $c, $d - matrix A, B, C, D, respectively
+    $pa, $pb - precision of matrix A and B, respectively
+    $rc - repeat count
+
+    Further restrictions as well as more details can be found here:
+    https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html
+  }];
+
+  let assemblyFormat = [{
+    operands ` ` `{` `pa` `=` $pa `,` `pb` `=` $pb `,` `rc` `=` $rc `}` attr-dict `:` functional-type(operands, results)
+  }];
+
+  // TODO: add a verifier (`let hasVerifier = 1;`) for the shape restrictions documented above.
+}
 
 def XeVM_TargetAttr : XeVM_Attr<"XeVMTarget", "target"> {
   let description = [{

diff --git a/include/gc/ExecutionEngine/Driver/Driver.h b/include/gc/ExecutionEngine/Driver/Driver.h
@@ -18,7 +18,7 @@ namespace mlir {
 class DialectRegistry;
 namespace gc {
 
-const DialectRegistry &initCompilerAndGetDialects();
+DialectRegistry &initCompilerAndGetDialects();
 
 // the pointers to XXXMemRefType
 using GeneralMemrefPtr = void *;

diff --git a/include/gc/Transforms/CMakeLists.txt b/include/gc/Transforms/CMakeLists.txt
@@ -1,8 +1,11 @@
 if(GC_ENABLE_DNNL_API)
     list(APPEND TABLEGEN_MACROS -DGC_HAS_ONEDNN_DIALECT)
 endif()
+if(GC_ENABLE_GPU OR GC_ENABLE_IMEX)  # IMEX builds need GC_USE_GPU too; one branch avoids appending it twice
+    list(APPEND TABLEGEN_MACROS -DGC_USE_GPU)
+endif()
 if(GC_ENABLE_IMEX)
     list(APPEND TABLEGEN_MACROS -DGC_USE_IMEX)
 endif()
 
 set(LLVM_TARGET_DEFINITIONS Passes.td)

diff --git a/include/gc/Transforms/Passes.h b/include/gc/Transforms/Passes.h
@@ -115,7 +115,6 @@ std::unique_ptr<Pass> createMergeAllocPass();
 void populateFrontendPasses(mlir::OpPassManager &);
 void populateCPUPipeline(mlir::OpPassManager &);
 
-#ifdef GC_USE_IMEX
 struct GPUPipelineOptions : PassPipelineOptions<GPUPipelineOptions> {
   Option<bool> isUsmArgs{
       *this, "is-usm-args",
@@ -136,6 +135,8 @@ struct GPUPipelineOptions : PassPipelineOptions<GPUPipelineOptions> {
       llvm::cl::init(false)};
 };
 void populateGPUPipeline(mlir::OpPassManager &, const GPUPipelineOptions &);
+#ifdef GC_USE_IMEX
+void populateIMEXPipeline(mlir::OpPassManager &, const GPUPipelineOptions &);
 #endif
 
 #define GEN_PASS_DECL

diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td
@@ -93,6 +93,21 @@ def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
                "DPAS register block sizes MxNxK">,
   ];
 }
+#endif // GC_USE_IMEX
+
+#ifdef GC_USE_GPU
+def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
+  let summary = "Convert the GPU operations to GpuOclRuntime calls.";
+  let description = [{
+    Convert the gpu alloc, dealloc, memcpy and launch operations to GpuOclRuntime calls.
+  }];
+  let options = [
+    Option<"callFinish", "call-finish", "bool",
+           /*default=*/"false",
+           "Call finish() after each kernel launch.">
+    ];
+}
+#endif // GC_USE_GPU
 
 def AddContextArg : Pass<"add-ctx-arg", "func::FuncOp"> {
   let summary = "Add a context argument.";
@@ -109,17 +124,6 @@ def AllocsToSLM : Pass<"allocs-to-slm", "func::FuncOp"> {
   ];
 }
 
-def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
-  let summary = "Convert the GPU operations to GpuOclRuntime calls.";
-  let description = [{
-    Convert the gpu alloc, dealloc, memcpy and launch operations to GpuOclRuntime calls.
-  }];
-  let options = [
-    Option<"callFinish", "call-finish", "bool",
-           /*default=*/"false",
-           "Call finish() after each kernel launch.">
-    ];
-}
 
 def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
   let summary = "GPU tiling and fusion path.";
@@ -185,7 +189,6 @@ def GpuXeVMAttachTarget: Pass<"xevm-attach-target", ""> {
   ];
 }
 
-#endif // GC_USE_IMEX
 
 def IterativeTilingAndFusion : Pass<"iterative-tiling-and-fusion",
                                         "func::FuncOp"> {

diff --git a/lib/gc/CAPI/CMakeLists.txt b/lib/gc/CAPI/CMakeLists.txt
@@ -4,9 +4,7 @@ set(GC_ALL_LIBS
   GcAnalysis
   MLIRCPURuntimeTransforms)
 
-if(GC_ENABLE_IMEX)
-  list(APPEND GC_ALL_LIBS GcGpuPasses)
-endif()
+list(APPEND GC_ALL_LIBS GcGpuPasses)
 
 add_mlir_public_c_api_library(GcCAPI
   Dialects.cpp