Skip to content

Commit 52164c4

Browse files
committed
Merge remote-tracking branch 'origin/main' into flash_attention
2 parents 226d0cd + de0376f commit 52164c4

File tree

86 files changed

+5555
-1226
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

86 files changed

+5555
-1226
lines changed

.github/dockerfiles/env/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ RUN echo "conda prefix after installing jupyterhub: $(du --human-readable --summ
7474
# install infractl, lit
7575
RUN set -e; \
7676
. ${CONDA_PREFIX}/etc/profile.d/conda.sh; \
77-
conda create --name python-3.9 python=3.9 ipykernel s3cmd dvc; \
77+
conda create --name python-3.9 python=3.9 ipykernel s3cmd dvc numpy; \
7878
conda activate python-3.9; \
7979
pip --no-cache-dir install infractl lit
8080

.github/dockerfiles/env/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22

33
REGISTRY=localhost:5000
4-
TAG=graph-compiler-env:0.0.11
4+
TAG=graph-compiler-env:0.0.13
55

66
set -e
77

.github/dockerfiles/env/entrypoint.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,4 @@ jupyter kernelspec remove -y python3 || true
5858
# Remove lost+found
5959
rm -rf lost+found
6060

61-
find ~/.conda -name '*singleuser'
62-
6361
exec jupyterhub-singleuser

.github/dockerfiles/runner/Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,14 @@ RUN set -ex; \
1010
curl -sSL https://cli.github.com/packages/githubcli-archive-keyring.gpg > /usr/share/keyrings/githubcli-archive-keyring.gpg; \
1111
apt-get update -y; \
1212
apt-get install -y --no-install-recommends --fix-missing \
13-
python3-pip \
13+
python3-pip python3-dev \
1414
cmake gcc g++ ninja-build git clang-format \
1515
gh \
16+
libomp-dev \
1617
; \
1718
rm -rf /var/lib/apt/lists/*
1819

19-
RUN pip install lit
20+
RUN pip install lit numpy
2021

2122
USER runner
2223
WORKDIR $HOME

.github/dockerfiles/runner/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22

33
REGISTRY=localhost:5000
4-
TAG=graph-compiler-runner:latest
4+
TAG=graph-compiler-runner:0.0.2
55

66
kubectl -n docker-registry port-forward svc/docker-registry 5000:5000 &
77

.github/workflows/build-llvm.yml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@ name: LLVM Build
22

33
on:
44
workflow_dispatch:
5+
push:
6+
paths:
7+
- cmake/llvm-version.txt
8+
- .github/workflows/build-llvm.yml
59

610
permissions: read-all
711

@@ -24,9 +28,10 @@ jobs:
2428

2529
- name: Build
2630
run: |
31+
python3 -m pip install -r mlir/python/requirements.txt
2732
mkdir llvm-install
28-
cmake -G Ninja llvm -B build -DCMAKE_INSTALL_PREFIX=llvm-install \
29-
-DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=true -DLLVM_ENABLE_PROJECTS="mlir" -DLLVM_TARGETS_TO_BUILD="X86" -DLLVM_INSTALL_UTILS=true -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DLLVM_INSTALL_GTEST=ON
33+
cmake -G Ninja llvm -B build -DCMAKE_INSTALL_PREFIX=llvm-install -DMLIR_ENABLE_BINDINGS_PYTHON=ON -DPython3_EXECUTABLE=$(which python3) \
34+
-DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=true -DLLVM_ENABLE_PROJECTS="mlir" -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD="SPIRV" -DLLVM_TARGETS_TO_BUILD="X86" -DLLVM_INSTALL_UTILS=true -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DLLVM_INSTALL_GTEST=ON
3035
cmake --build build --target install
3136
cd llvm-install
3237
tar -zcf ../llvm.tgz .

.github/workflows/build.yml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
name: Graph Compiler build
22

33
on:
4+
workflow_dispatch:
45
push:
56
branches:
67
- main
@@ -21,6 +22,22 @@ jobs:
2122
steps:
2223
- uses: actions/checkout@v4
2324

25+
- name: Set LLVM hash
26+
run: |
27+
echo LLVM_HASH=$(cat cmake/llvm-version.txt) >>$GITHUB_ENV
28+
29+
- name: Fetch requirements for python binding
30+
uses: actions/checkout@v4
31+
with:
32+
repository: llvm/llvm-project
33+
ref: ${{ env.LLVM_HASH }}
34+
sparse-checkout: mlir/python/requirements.txt
35+
sparse-checkout-cone-mode: false
36+
path: llvm-dep
37+
38+
- name: Install requirements
39+
run: python3 -m pip install -r llvm-dep/mlir/python/requirements.txt
40+
2441
- name: Build
2542
run: |
2643
scripts/compile.sh

.github/workflows/clang-tidy.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ jobs:
6262
- name: Get changed files
6363
run: |
6464
cd graph-compiler
65-
echo "CHANGED_FILES=$(git diff --name-only $MERGE_BASE ${{ github.event.pull_request.head.sha }} | paste -sd' ')" >> $GITHUB_ENV
65+
echo "CHANGED_FILES=$(git diff --name-only --diff-filter=d $MERGE_BASE ${{ github.event.pull_request.head.sha }} | paste -sd' ')" >> $GITHUB_ENV
6666
6767
- name: Prepare Environment
6868
shell: bash
@@ -102,4 +102,4 @@ jobs:
102102
shell: bash
103103
run: |
104104
cd build
105-
python3 ../llvm-project/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py -warnings-as-errors=* -p ./ -config-file ../llvm-project/mlir/.clang-tidy -clang-tidy-binary $(which clang-tidy) ${{ env.CHANGED_FILES }}
105+
python3 ../llvm-project/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py -warnings-as-errors=* -p ./ -config-file ../llvm-project/mlir/.clang-tidy -clang-tidy-binary $(which clang-tidy-15) ${{ env.CHANGED_FILES }}

.github/workflows/license.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ jobs:
2626
2727
- name: Get changed files
2828
run: |
29-
echo "CHANGED_FILES=`git diff --name-only $MERGE_BASE ${{ github.event.pull_request.head.sha }} | paste -sd,`" >> $GITHUB_ENV
29+
echo "CHANGED_FILES=`git diff --name-only --diff-filter=d $MERGE_BASE ${{ github.event.pull_request.head.sha }} | paste -sd,`" >> $GITHUB_ENV
3030
3131
- name: Perform license check
3232
run: "python scripts/license.py --files $CHANGED_FILES"

CMakeLists.txt

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ project(GraphCompiler VERSION "0.1.0" LANGUAGES C CXX)
2020

2121
set(CMAKE_CXX_STANDARD 17)
2222
set(CMAKE_CXX_STANDARD_REQUIRED ON)
23+
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0")
2324

2425
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
2526

2627
list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
2728

2829
option(GC_LEGACY_ENABLE ON)
2930
option(GC_TEST_ENABLE "Build the tests" ON)
31+
option(GC_USE_GPU "Enable GPU backend" OFF)
3032
option(GC_ENABLE_BINDINGS_PYTHON "Enable Graph Complier Python Binding" ON)
3133
option(GC_DEV_LINK_LLVM_DYLIB "Link dynamic libraries of LLVM and MLIR. For developers only. Do not use it in packing the library." OFF)
3234

@@ -51,6 +53,13 @@ include(AddLLVM)
5153
include(AddMLIR)
5254
include(HandleLLVMOptions)
5355

56+
if(GC_USE_GPU)
57+
include(imex)
58+
if(GC_DEV_LINK_LLVM_DYLIB)
59+
message(WARN "GPU backend may not be compatible with dynamic linking to LLVM")
60+
endif()
61+
endif()
62+
5463
if(GC_ENABLE_BINDINGS_PYTHON AND NOT MLIR_ENABLE_BINDINGS_PYTHON)
5564
message(STATUS "Failed to enable Python API due to the 'MLIR_ENABLE_BINDINGS_PYTHON' for LLVM is not ON.")
5665
set(GC_ENABLE_BINDINGS_PYTHON OFF CACHE BOOL "" FORCE)
@@ -95,13 +104,12 @@ if(GC_ENABLE_BINDINGS_PYTHON)
95104
endif()
96105

97106
set(GC_LIB_LINKED_LIBS
98-
GCPasses
99-
GCAnalysis
100-
MLIROneDNNGraph
107+
GCJitWrapper
108+
GCCpuRuntime
101109
)
102-
add_library(graph_compiler SHARED ${GC_LIB_SOURCES})
110+
add_mlir_library(graph_compiler SHARED ${GC_LIB_SOURCES})
103111
target_include_directories(graph_compiler PUBLIC ${GC_LIB_INCLUDES})
104-
target_compile_options(graph_compiler PRIVATE -fvisibility=hidden)
112+
target_compile_options(graph_compiler PRIVATE -fvisibility=hidden -fexceptions)
105113
target_link_options(graph_compiler PRIVATE -Wl,--gc-sections)
106114
target_link_libraries(graph_compiler PRIVATE ${GC_LIB_LINKED_LIBS})
107115

README.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@ cmake --build build --target install
3232
```
3333

3434
Notes
35-
* It is recommended to add optional options `-DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_LINK_LLVM_DYLIB=ON` to the command `cmake -G Ninja llvm ...` above. These will enable the build of LLVM/MLIR dynamic libraries and let MLIR/LLVM tools link to them, to reduce the installed binary size of LLVM/MLIR. These options also enable the `GC_DEV_LINK_LLVM_DYLIB` option of graph-compiler repo (see below).
35+
* It is recommended to add optional options `-DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_LINK_LLVM_DYLIB=ON` to the command `cmake -G Ninja llvm ...` above **if you are building for CPU only**. These will enable the build of LLVM/MLIR dynamic libraries and let MLIR/LLVM tools link to them, to reduce the installed binary size of LLVM/MLIR. These options also enable the `GC_DEV_LINK_LLVM_DYLIB` option of graph-compiler repo (see below).
3636
* The option `-DLLVM_INSTALL_GTEST=ON` is optional, if the tests of graph-compiler are disabled (see `GC_TEST_ENABLE` below).
37+
* If you would like to enable GPU components of Graph Compiler, please make sure to statically link Graph Compiler and LLVM(MLIR). It is a known issue that LLVM shared library cannot be linked together with IGC (Intel's low level GPU compiler). Make sure `LLVM_BUILD_LLVM_DYLIB` and `LLVM_LINK_LLVM_DYLIB` are `OFF` (they are off by default). Also make sure Graph Compiler's cmake option `GC_DEV_LINK_LLVM_DYLIB` is `OFF` when configuring Graph Compiler (see below).
3738

3839
We have now installed LLVM at `llvm-project/llvm-install`.
3940

@@ -58,6 +59,15 @@ Notes:
5859
* `/PATH/TO/llvm-project/llvm-install` should be the install path of LLVM. If you installed LLVM elsewhere by `-DCMAKE_INSTALL_PREFIX` option when building LLVM, you need to change the path in `-DMLIR_DIR` accordingly.
5960
* The cmake option `-DLLVM_EXTERNAL_LIT` is for the tests of this project. It requires the `lit` tool to be installed in the system. You can install it via `pip install lit`. If you don't need to run the tests of this repo, you can omit this option in the command line.
6061

62+
More notes if GPU components are on (`-DGC_USE_GPU=ON`):
63+
* make sure the OpenCL runtime is installed in your system. You can either
64+
install using OS-provided package (Ubuntu 22.04)
65+
```sh
66+
sudo apt install -y intel-opencl-icd opencl-c-headers
67+
```
68+
Or, download and install package from: https://github.com/intel/compute-runtime/releases
69+
* the LLVM codebase needs to be patched to support XeGPU lowering (from IMEX). Please follow instructions of [IMEX](https://github.com/intel/mlir-extensions) on patching LLVM.
70+
6171
Graph Compiler supports the following build-time options.
6272

6373
| CMake Option | Supported values (defaults in bold) | Description |
@@ -66,4 +76,5 @@ Graph Compiler supports the following build-time options.
6676
| GC_TEST_ENABLE | **ON**, OFF | Controls building the tests |
6777
| GC_DEV_LINK_LLVM_DYLIB | ON, **OFF** | Controls dynamic link LLVM/MLIR libraries, mainly for developer |
6878
| GC_ENABLE_BINDINGS_PYTHON | **ON**, OFF | Controls building the Python API |
79+
| GC_USE_GPU | ON, **OFF** | Whether to enable the GPU components |
6980

cmake/functions.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ function(gc_fetch_content
3131
FetchContent_Declare(
3232
${name}
3333
SOURCE_DIR ${GC_${uname}_SRC_DIR}
34-
CMAKE_ARGS ${${uname}_CMAKE_ARGS}
34+
CMAKE_ARGS ${GC_${uname}_CMAKE_ARGS}
3535
)
3636
else ()
3737
if (DEFINED GC_${uname}_VERSION)

cmake/imex.cmake

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
include_guard()
2+
3+
get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
4+
if (NOT DEFINED IMEX_INCLUDES)
5+
include(functions)
6+
set(IMEX_CHECK_LLVM_VERSION ON)
7+
set(IMEX_ENABLE_L0_RUNTIME 0)
8+
# TODO: Change to main https://github.com/oneapi-src/oneDNN.git when all the
9+
# required functionality is merged.
10+
gc_fetch_content(imex 496b240093b5e132b60c5ee69878300fe69be300 https://github.com/Menooker/mlir-extensions
11+
CMAKE_ARGS "-DMLIR_DIR=${MLIR_DIR};-DIMEX_CHECK_LLVM_VERSION=ON;-DIMEX_ENABLE_L0_RUNTIME=0"
12+
)
13+
14+
set(IMEX_INCLUDES
15+
${imex_BINARY_DIR}/include
16+
${imex_SOURCE_DIR}/include
17+
${imex_SOURCE_DIR}/src
18+
)
19+
set_property(GLOBAL PROPERTY IMEX_INCLUDES ${IMEX_INCLUDES})
20+
endif ()

cmake/llvm-version.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
37661a17e26d9002ae9ade8c0de3932c22f16360
1+
89946bda5e1c7ceaf6d26634cc8c8c9498d9f7be

docs/CPU_pipeline_overview.md

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Graph Compiler CPU Compilation Flow Overview
2+
3+
Graph Compiler is an MLIR based end-to-end DL compiler. The entire compilation process is divided into front-end, middle-end and back-end. Different compilation stages will use different combinations of dialects, and together with various transformation passes to perform various optimizations and graph lowering transformations. The entire process will transform IR from hardware-independent abstract expression to hardware-related concrete expression, and finally generate an executable kernel.
4+
5+
Meanwhile, as an MLIR downstream project, Graph Compiler's implementation not only uses the existing dialects and passes from upstream MLIR, but also defines new dialects and passes. Most of the new implementations are upstreamable, and we will do so in the future.
6+
7+
The content introduced in this document does not reflect the currently implemented status, but rather the target status after the implementation is completed.
8+
9+
### Front-End
10+
11+
The Graph Compiler front-end takes the oneDNN Graph dialect as input. oneDNN Graph dialect is a newly defined dialect, which aims to describe the computation graph defined by oneDNN Graph. The ops in the dialect follow the [oneDNN Graph specification](https://oneapi-src.github.io/oneDNN/graph_supported_operations.html).
12+
13+
oneDNN graph dialect example:
14+
15+
```mlir
16+
func.func @mlp(%in: tensor<128x512xbf16>,
17+
%weight0: tensor<512x256xbf16>, %bias0: tensor<256xbf16>) -> tensor<128x256xbf16> {
18+
// layer 0
19+
%0 = onednn_graph.matmul %in, %weight0, %bias0 : (tensor<128x512xbf16>, tensor<512x256xbf16>, tensor<256xbf16>) -> tensor<128x256xbf16>
20+
%1 = onednn_graph.relu %0 : (tensor<128x256xbf16>) -> tensor<128x256xbf16>
21+
return %1 : tensor<128x256xbf16>
22+
}
23+
```
24+
25+
There are no planned optimization passes in the front-end. The only transformation pass lowers the oneDNN Graph dialect into the Linalg dialect.
26+
27+
### Middle-End
28+
29+
Middle-end is mainly responsible for general optimizations that are independent of the target hardware, and most of the transformations apply to both CPU and GPU. Some of the transformations need to query the target hardware information, such as cache level and capacity. The hardware abstraction layer (HAL) is the interface for abstracting and describing the target hardware information. Therefore, the same pass can generate different optimization results for different hardware under the guidance of HAL.
30+
31+
According to the different dialect combinations used, middle-end is divided into the following stages:
32+
33+
#### Linalg on Tensor
34+
35+
This is the intermediate representation closest to the framework calculation graph. The example IR looks like:
36+
37+
```mlir
38+
func.func @mlp(%in: tensor<128x512xbf16>,
39+
%weight0: tensor<512x256xbf16>, %bias0: tensor<256xbf16>) -> tensor<128x256xbf16> {
40+
%0 = tensor.empty() : tensor<128x256xbf16>
41+
%cst = arith.constant 0.000000e+00 : bf16
42+
%1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<128x256xbf16>) -> tensor<128x256xbf16>
43+
%2 = linalg.matmul ins(%in, %weight0 : tensor<128x512xbf16>, tensor<512x256xbf16>) outs(%1 : tensor<128x256xbf16>) -> tensor<128x256xbf16>
44+
%3 = tensor.empty() : tensor<128x256xbf16>
45+
%broadcasted = linalg.broadcast ins(%bias0 : tensor<256xbf16>) outs(%3 : tensor<128x256xbf16>) dimensions = [0]
46+
%4 = tensor.empty() : tensor<128x256xbf16>
47+
%5 = linalg.add ins(%2, %broadcasted : tensor<128x256xbf16>, tensor<128x256xbf16>) outs(%4: tensor<128x256xbf16>) -> tensor<128x256xbf16>
48+
%6 = tensor.empty() : tensor<128x256xbf16>
49+
%7 = linalgx.relu ins(%5 : tensor<128x256xbf16>) outs(%6 : tensor<128x256xbf16>) -> tensor<128x256xbf16>
50+
return %7 : tensor<128x256xbf16>
51+
}
52+
```
53+
54+
In this stage, GC will perform some analysis and transformation related to the whole graph. The main transformations include:
55+
56+
* Padding propagation : insert tensor.pad op to adjust tensor shape if the shape is not divisible by the target tiling size.
57+
* Layout propagation : insert tensor.pack and tensor.unpack to adjust tensor layout if blocking layout is preferred.
58+
* Tensor constant propagation : identify folding with constant tensor and build folding block.
59+
* Matmul lowering : lower Linalg.matmul into scf.forall with linalg.batch_reduce_matmul.
60+
* Fine-grain fusion: fuse element-wise/broadcast/reduce/movement ops into base op(e.g. matmul).
61+
* Lower linalg to arith/math on virtual vector : lower Linalg to Arith/Math and tiling tensor to virtual vector.
62+
63+
### Tensor and scf loop with arith/math on virtual vector
64+
65+
In this stage, most of the Linalg ops are lowered to Scf loops with Arith and Math ops. Both Arith and Math ops use tile tensors as input and output. A tile tensor here can be a multi-dimensional tensor of any shape, regardless of the hardware register width. The tile size is chosen based on L1 cache capacity; that is, it is a good abstraction to partition the problem size at this granularity, since the microkernel, pre-op, and post-op all work on tensor sizes that fit within the L1 cache. Meanwhile, converting Linalg into Arith and Math can further expose the implementation details of Linalg ops, which allows us to further simplify the computation after fusion.
66+
67+
IR example:
68+
69+
```mlir
70+
func.func @add_tensor(%arg0: tensor<4x8x31xf32>, %arg1: tensor<4x8x31xf32>) -> tensor<4x8x31xf32> {
71+
%0 = tensor.empty() : tensor<4x8x31xf32>
72+
%init = arith.constant 0: index
73+
%c1 = arith.constant 1: index
74+
%first_dim = arith.constant 4: index
75+
%second_dim = arith.constant 8: index
76+
// assume our tile shape is [31]
77+
%third_dim = arith.constant 31: index
78+
scf.for %c5 = %init to %first_dim step %c1 {
79+
scf.for %c6 = %init to %second_dim step %c1 {
80+
scf.for %c7 = %init to %third_dim step %c1 {
81+
%1 = vector.transfer_read %args0[%c5,%c6,%c7] {permutation_map = affine_map<() -> ()>} : tensor<31xf32>, vector<31xf32>
82+
%2 = vector.transfer_read %args0[%c5,%c6,%c7] {permutation_map = affine_map<() -> ()>} : tensor<31xf32>, vector<31xf32>
83+
%3 = arith.add %1, %2 : vector<31xf32>
84+
vector.transfer_write %3, %0[%c5, %c6, %c7] : vector<31xf32>, tensor<31xf32>
85+
}
86+
}
87+
}
88+
return %0: tensor<4x8x31xf32>
89+
}
90+
```
91+
92+
The main transformations in this stage include:
93+
* Bfloat16 promotion and cast elimination : legalize the Arith and Math ops by inserting `arith.extf` and `arith.truncf` pairs if the target device doesn't support bfloat16 natively, and remove redundant `arith.extf`/`arith.truncf` pairs to improve performance and accuracy.
94+
* Lower to physical vector : Lower virtual vector to physical vector based on physical register width of target device.
95+
96+
### Back-End
97+
98+
Back-end is responsible for device dependent optimization. The use of dialect will vary with the target device. This document will focus on the backend implementation for CPU.
99+
100+
The implementation of BRGEMM is the key to CPU performance. In GC we plan to introduce two different implementations:
101+
102+
* The BRGEMM provided by a library, such as oneDNN. In order to better abstract and describe the kernel provided by the library, we introduced the microkernel dialect.
103+
104+
* The BRGEMM generated by MLIR. In this approach, The AMX dialect will be used to simplify tile config processing and optimization.
105+
106+
By default GC will use openmp dialect to handle task parallelism. But for better performance and support for non-openmp threadpools, we also introduced the CPURuntime dialect. This dialect also introduces some runtime function calls specifically designed for the CPU, such as thread-local memory allocator, which can improve performance on the CPU.
107+
108+
The main transformations are:
109+
* Memref lowering and scheduling : lower tensor dialect to memref dialect and perform memory related optimization including memory hoist and rescheduling.
110+
* Microkernel dialect and lowering : lower linalg.batch_reduce_matmul to microkernel dialect and further lower to a function call to dnnl brgemm, or an MLIR-based brgemm implementation.
111+
* Parallelcpu dialect and lowering : lower to the parallelcpu dialect for nested parallel loop support and other CPU runtime calls.
112+
113+
In the last step, everything will lower to LLVM dialect. We don't plan to introduce any transformation on LLVM dialect, just leverage the upstream implementation for this.

docs/dialect_overview.png

83.1 KB
Loading

include/gc-c/Passes.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ extern "C" {
2626

2727
#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.capi.h.inc"
2828
#include "gc/Transforms/Passes.capi.h.inc"
29+
30+
MLIR_CAPI_EXPORTED void mlirRegisterAllGCPassesAndPipelines(void);
31+
2932
#ifdef __cplusplus
3033
}
3134
#endif

include/gc/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
add_subdirectory(Dialect)
2-
add_subdirectory(Transforms)
2+
add_subdirectory(Transforms)

include/gc/Dialect/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
add_subdirectory(CPURuntime)
22
add_subdirectory(OneDNNGraph)
33
add_subdirectory(Microkernel)
4-
add_subdirectory(Linalgx)
4+
add_subdirectory(Linalgx)

0 commit comments

Comments
 (0)