Skip to content

Commit 52164c4

Browse files
committed
Merge remote-tracking branch 'origin/main' into flash_attention
2 parents 226d0cd + de0376f commit 52164c4

File tree

86 files changed

+5555
-1226
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

86 files changed

+5555
-1226
lines changed

.github/dockerfiles/env/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ RUN echo "conda prefix after installing jupyterhub: $(du --human-readable --summ
7474
# install infractl, lit
7575
RUN set -e; \
7676
. ${CONDA_PREFIX}/etc/profile.d/conda.sh; \
77-
conda create --name python-3.9 python=3.9 ipykernel s3cmd dvc; \
77+
conda create --name python-3.9 python=3.9 ipykernel s3cmd dvc numpy; \
7878
conda activate python-3.9; \
7979
pip --no-cache-dir install infractl lit
8080

.github/dockerfiles/env/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22

33
REGISTRY=localhost:5000
4-
TAG=graph-compiler-env:0.0.11
4+
TAG=graph-compiler-env:0.0.13
55

66
set -e
77

.github/dockerfiles/env/entrypoint.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,4 @@ jupyter kernelspec remove -y python3 || true
5858
# Remove lost+found
5959
rm -rf lost+found
6060

61-
find ~/.conda -name '*singleuser'
62-
6361
exec jupyterhub-singleuser

.github/dockerfiles/runner/Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,14 @@ RUN set -ex; \
1010
curl -sSL https://cli.github.com/packages/githubcli-archive-keyring.gpg > /usr/share/keyrings/githubcli-archive-keyring.gpg; \
1111
apt-get update -y; \
1212
apt-get install -y --no-install-recommends --fix-missing \
13-
python3-pip \
13+
python3-pip python3-dev \
1414
cmake gcc g++ ninja-build git clang-format \
1515
gh \
16+
libomp-dev \
1617
; \
1718
rm -rf /var/lib/apt/lists/*
1819

19-
RUN pip install lit
20+
RUN pip install lit numpy
2021

2122
USER runner
2223
WORKDIR $HOME

.github/dockerfiles/runner/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22

33
REGISTRY=localhost:5000
4-
TAG=graph-compiler-runner:latest
4+
TAG=graph-compiler-runner:0.0.2
55

66
kubectl -n docker-registry port-forward svc/docker-registry 5000:5000 &
77

.github/workflows/build-llvm.yml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@ name: LLVM Build
22

33
on:
44
workflow_dispatch:
5+
push:
6+
paths:
7+
- cmake/llvm-version.txt
8+
- .github/workflows/build-llvm.yml
59

610
permissions: read-all
711

@@ -24,9 +28,10 @@ jobs:
2428

2529
- name: Build
2630
run: |
31+
python3 -m pip install -r mlir/python/requirements.txt
2732
mkdir llvm-install
28-
cmake -G Ninja llvm -B build -DCMAKE_INSTALL_PREFIX=llvm-install \
29-
-DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=true -DLLVM_ENABLE_PROJECTS="mlir" -DLLVM_TARGETS_TO_BUILD="X86" -DLLVM_INSTALL_UTILS=true -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DLLVM_INSTALL_GTEST=ON
33+
cmake -G Ninja llvm -B build -DCMAKE_INSTALL_PREFIX=llvm-install -DMLIR_ENABLE_BINDINGS_PYTHON=ON -DPython3_EXECUTABLE=$(which python3) \
34+
-DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=true -DLLVM_ENABLE_PROJECTS="mlir" -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD="SPIRV" -DLLVM_TARGETS_TO_BUILD="X86" -DLLVM_INSTALL_UTILS=true -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DLLVM_INSTALL_GTEST=ON
3035
cmake --build build --target install
3136
cd llvm-install
3237
tar -zcf ../llvm.tgz .

.github/workflows/build.yml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
name: Graph Compiler build
22

33
on:
4+
workflow_dispatch:
45
push:
56
branches:
67
- main
@@ -21,6 +22,22 @@ jobs:
2122
steps:
2223
- uses: actions/checkout@v4
2324

25+
- name: Set LLVM hash
26+
run: |
27+
echo LLVM_HASH=$(cat cmake/llvm-version.txt) >>$GITHUB_ENV
28+
29+
- name: Fetch requirements for python binding
30+
uses: actions/checkout@v4
31+
with:
32+
repository: llvm/llvm-project
33+
ref: ${{ env.LLVM_HASH }}
34+
sparse-checkout: mlir/python/requirements.txt
35+
sparse-checkout-cone-mode: false
36+
path: llvm-dep
37+
38+
- name: Install requirements
39+
run: python3 -m pip install -r llvm-dep/mlir/python/requirements.txt
40+
2441
- name: Build
2542
run: |
2643
scripts/compile.sh

.github/workflows/clang-tidy.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ jobs:
6262
- name: Get changed files
6363
run: |
6464
cd graph-compiler
65-
echo "CHANGED_FILES=$(git diff --name-only $MERGE_BASE ${{ github.event.pull_request.head.sha }} | paste -sd' ')" >> $GITHUB_ENV
65+
echo "CHANGED_FILES=$(git diff --name-only --diff-filter=d $MERGE_BASE ${{ github.event.pull_request.head.sha }} | paste -sd' ')" >> $GITHUB_ENV
6666
6767
- name: Prepare Environment
6868
shell: bash
@@ -102,4 +102,4 @@ jobs:
102102
shell: bash
103103
run: |
104104
cd build
105-
python3 ../llvm-project/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py -warnings-as-errors=* -p ./ -config-file ../llvm-project/mlir/.clang-tidy -clang-tidy-binary $(which clang-tidy) ${{ env.CHANGED_FILES }}
105+
python3 ../llvm-project/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py -warnings-as-errors=* -p ./ -config-file ../llvm-project/mlir/.clang-tidy -clang-tidy-binary $(which clang-tidy-15) ${{ env.CHANGED_FILES }}

.github/workflows/license.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ jobs:
2626
2727
- name: Get changed files
2828
run: |
29-
echo "CHANGED_FILES=`git diff --name-only $MERGE_BASE ${{ github.event.pull_request.head.sha }} | paste -sd,`" >> $GITHUB_ENV
29+
echo "CHANGED_FILES=`git diff --name-only --diff-filter=d $MERGE_BASE ${{ github.event.pull_request.head.sha }} | paste -sd,`" >> $GITHUB_ENV
3030
3131
- name: Perform license check
3232
run: "python scripts/license.py --files $CHANGED_FILES"

CMakeLists.txt

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ project(GraphCompiler VERSION "0.1.0" LANGUAGES C CXX)
2020

2121
set(CMAKE_CXX_STANDARD 17)
2222
set(CMAKE_CXX_STANDARD_REQUIRED ON)
23+
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0")
2324

2425
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
2526

2627
list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
2728

2829
option(GC_LEGACY_ENABLE ON)
2930
option(GC_TEST_ENABLE "Build the tests" ON)
31+
option(GC_USE_GPU "Enable GPU backend" OFF)
3032
option(GC_ENABLE_BINDINGS_PYTHON "Enable Graph Complier Python Binding" ON)
3133
option(GC_DEV_LINK_LLVM_DYLIB "Link dynamic libraries of LLVM and MLIR. For developers only. Do not use it in packing the library." OFF)
3234

@@ -51,6 +53,13 @@ include(AddLLVM)
5153
include(AddMLIR)
5254
include(HandleLLVMOptions)
5355

56+
if(GC_USE_GPU)
57+
include(imex)
58+
if(GC_DEV_LINK_LLVM_DYLIB)
59+
message(WARN "GPU backend may not be compatible with dynamic linking to LLVM")
60+
endif()
61+
endif()
62+
5463
if(GC_ENABLE_BINDINGS_PYTHON AND NOT MLIR_ENABLE_BINDINGS_PYTHON)
5564
message(STATUS "Failed to enable Python API due to the 'MLIR_ENABLE_BINDINGS_PYTHON' for LLVM is not ON.")
5665
set(GC_ENABLE_BINDINGS_PYTHON OFF CACHE BOOL "" FORCE)
@@ -95,13 +104,12 @@ if(GC_ENABLE_BINDINGS_PYTHON)
95104
endif()
96105

97106
set(GC_LIB_LINKED_LIBS
98-
GCPasses
99-
GCAnalysis
100-
MLIROneDNNGraph
107+
GCJitWrapper
108+
GCCpuRuntime
101109
)
102-
add_library(graph_compiler SHARED ${GC_LIB_SOURCES})
110+
add_mlir_library(graph_compiler SHARED ${GC_LIB_SOURCES})
103111
target_include_directories(graph_compiler PUBLIC ${GC_LIB_INCLUDES})
104-
target_compile_options(graph_compiler PRIVATE -fvisibility=hidden)
112+
target_compile_options(graph_compiler PRIVATE -fvisibility=hidden -fexceptions)
105113
target_link_options(graph_compiler PRIVATE -Wl,--gc-sections)
106114
target_link_libraries(graph_compiler PRIVATE ${GC_LIB_LINKED_LIBS})
107115

README.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@ cmake --build build --target install
3232
```
3333

3434
Notes
35-
* It is recommended to add optional options `-DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_LINK_LLVM_DYLIB=ON` to the command `cmake -G Ninja llvm ...` above. These will enable the build of LLVM/MLIR dynamic libraries and let MLIR/LLVM tools link to them, to reduce the installed binary size of LLVM/MLIR. These options also enable the `GC_DEV_LINK_LLVM_DYLIB` option of graph-compiler repo (see below).
35+
* It is recommended to add optional options `-DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_LINK_LLVM_DYLIB=ON` to the command `cmake -G Ninja llvm ...` above **if you are building for CPU only**. These will enable the build of LLVM/MLIR dynamic libraries and let MLIR/LLVM tools link to them, to reduce the installed binary size of LLVM/MLIR. These options also enable the `GC_DEV_LINK_LLVM_DYLIB` option of graph-compiler repo (see below).
3636
* The option `-DLLVM_INSTALL_GTEST=ON` is optional, if the tests of graph-compiler are disabled (see `GC_TEST_ENABLE` below).
37+
* If you would like to enable GPU components of Graph Compiler, please make sure to statically link Graph Compiler and LLVM(MLIR). It is a known issue that LLVM shared library cannot be linked together with IGC (Intel's low level GPU compiler). Make sure `LLVM_BUILD_LLVM_DYLIB` and `LLVM_LINK_LLVM_DYLIB` are `OFF` (they are off by default). Also make sure Graph Compiler's cmake option `GC_DEV_LINK_LLVM_DYLIB` is `OFF` when configuring Graph Compiler (see below).
3738

3839
We have now installed LLVM at `llvm-project/llvm-install`.
3940

@@ -58,6 +59,15 @@ Notes:
5859
* `/PATH/TO/llvm-project/llvm-install` should be the install path of LLVM. If you installed LLVM elsewhere by `-DCMAKE_INSTALL_PREFIX` option when building LLVM, you need to change the path in `-DMLIR_DIR` accordingly.
5960
* The cmake option `-DLLVM_EXTERNAL_LIT` is for the tests of this project. It requires the `lit` tool to be installed in the system. You can install it via `pip install lit`. If you don't need to run the tests of this repo, you can omit this option in the command line.
6061

62+
More notes if GPU components are on (`-DGC_USE_GPU=ON`):
63+
* make sure the OpenCL runtime is installed in your system. You can either
64+
install using OS-provided package (Ubuntu 22.04)
65+
```sh
66+
sudo apt install -y intel-opencl-icd opencl-c-headers
67+
```
68+
Or, download and install package from: https://github.com/intel/compute-runtime/releases
69+
* the LLVM codebase needs to be patched to support XeGPU lowering (from IMEX). Please follow instructions of [IMEX](https://github.com/intel/mlir-extensions) on patching LLVM.
70+
6171
Graph Compiler supports the following build-time options.
6272

6373
| CMake Option | Supported values (defaults in bold) | Description |
@@ -66,4 +76,5 @@ Graph Compiler supports the following build-time options.
6676
| GC_TEST_ENABLE | **ON**, OFF | Controls building the tests |
6777
| GC_DEV_LINK_LLVM_DYLIB | ON, **OFF** | Controls dynamic link LLVM/MLIR libraries, mainly for developer |
6878
| GC_ENABLE_BINDINGS_PYTHON | **ON**, OFF | Controls building the Python API |
79+
| GC_USE_GPU | ON, **OFF** | Whether to enable the GPU components |
6980

cmake/functions.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ function(gc_fetch_content
3131
FetchContent_Declare(
3232
${name}
3333
SOURCE_DIR ${GC_${uname}_SRC_DIR}
34-
CMAKE_ARGS ${${uname}_CMAKE_ARGS}
34+
CMAKE_ARGS ${GC_${uname}_CMAKE_ARGS}
3535
)
3636
else ()
3737
if (DEFINED GC_${uname}_VERSION)

cmake/imex.cmake

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
include_guard()
2+
3+
get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
4+
if (NOT DEFINED IMEX_INCLUDES)
5+
include(functions)
6+
set(IMEX_CHECK_LLVM_VERSION ON)
7+
set(IMEX_ENABLE_L0_RUNTIME 0)
8+
# TODO: Change to main https://github.com/oneapi-src/oneDNN.git when all the
9+
# required functionality is merged.
10+
gc_fetch_content(imex 496b240093b5e132b60c5ee69878300fe69be300 https://github.com/Menooker/mlir-extensions
11+
CMAKE_ARGS "-DMLIR_DIR=${MLIR_DIR};-DIMEX_CHECK_LLVM_VERSION=ON;-DIMEX_ENABLE_L0_RUNTIME=0"
12+
)
13+
14+
set(IMEX_INCLUDES
15+
${imex_BINARY_DIR}/include
16+
${imex_SOURCE_DIR}/include
17+
${imex_SOURCE_DIR}/src
18+
)
19+
set_property(GLOBAL PROPERTY IMEX_INCLUDES ${IMEX_INCLUDES})
20+
endif ()

cmake/llvm-version.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
37661a17e26d9002ae9ade8c0de3932c22f16360
1+
89946bda5e1c7ceaf6d26634cc8c8c9498d9f7be

docs/CPU_pipeline_overview.md

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Graph Compiler CPU Compilation Flow Overview
2+
3+
Graph Compiler is an MLIR based end-to-end DL compiler. The entire compilation process is divided into front-end, middle-end and back-end. Different compilation stages will use different combinations of dialects, and together with various transformation passes to perform various optimizations and graph lowering transformations. The entire process will transform IR from hardware-independent abstract expression to hardware-related concrete expression, and finally generate an executable kernel.
4+
5+
Meanwhile, as an MLIR downstream project, Graph Compiler's implementation not only uses the existing dialects and passes from upstream MLIR, but also defines new dialects and passes. Most of the new implementations are upstreamable, and we will do so in the future.
6+
7+
The content introduced in this document does not reflect the currently implemented status, but rather the target status after the implementation is completed.
8+
9+
### Front-End
10+
11+
The Graph Compiler front-end takes the oneDNN Graph dialect as input. oneDNN Graph dialect is a newly defined dialect, which aims to describe the computation graph defined by oneDNN Graph. The ops in the dialect follow the [oneDNN Graph specification](https://oneapi-src.github.io/oneDNN/graph_supported_operations.html).
12+
13+
oneDNN graph dialect example:
14+
15+
```mlir
16+
func.func @mlp(%in: tensor<128x512xbf16>,
17+
%weight0: tensor<512x256xbf16>, %bias0: tensor<256xbf16>) -> tensor<128x256xbf16> {
18+
// layer 0
19+
%0 = onednn_graph.matmul %in, %weight0, %bias0 : (tensor<128x512xbf16>, tensor<512x256xbf16>, tensor<256xbf16>) -> tensor<128x256xbf16>
20+
%1 = onednn_graph.relu %0 : (tensor<128x256xbf16>) -> tensor<128x256xbf16>
21+
return %1 : tensor<128x256xbf16>
22+
}
23+
```
24+
25+
There are no planned optimization passes in the front-end. The only transformation pass lowers the oneDNN Graph dialect into the Linalg dialect.
26+
27+
### Middle-End
28+
29+
Middle-end is mainly responsible for general optimizations that are independent of the target hardware, and most of the transformations apply to both CPU and GPU. Some of the transformations need to query the target hardware information, such as cache level and capacity. The hardware abstraction layer (HAL) is the interface for abstracting and describing the target hardware information. Therefore, the same pass can generate different optimization results for different hardware under the guidance of HAL.
30+
31+
According to the different dialect combinations used, middle-end is divided into the following stages:
32+
33+
#### Linalg on Tensor
34+
35+
This is the intermediate representation closest to the framework calculation graph. The example IR looks like:
36+
37+
```mlir
38+
func.func @mlp(%in: tensor<128x512xbf16>,
39+
%weight0: tensor<512x256xbf16>, %bias0: tensor<256xbf16>) -> tensor<128x256xbf16> {
40+
%0 = tensor.empty() : tensor<128x256xbf16>
41+
%cst = arith.constant 0.000000e+00 : bf16
42+
%1 = linalg.fill ins(%cst : bf16) outs(%0 : tensor<128x256xbf16>) -> tensor<128x256xbf16>
43+
%2 = linalg.matmul ins(%in, %weight0 : tensor<128x512xbf16>, tensor<512x256xbf16>) outs(%1 : tensor<128x256xbf16>) -> tensor<128x256xbf16>
44+
%3 = tensor.empty() : tensor<128x256xbf16>
45+
%broadcasted = linalg.broadcast ins(%bias0 : tensor<256xbf16>) outs(%3 : tensor<128x256xbf16>) dimensions = [0]
46+
%4 = tensor.empty() : tensor<128x256xbf16>
47+
%5 = linalg.add ins(%2, %broadcasted : tensor<128x256xbf16>, tensor<128x256xbf16>) outs(%4: tensor<128x256xbf16>) -> tensor<128x256xbf16>
48+
%6 = tensor.empty() : tensor<128x256xbf16>
49+
%7 = linalgx.relu ins(%5 : tensor<128x256xbf16>) outs(%6 : tensor<128x256xbf16>) -> tensor<128x256xbf16>
50+
return %7 : tensor<128x256xbf16>
51+
}
52+
```
53+
54+
In this stage, GC will perform some analysis and transformation related to the whole graph. The main transformations include:
55+
56+
* Padding propagation : insert tensor.pad op to adjust tensor shape if the shape is not divisible by the target tiling size.
57+
* Layout propagation : insert tensor.pack and tensor.unpack to adjust tensor layout if blocking layout is preferred.
58+
* Tensor constant propagation : identify folding with constant tensor and build folding block.
59+
* Matmul lowering : lower Linalg.matmul into scf.forall with linalg.batch_reduce_matmul.
60+
* Fine-grain fusion: fuse element-wise/broadcast/reduce/movement ops into base op(e.g. matmul).
61+
* Lower linalg to arith/math on virtual vector : lower Linalg to Arith/Math and tiling tensor to virtual vector.
62+
63+
### Tensor and scf loop with arith/math on virtual vector
64+
65+
In this stage, most of the Linalg ops are lowered to Scf loops with Arith and Math ops. Both Arith and Math ops use tile tensors as input and output. A tile tensor here can be a multi-dimensional tensor of any shape, regardless of the hardware register width. The tile size is chosen based on L1 cache capacity; that is, it is a good abstraction to partition the problem size at this granularity, since the microkernel, pre-op, and post-op all work on tensor sizes that fit within the L1 cache. Meanwhile, converting Linalg into Arith and Math can further expose the implementation details of Linalg ops, which allows us to further simplify the computation after fusion.
66+
67+
IR example:
68+
69+
```mlir
70+
func.func @add_tensor(%arg0: tensor<4x8x31xf32>, %arg1: tensor<4x8x31xf32>) -> tensor<4x8x31xf32> {
71+
%0 = tensor.empty() : tensor<4x8x31xf32>
72+
%init = arith.constant 0: index
73+
%c1 = arith.constant 1: index
74+
%first_dim = arith.constant 4: index
75+
%second_dim = arith.constant 8: index
76+
// assume our tile shape is [31]
77+
%third_dim = arith.constant 31: index
78+
scf.for %c5 = %init to %first_dim step %c1 {
79+
scf.for %c6 = %init to %second_dim step %c1 {
80+
scf.for %c7 = %init to %third_dim step %c1 {
81+
%1 = vector.transfer_read %args0[%c5,%c6,%c7] {permutation_map = affine_map<() -> ()>} : tensor<31xf32>, vector<31xf32>
82+
%2 = vector.transfer_read %args0[%c5,%c6,%c7] {permutation_map = affine_map<() -> ()>} : tensor<31xf32>, vector<31xf32>
83+
%3 = arith.add %1, %2 : vector<31xf32>
84+
vector.transfer_write %3, %0[%c5, %c6, %c7] : vector<31xf32>, tensor<31xf32>
85+
}
86+
}
87+
}
88+
return %0: tensor<4x8x31xf32>
89+
}
90+
```
91+
92+
The main transformations in this stage include:
93+
* Bfloat16 promotion and cast elimination : legalize the Arith and Math ops by inserting `arith.extf` and `arith.truncf` pairs if the target device doesn't support bfloat16 natively, and remove redundant `arith.extf`/`arith.truncf` pairs to improve performance and accuracy.
94+
* Lower to physical vector : Lower virtual vector to physical vector based on physical register width of target device.
95+
96+
### Back-End
97+
98+
Back-end is responsible for device dependent optimization. The use of dialect will vary with the target device. This document will focus on the backend implementation for CPU.
99+
100+
The implementation of BRGEMM is the key to CPU performance. In GC we plan to introduce two different implementations:
101+
102+
* The BRGEMM provided by a library, such as oneDNN. In order to better abstract and describe the kernel provided by the library, we introduced the microkernel dialect.
103+
104+
* The BRGEMM generated by MLIR. In this approach, The AMX dialect will be used to simplify tile config processing and optimization.
105+
106+
By default GC will use openmp dialect to handle task parallelism. But for better performance and support for non-openmp threadpools, we also introduced the CPURuntime dialect. This dialect also introduces some runtime function calls specifically designed for the CPU, such as thread-local memory allocator, which can improve performance on the CPU.
107+
108+
The main transformations are:
109+
* Memref lowering and scheduling : lower tensor dialect to memref dialect and perform memory related optimization including memory hoist and rescheduling.
110+
* Microkernel dialect and lowering : lower linalg.batch_reduce_matmul to microkernel dialect and further lower to a function call to dnnl brgemm, or an MLIR-based brgemm implementation.
111+
* Parallelcpu dialect and lowering : lower to the parallelcpu dialect for nested parallel loop support and other CPU runtime calls.
112+
113+
In the last step, everything will lower to LLVM dialect. We don't plan to introduce any transformation on LLVM dialect, just leverage the upstream implementation for this.

docs/dialect_overview.png

83.1 KB
Loading

include/gc-c/Passes.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ extern "C" {
2626

2727
#include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.capi.h.inc"
2828
#include "gc/Transforms/Passes.capi.h.inc"
29+
30+
MLIR_CAPI_EXPORTED void mlirRegisterAllGCPassesAndPipelines(void);
31+
2932
#ifdef __cplusplus
3033
}
3134
#endif

include/gc/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
add_subdirectory(Dialect)
2-
add_subdirectory(Transforms)
2+
add_subdirectory(Transforms)

include/gc/Dialect/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
add_subdirectory(CPURuntime)
22
add_subdirectory(OneDNNGraph)
33
add_subdirectory(Microkernel)
4-
add_subdirectory(Linalgx)
4+
add_subdirectory(Linalgx)

0 commit comments

Comments
 (0)