Skip to content

Commit 43fd244

Browse files
committed
Reland "[AMDGPU] Add AMDGPU-specific module splitting (#89245)"
(with fix for ubsan) This enables the --lto-partitions option to work more consistently. This module splitting logic is fully aware of AMDGPU modules and their specificities and takes advantage of them to split modules in a way that avoids compilation issues (such as resource usage being incorrectly represented). This also includes a logging system that's more elaborate than just LLVM_DEBUG which allows printing logs to uniquely named files, and optionally with all value names hidden so they can be safely shared without leaking information about the source. Logs can also be enabled through an environment variable, which avoids the sometimes complicated process of passing a -mllvm option all the way from the clang driver to the offload linker that handles full LTO codegen.
1 parent 2b78c64 commit 43fd244

21 files changed

+1560
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp

Lines changed: 744 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
//===- AMDGPUSplitModule.h -------------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef LLVM_TARGET_AMDGPUSPLITMODULE_H
12+
#define LLVM_TARGET_AMDGPUSPLITMODULE_H
13+
14+
#include "llvm/ADT/STLFunctionalExtras.h"
15+
#include <memory>
16+
17+
namespace llvm {
18+
19+
class Module;
20+
class AMDGPUTargetMachine;
21+
22+
/// Splits the module M into N linkable partitions. The function ModuleCallback
23+
/// is called N times passing each individual partition as the MPart argument.
24+
void splitAMDGPUModule(
25+
const AMDGPUTargetMachine &TM, Module &M, unsigned N,
26+
function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback);
27+
28+
} // end namespace llvm
29+
30+
#endif // LLVM_TARGET_AMDGPUSPLITMODULE_H

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "AMDGPUIGroupLP.h"
2222
#include "AMDGPUMacroFusion.h"
2323
#include "AMDGPURegBankSelect.h"
24+
#include "AMDGPUSplitModule.h"
2425
#include "AMDGPUTargetObjectFile.h"
2526
#include "AMDGPUTargetTransformInfo.h"
2627
#include "AMDGPUUnifyDivergentExitNodes.h"
@@ -815,6 +816,13 @@ AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
815816
return AMDGPUAS::FLAT_ADDRESS;
816817
}
817818

819+
bool AMDGPUTargetMachine::splitModule(
820+
Module &M, unsigned NumParts,
821+
function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) const {
822+
splitAMDGPUModule(*this, M, NumParts, ModuleCallback);
823+
return true;
824+
}
825+
818826
//===----------------------------------------------------------------------===//
819827
// GCN Target Machine (SI+)
820828
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
7373
getPredicatedAddrSpace(const Value *V) const override;
7474

7575
unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override;
76+
77+
bool splitModule(Module &M, unsigned NumParts,
78+
function_ref<void(std::unique_ptr<Module> MPart)>
79+
ModuleCallback) const override;
7680
};
7781

7882
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ add_llvm_target(AMDGPUCodeGen
9898
AMDGPURewriteOutArguments.cpp
9999
AMDGPURewriteUndefForPHI.cpp
100100
AMDGPUSetWavePriority.cpp
101+
AMDGPUSplitModule.cpp
101102
AMDGPUSubtarget.cpp
102103
AMDGPUTargetMachine.cpp
103104
AMDGPUTargetObjectFile.cpp
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0
2+
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
3+
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
4+
; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
5+
6+
; 3 kernels:
7+
; - A does a direct call to HelperA
8+
; - B is storing @HelperA
9+
; - C does a direct call to HelperA
10+
;
11+
; The helper functions will get externalized, which will force A and C into P0 as
12+
; external functions cannot be duplicated.
13+
14+
; CHECK0: define hidden void @HelperA()
15+
; CHECK0: define amdgpu_kernel void @A()
16+
; CHECK0: declare amdgpu_kernel void @B(ptr)
17+
; CHECK0: define amdgpu_kernel void @C()
18+
19+
; CHECK1: declare hidden void @HelperA()
20+
; CHECK1: declare amdgpu_kernel void @A()
21+
; CHECK1: declare amdgpu_kernel void @B(ptr)
22+
; CHECK1: declare amdgpu_kernel void @C()
23+
24+
; CHECK2: declare hidden void @HelperA()
25+
; CHECK2: declare amdgpu_kernel void @A()
26+
; CHECK2: define amdgpu_kernel void @B(ptr %dst)
27+
; CHECK2: declare amdgpu_kernel void @C()
28+
29+
define internal void @HelperA() {
30+
ret void
31+
}
32+
33+
define amdgpu_kernel void @A() {
34+
call void @HelperA()
35+
ret void
36+
}
37+
38+
define amdgpu_kernel void @B(ptr %dst) {
39+
store ptr @HelperA, ptr %dst
40+
ret void
41+
}
42+
43+
define amdgpu_kernel void @C() {
44+
call void @HelperA()
45+
ret void
46+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0
2+
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
3+
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
4+
5+
; 2 kernels:
6+
; - A is isolated
7+
; - B is storing @HelperA/B's address
8+
;
9+
; The helper functions should get externalized (become hidden w/ external linkage)
10+
11+
; CHECK0: define hidden void @HelperA()
12+
; CHECK0: define hidden void @HelperB()
13+
; CHECK0: define amdgpu_kernel void @A()
14+
; CHECK0: declare amdgpu_kernel void @B(i1, ptr)
15+
16+
; CHECK1: declare hidden void @HelperA()
17+
; CHECK1: declare hidden void @HelperB()
18+
; CHECK1: declare amdgpu_kernel void @A()
19+
; CHECK1: define amdgpu_kernel void @B(i1 %cond, ptr %dst)
20+
21+
define internal void @HelperA() {
22+
ret void
23+
}
24+
25+
define internal void @HelperB() {
26+
ret void
27+
}
28+
29+
define amdgpu_kernel void @A() {
30+
ret void
31+
}
32+
33+
define amdgpu_kernel void @B(i1 %cond, ptr %dst) {
34+
%addr = select i1 %cond, ptr @HelperA, ptr @HelperB
35+
store ptr %addr, ptr %dst
36+
ret void
37+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -debug -amdgpu-module-splitting-log-private 2>&1 | FileCheck %s --implicit-check-not=MyCustomKernel
2+
; REQUIRES: asserts
3+
4+
; SHA256 of the kernel names.
5+
6+
; CHECK: a097723d21cf9f35d90e6fb7881995ac8c398b3366a6c97efc657404f9fe301c
7+
; CHECK: 626bc23242de8fcfda7f0e66318d29455c081df6b5380e64d14703c95fcbcd59
8+
; CHECK: c38d90a7ca71dc5d694bb9e093dadcdedfc4cb4adf7ed7e46d42fe95a0b4ef55
9+
10+
define amdgpu_kernel void @MyCustomKernel0() {
11+
ret void
12+
}
13+
14+
define amdgpu_kernel void @MyCustomKernel1() {
15+
ret void
16+
}
17+
18+
define amdgpu_kernel void @MyCustomKernel2() {
19+
ret void
20+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa
2+
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
3+
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
4+
5+
; 3 kernels:
6+
; - A calls nothing
7+
; - B calls @PerryThePlatypus
8+
; - C calls @Perry, an alias of @PerryThePlatypus
9+
;
10+
; We should see through the alias and put B/C in the same
11+
; partition.
12+
;
13+
; Additionally, @PerryThePlatypus gets externalized as
14+
; the alias counts as taking its address.
15+
16+
; CHECK0-NOT: define
17+
; CHECK0: @Perry = internal alias ptr (), ptr @PerryThePlatypus
18+
; CHECK0: define hidden void @PerryThePlatypus()
19+
; CHECK0: define amdgpu_kernel void @B
20+
; CHECK0: define amdgpu_kernel void @C
21+
; CHECK0-NOT: define
22+
23+
; CHECK1-NOT: define
24+
; CHECK1: define amdgpu_kernel void @A
25+
; CHECK1-NOT: define
26+
27+
@Perry = internal alias ptr(), ptr @PerryThePlatypus
28+
29+
define internal void @PerryThePlatypus() {
30+
ret void
31+
}
32+
33+
define amdgpu_kernel void @A() {
34+
ret void
35+
}
36+
37+
define amdgpu_kernel void @B() {
38+
call void @PerryThePlatypus()
39+
ret void
40+
}
41+
42+
define amdgpu_kernel void @C() {
43+
call void @Perry()
44+
ret void
45+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
2+
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
3+
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
4+
; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
5+
6+
; 3 kernels, each with their own dependencies, should go into 3
7+
; distinct partitions. The most expensive kernel should be
8+
; seen first and go into the last partition.
9+
10+
; CHECK0-NOT: define
11+
; CHECK0: define amdgpu_kernel void @C
12+
; CHECK0: define internal void @HelperC
13+
; CHECK0-NOT: define
14+
15+
; CHECK1-NOT: define
16+
; CHECK1: define amdgpu_kernel void @A
17+
; CHECK1: define internal void @HelperA
18+
; CHECK1-NOT: define
19+
20+
; CHECK2-NOT: define
21+
; CHECK2: define amdgpu_kernel void @B
22+
; CHECK2: define internal void @HelperB
23+
; CHECK2-NOT: define
24+
25+
26+
define amdgpu_kernel void @A() {
27+
call void @HelperA()
28+
ret void
29+
}
30+
31+
define internal void @HelperA() {
32+
ret void
33+
}
34+
35+
define amdgpu_kernel void @B(ptr %x) {
36+
store i64 42, ptr %x
37+
store i64 43, ptr %x
38+
store i64 44, ptr %x
39+
call void @HelperB()
40+
ret void
41+
}
42+
43+
define internal void @HelperB() {
44+
ret void
45+
}
46+
47+
define amdgpu_kernel void @C() {
48+
call void @HelperC()
49+
ret void
50+
}
51+
52+
define internal void @HelperC() {
53+
ret void
54+
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
2+
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
3+
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
4+
; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
5+
6+
; 3 kernels, each with their own dependencies, should go into 3
7+
; distinct partitions.
8+
9+
; CHECK0-NOT: define
10+
; CHECK0: define amdgpu_kernel void @C
11+
; CHECK0: define internal void @HelperC
12+
; CHECK0-NOT: define
13+
14+
; CHECK1-NOT: define
15+
; CHECK1: define amdgpu_kernel void @B
16+
; CHECK1: define internal void @HelperB
17+
; CHECK1-NOT: define
18+
19+
; CHECK2-NOT: define
20+
; CHECK2: define amdgpu_kernel void @A
21+
; CHECK2: define internal void @HelperA
22+
; CHECK2-NOT: define
23+
24+
25+
define amdgpu_kernel void @A() {
26+
call void @HelperA()
27+
ret void
28+
}
29+
30+
define internal void @HelperA() {
31+
ret void
32+
}
33+
34+
define amdgpu_kernel void @B() {
35+
call void @HelperB()
36+
ret void
37+
}
38+
39+
define internal void @HelperB() {
40+
ret void
41+
}
42+
43+
define amdgpu_kernel void @C() {
44+
call void @HelperC()
45+
ret void
46+
}
47+
48+
define internal void @HelperC() {
49+
ret void
50+
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
2+
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
3+
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
4+
; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
5+
6+
; 3 kernels share a common helper; that helper should be
7+
; cloned in all partitions.
8+
9+
; CHECK0-NOT: define
10+
; CHECK0: define internal void @Helper
11+
; CHECK0: define amdgpu_kernel void @C
12+
; CHECK0-NOT: define
13+
14+
; CHECK1-NOT: define
15+
; CHECK1: define internal void @Helper
16+
; CHECK1: define amdgpu_kernel void @B
17+
; CHECK1-NOT: define
18+
19+
; CHECK2-NOT: define
20+
; CHECK2: define internal void @Helper
21+
; CHECK2: define amdgpu_kernel void @A
22+
; CHECK2-NOT: define
23+
24+
define internal void @Helper() {
25+
ret void
26+
}
27+
28+
define amdgpu_kernel void @A() {
29+
call void @Helper()
30+
ret void
31+
}
32+
33+
define amdgpu_kernel void @B() {
34+
call void @Helper()
35+
ret void
36+
}
37+
38+
define amdgpu_kernel void @C() {
39+
call void @Helper()
40+
ret void
41+
}

0 commit comments

Comments
 (0)