Skip to content

Commit b1bcb7c

Browse files
committed
Reapply "AMDGPU: Move attributor into optimization pipeline (#83131)" and follow up commit "clang/AMDGPU: Defeat attribute optimization in attribute test" (#98851)
This reverts commit adaff46. Drop the -O3 checks from default-attributes.hip. I don't know why they are different on some bots but reverting this is far too disruptive.
1 parent 71051de commit b1bcb7c

File tree

562 files changed

+86435
-90198
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

562 files changed

+86435
-90198
lines changed

clang/test/CodeGenHIP/default-attributes.hip

Lines changed: 19 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,55 +2,44 @@
22
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device \
33
// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=OPTNONE %s
44

5-
// RUN: %clang_cc1 -O3 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device \
6-
// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=OPT %s
7-
85
#define __device__ __attribute__((device))
96
#define __global__ __attribute__((global))
107

8+
//.
9+
// OPTNONE: @__hip_cuid_ = addrspace(1) global i8 0
10+
// OPTNONE: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata"
11+
// OPTNONE: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
12+
//.
13+
__device__ void extern_func();
14+
1115
// OPTNONE: Function Attrs: convergent mustprogress noinline nounwind optnone
1216
// OPTNONE-LABEL: define {{[^@]+}}@_Z4funcv
1317
// OPTNONE-SAME: () #[[ATTR0:[0-9]+]] {
1418
// OPTNONE-NEXT: entry:
19+
// OPTNONE-NEXT: call void @_Z11extern_funcv() #[[ATTR3:[0-9]+]]
1520
// OPTNONE-NEXT: ret void
1621
//
17-
// OPT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
18-
// OPT-LABEL: define {{[^@]+}}@_Z4funcv
19-
// OPT-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] {
20-
// OPT-NEXT: entry:
21-
// OPT-NEXT: ret void
22-
//
2322
__device__ void func() {
24-
23+
extern_func();
2524
}
2625

2726
// OPTNONE: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
2827
// OPTNONE-LABEL: define {{[^@]+}}@_Z6kernelv
29-
// OPTNONE-SAME: () #[[ATTR1:[0-9]+]] {
28+
// OPTNONE-SAME: () #[[ATTR2:[0-9]+]] {
3029
// OPTNONE-NEXT: entry:
30+
// OPTNONE-NEXT: call void @_Z11extern_funcv() #[[ATTR3]]
3131
// OPTNONE-NEXT: ret void
3232
//
33-
// OPT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
34-
// OPT-LABEL: define {{[^@]+}}@_Z6kernelv
35-
// OPT-SAME: () local_unnamed_addr #[[ATTR1:[0-9]+]] {
36-
// OPT-NEXT: entry:
37-
// OPT-NEXT: ret void
38-
//
3933
__global__ void kernel() {
40-
34+
extern_func();
4135
}
4236
//.
43-
// OPTNONE: attributes #0 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
44-
// OPTNONE: attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
45-
//.
46-
// OPT: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
47-
// OPT: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
48-
//.
49-
// OPTNONE: !0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
50-
// OPTNONE: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
51-
// OPTNONE: !2 = !{i32 1, !"wchar_size", i32 4}
37+
// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
38+
// OPTNONE: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
39+
// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
40+
// OPTNONE: attributes #[[ATTR3]] = { convergent nounwind }
5241
//.
53-
// OPT: !0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
54-
// OPT: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
55-
// OPT: !2 = !{i32 1, !"wchar_size", i32 4}
42+
// OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
43+
// OPTNONE: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
44+
// OPTNONE: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
5645
//.

llvm/docs/ReleaseNotes.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,10 @@ Changes to the AMDGPU Backend
139139
:ref:`atomicrmw <i_atomicrmw>` instruction with `fadd`, `fmin` and
140140
`fmax` with addrspace(3) instead.
141141

142+
* AMDGPUAttributor is no longer run as part of the codegen pass
143+
pipeline. It is expected to run as part of the middle end
144+
optimizations.
145+
142146
Changes to the ARM Backend
143147
--------------------------
144148

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
731731
PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
732732
});
733733

734+
// FIXME: Why is AMDGPUAttributor not in CGSCC?
735+
PB.registerOptimizerLastEPCallback(
736+
[this](ModulePassManager &MPM, OptimizationLevel Level) {
737+
if (Level != OptimizationLevel::O0) {
738+
MPM.addPass(AMDGPUAttributorPass(*this));
739+
}
740+
});
741+
734742
PB.registerFullLinkTimeOptimizationLastEPCallback(
735743
[this](ModulePassManager &PM, OptimizationLevel Level) {
736744
// We want to support the -lto-partitions=N option as "best effort".
@@ -1037,11 +1045,6 @@ void AMDGPUPassConfig::addIRPasses() {
10371045
addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
10381046
}
10391047

1040-
// AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
1041-
// after their introduction
1042-
if (TM.getOptLevel() > CodeGenOptLevel::None)
1043-
addPass(createAMDGPUAttributorLegacyPass());
1044-
10451048
if (TM.getOptLevel() > CodeGenOptLevel::None)
10461049
addPass(createInferAddressSpacesPass());
10471050

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -679,6 +679,12 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
679679
break;
680680
}
681681
}
682+
683+
// FIXME: We can spill incoming arguments and restore at the end of the
684+
// prolog.
685+
if (!ScratchWaveOffsetReg)
686+
report_fatal_error(
687+
"could not find temporary scratch offset register in prolog");
682688
} else {
683689
ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
684690
}

llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
66
; GFX11-LABEL: s_add_u64:
77
; GFX11: ; %bb.0: ; %entry
88
; GFX11-NEXT: s_clause 0x1
9-
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
10-
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
9+
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
10+
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
1111
; GFX11-NEXT: v_mov_b32_e32 v2, 0
1212
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1313
; GFX11-NEXT: s_add_u32 s0, s6, s0
@@ -22,8 +22,8 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
2222
; GFX12-LABEL: s_add_u64:
2323
; GFX12: ; %bb.0: ; %entry
2424
; GFX12-NEXT: s_clause 0x1
25-
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
26-
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
25+
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
26+
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
2727
; GFX12-NEXT: v_mov_b32_e32 v2, 0
2828
; GFX12-NEXT: s_wait_kmcnt 0x0
2929
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
@@ -58,8 +58,8 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
5858
; GFX11-LABEL: s_sub_u64:
5959
; GFX11: ; %bb.0: ; %entry
6060
; GFX11-NEXT: s_clause 0x1
61-
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
62-
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
61+
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
62+
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
6363
; GFX11-NEXT: v_mov_b32_e32 v2, 0
6464
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
6565
; GFX11-NEXT: s_sub_u32 s0, s6, s0
@@ -74,8 +74,8 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
7474
; GFX12-LABEL: s_sub_u64:
7575
; GFX12: ; %bb.0: ; %entry
7676
; GFX12-NEXT: s_clause 0x1
77-
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
78-
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
77+
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
78+
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
7979
; GFX12-NEXT: v_mov_b32_e32 v2, 0
8080
; GFX12-NEXT: s_wait_kmcnt 0x0
8181
; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1]

0 commit comments

Comments
 (0)