Skip to content

Commit 652bcf6

Browse files
authored
CodeGenPrepare: Add support for llvm.threadlocal.address address-mode sinking (#87844)
Depending on the TLS model, many thread-local accesses on x86 can be expressed by adding a %fs: segment register to an addressing mode. Even if there are multiple users of a `llvm.threadlocal.address` intrinsic, it is generally not worth sharing the value in a register; it is better to fold the %fs access into multiple addressing modes instead. Hence this changes CodeGenPrepare to duplicate the `llvm.threadlocal.address` intrinsic as necessary. It also introduces a new `TargetLowering::addressingModeSupportsTLS` callback that allows targets to indicate whether TLS accesses can be part of an addressing mode. This fixes a performance problem: the folding of TLS accesses into multiple addressing modes happened naturally before the introduction of the `llvm.threadlocal.address` intrinsic, but regressed because `SelectionDAG` keeps values in registers when they are accessed across basic blocks, so CodeGenPrepare needs to duplicate the intrinsic to mitigate this. We see a ~0.5% recovery in a codebase with heavy TLS usage (HHVM). This fixes most of #87437.
1 parent db2f64e commit 652bcf6

File tree

6 files changed

+336
-4
lines changed

6 files changed

+336
-4
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2805,6 +2805,12 @@ class TargetLoweringBase {
28052805
Type *Ty, unsigned AddrSpace,
28062806
Instruction *I = nullptr) const;
28072807

2808+
/// Returns true if the target's addressing mode can target thread local
2809+
/// storage (TLS).
2810+
virtual bool addressingModeSupportsTLS(const GlobalValue &) const {
2811+
return false;
2812+
}
2813+
28082814
/// Return the preferred common base offset.
28092815
virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset,
28102816
int64_t MaxOffset) const {

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5082,6 +5082,15 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
50825082
}
50835083
return true;
50845084
}
5085+
case Instruction::Call:
5086+
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(AddrInst)) {
5087+
if (II->getIntrinsicID() == Intrinsic::threadlocal_address) {
5088+
GlobalValue &GV = cast<GlobalValue>(*II->getArgOperand(0));
5089+
if (TLI.addressingModeSupportsTLS(GV))
5090+
return matchAddr(AddrInst->getOperand(0), Depth);
5091+
}
5092+
}
5093+
break;
50855094
}
50865095
return false;
50875096
}
@@ -5620,11 +5629,16 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
56205629
return Modified;
56215630
}
56225631

5623-
if (AddrMode.BaseGV) {
5632+
GlobalValue *BaseGV = AddrMode.BaseGV;
5633+
if (BaseGV != nullptr) {
56245634
if (ResultPtr)
56255635
return Modified;
56265636

5627-
ResultPtr = AddrMode.BaseGV;
5637+
if (BaseGV->isThreadLocal()) {
5638+
ResultPtr = Builder.CreateThreadLocalAddress(BaseGV);
5639+
} else {
5640+
ResultPtr = BaseGV;
5641+
}
56285642
}
56295643

56305644
// If the real base value actually came from an inttoptr, then the matcher
@@ -5789,8 +5803,15 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
57895803
}
57905804

57915805
// Add in the BaseGV if present.
5792-
if (AddrMode.BaseGV) {
5793-
Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
5806+
GlobalValue *BaseGV = AddrMode.BaseGV;
5807+
if (BaseGV != nullptr) {
5808+
Value *BaseGVPtr;
5809+
if (BaseGV->isThreadLocal()) {
5810+
BaseGVPtr = Builder.CreateThreadLocalAddress(BaseGV);
5811+
} else {
5812+
BaseGVPtr = BaseGV;
5813+
}
5814+
Value *V = Builder.CreatePtrToInt(BaseGVPtr, IntPtrTy, "sunkaddr");
57945815
if (Result)
57955816
Result = Builder.CreateAdd(Result, V, "sunkaddr");
57965817
else

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18920,6 +18920,30 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
1892018920
llvm_unreachable("TLS not implemented for this target.");
1892118921
}
1892218922

18923+
bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
18924+
if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
18925+
const TargetMachine &TM = getTargetMachine();
18926+
TLSModel::Model Model = TM.getTLSModel(&GV);
18927+
switch (Model) {
18928+
case TLSModel::LocalExec:
18929+
case TLSModel::InitialExec:
18930+
// We can include the %fs segment register in addressing modes.
18931+
return true;
18932+
case TLSModel::LocalDynamic:
18933+
case TLSModel::GeneralDynamic:
18934+
// These models do not result in %fs relative addresses unless
18935+
// TLS descriptors are used.
18936+
//
18937+
// Even in the case of TLS descriptors we currently have no way to model
18938+
// the difference between %fs access and the computations needed for the
18939+
// offset and returning `true` for TLS-desc currently duplicates both
18940+
// which is detrimental :-/
18941+
return false;
18942+
}
18943+
}
18944+
return false;
18945+
}
18946+
1892318947
/// Lower SRA_PARTS and friends, which return two i32 values
1892418948
/// and take a 2 x i32 value to shift plus a shift amount.
1892518949
/// TODO: Can this be moved to general expansion code?

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1323,6 +1323,8 @@ namespace llvm {
13231323
Type *Ty, unsigned AS,
13241324
Instruction *I = nullptr) const override;
13251325

1326+
bool addressingModeSupportsTLS(const GlobalValue &GV) const override;
1327+
13261328
/// Return true if the specified immediate is legal
13271329
/// icmp immediate, that is the target has icmp instructions which can
13281330
/// compare a register against the immediate without having to materialize
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -o - %s | FileCheck %s --check-prefix=NOPIC
3+
; RUN: llc -o - %s -relocation-model=pic | FileCheck %s --check-prefix=PIC
4+
; RUN: llc -o - %s -relocation-model=pic -enable-tlsdesc | FileCheck %s --check-prefix=TLSDESC
5+
6+
target triple = "x86_64--linux-gnu"
7+
8+
declare void @effect()
9+
declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
10+
11+
@foo_local = dso_local thread_local(localexec) global i32 0, align 4
12+
13+
define i32 @func_local_tls(i32 %arg0, i64 %arg1) nounwind {
14+
; NOPIC-LABEL: func_local_tls:
15+
; NOPIC: # %bb.0: # %entry
16+
; NOPIC-NEXT: pushq %rbp
17+
; NOPIC-NEXT: pushq %rbx
18+
; NOPIC-NEXT: pushq %rax
19+
; NOPIC-NEXT: movl %fs:foo_local@TPOFF, %ebp
20+
; NOPIC-NEXT: testl %edi, %edi
21+
; NOPIC-NEXT: movl %ebp, %eax
22+
; NOPIC-NEXT: jne .LBB0_2
23+
; NOPIC-NEXT: # %bb.1: # %if.then
24+
; NOPIC-NEXT: movq %rsi, %rbx
25+
; NOPIC-NEXT: callq effect@PLT
26+
; NOPIC-NEXT: movl %fs:foo_local@TPOFF+168(,%rbx,4), %eax
27+
; NOPIC-NEXT: .LBB0_2: # %if.end
28+
; NOPIC-NEXT: addl %ebp, %eax
29+
; NOPIC-NEXT: addq $8, %rsp
30+
; NOPIC-NEXT: popq %rbx
31+
; NOPIC-NEXT: popq %rbp
32+
; NOPIC-NEXT: retq
33+
;
34+
; PIC-LABEL: func_local_tls:
35+
; PIC: # %bb.0: # %entry
36+
; PIC-NEXT: pushq %rbp
37+
; PIC-NEXT: pushq %r14
38+
; PIC-NEXT: pushq %rbx
39+
; PIC-NEXT: movl %fs:.Lfoo_local$local@TPOFF, %ebp
40+
; PIC-NEXT: testl %edi, %edi
41+
; PIC-NEXT: movl %ebp, %eax
42+
; PIC-NEXT: jne .LBB0_2
43+
; PIC-NEXT: # %bb.1: # %if.then
44+
; PIC-NEXT: movq %rsi, %rbx
45+
; PIC-NEXT: movq %fs:0, %rax
46+
; PIC-NEXT: leaq .Lfoo_local$local@TPOFF(%rax), %r14
47+
; PIC-NEXT: callq effect@PLT
48+
; PIC-NEXT: movl 168(%r14,%rbx,4), %eax
49+
; PIC-NEXT: .LBB0_2: # %if.end
50+
; PIC-NEXT: addl %ebp, %eax
51+
; PIC-NEXT: popq %rbx
52+
; PIC-NEXT: popq %r14
53+
; PIC-NEXT: popq %rbp
54+
; PIC-NEXT: retq
55+
;
56+
; TLSDESC-LABEL: func_local_tls:
57+
; TLSDESC: # %bb.0: # %entry
58+
; TLSDESC-NEXT: pushq %rbp
59+
; TLSDESC-NEXT: pushq %r14
60+
; TLSDESC-NEXT: pushq %rbx
61+
; TLSDESC-NEXT: movl %fs:.Lfoo_local$local@TPOFF, %ebp
62+
; TLSDESC-NEXT: testl %edi, %edi
63+
; TLSDESC-NEXT: movl %ebp, %eax
64+
; TLSDESC-NEXT: jne .LBB0_2
65+
; TLSDESC-NEXT: # %bb.1: # %if.then
66+
; TLSDESC-NEXT: movq %rsi, %rbx
67+
; TLSDESC-NEXT: movq %fs:0, %rax
68+
; TLSDESC-NEXT: leaq .Lfoo_local$local@TPOFF(%rax), %r14
69+
; TLSDESC-NEXT: callq effect@PLT
70+
; TLSDESC-NEXT: movl 168(%r14,%rbx,4), %eax
71+
; TLSDESC-NEXT: .LBB0_2: # %if.end
72+
; TLSDESC-NEXT: addl %ebp, %eax
73+
; TLSDESC-NEXT: popq %rbx
74+
; TLSDESC-NEXT: popq %r14
75+
; TLSDESC-NEXT: popq %rbp
76+
; TLSDESC-NEXT: retq
77+
entry:
78+
%addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo_local)
79+
%load0 = load i32, ptr %addr, align 4
80+
%cond = icmp eq i32 %arg0, 0
81+
br i1 %cond, label %if.then, label %if.end
82+
83+
if.then:
84+
tail call void @effect()
85+
%x = add i64 %arg1, 42
86+
%addr1 = getelementptr inbounds i32, ptr %addr, i64 %x
87+
%load1 = load i32, ptr %addr1, align 4
88+
br label %if.end
89+
90+
if.end:
91+
%phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
92+
%ret = add i32 %phi, %load0
93+
ret i32 %ret
94+
}
95+
96+
@foo_nonlocal = thread_local global i32 0, align 4
97+
98+
define i32 @func_nonlocal_tls(i32 %arg0, i64 %arg1) nounwind {
99+
; NOPIC-LABEL: func_nonlocal_tls:
100+
; NOPIC: # %bb.0: # %entry
101+
; NOPIC-NEXT: pushq %rbp
102+
; NOPIC-NEXT: pushq %r14
103+
; NOPIC-NEXT: pushq %rbx
104+
; NOPIC-NEXT: movq foo_nonlocal@GOTTPOFF(%rip), %r14
105+
; NOPIC-NEXT: movl %fs:(%r14), %ebp
106+
; NOPIC-NEXT: testl %edi, %edi
107+
; NOPIC-NEXT: movl %ebp, %eax
108+
; NOPIC-NEXT: jne .LBB1_2
109+
; NOPIC-NEXT: # %bb.1: # %if.then
110+
; NOPIC-NEXT: movq %rsi, %rbx
111+
; NOPIC-NEXT: callq effect@PLT
112+
; NOPIC-NEXT: movl %fs:168(%r14,%rbx,4), %eax
113+
; NOPIC-NEXT: .LBB1_2: # %if.end
114+
; NOPIC-NEXT: addl %ebp, %eax
115+
; NOPIC-NEXT: popq %rbx
116+
; NOPIC-NEXT: popq %r14
117+
; NOPIC-NEXT: popq %rbp
118+
; NOPIC-NEXT: retq
119+
;
120+
; PIC-LABEL: func_nonlocal_tls:
121+
; PIC: # %bb.0: # %entry
122+
; PIC-NEXT: pushq %rbp
123+
; PIC-NEXT: pushq %r15
124+
; PIC-NEXT: pushq %r14
125+
; PIC-NEXT: pushq %rbx
126+
; PIC-NEXT: pushq %rax
127+
; PIC-NEXT: movq %rsi, %rbx
128+
; PIC-NEXT: movl %edi, %ebp
129+
; PIC-NEXT: data16
130+
; PIC-NEXT: leaq foo_nonlocal@TLSGD(%rip), %rdi
131+
; PIC-NEXT: data16
132+
; PIC-NEXT: data16
133+
; PIC-NEXT: rex64
134+
; PIC-NEXT: callq __tls_get_addr@PLT
135+
; PIC-NEXT: movq %rax, %r14
136+
; PIC-NEXT: movl (%rax), %r15d
137+
; PIC-NEXT: testl %ebp, %ebp
138+
; PIC-NEXT: movl %r15d, %eax
139+
; PIC-NEXT: jne .LBB1_2
140+
; PIC-NEXT: # %bb.1: # %if.then
141+
; PIC-NEXT: callq effect@PLT
142+
; PIC-NEXT: movl 168(%r14,%rbx,4), %eax
143+
; PIC-NEXT: .LBB1_2: # %if.end
144+
; PIC-NEXT: addl %r15d, %eax
145+
; PIC-NEXT: addq $8, %rsp
146+
; PIC-NEXT: popq %rbx
147+
; PIC-NEXT: popq %r14
148+
; PIC-NEXT: popq %r15
149+
; PIC-NEXT: popq %rbp
150+
; PIC-NEXT: retq
151+
;
152+
; TLSDESC-LABEL: func_nonlocal_tls:
153+
; TLSDESC: # %bb.0: # %entry
154+
; TLSDESC-NEXT: pushq %rbp
155+
; TLSDESC-NEXT: pushq %r14
156+
; TLSDESC-NEXT: pushq %rbx
157+
; TLSDESC-NEXT: leaq foo_nonlocal@tlsdesc(%rip), %rax
158+
; TLSDESC-NEXT: callq *foo_nonlocal@tlscall(%rax)
159+
; TLSDESC-NEXT: movl %fs:(%rax), %ebp
160+
; TLSDESC-NEXT: testl %edi, %edi
161+
; TLSDESC-NEXT: movl %ebp, %ecx
162+
; TLSDESC-NEXT: jne .LBB1_2
163+
; TLSDESC-NEXT: # %bb.1: # %if.then
164+
; TLSDESC-NEXT: movq %rsi, %rbx
165+
; TLSDESC-NEXT: addq %fs:0, %rax
166+
; TLSDESC-NEXT: movq %rax, %r14
167+
; TLSDESC-NEXT: callq effect@PLT
168+
; TLSDESC-NEXT: movl 168(%r14,%rbx,4), %ecx
169+
; TLSDESC-NEXT: .LBB1_2: # %if.end
170+
; TLSDESC-NEXT: addl %ebp, %ecx
171+
; TLSDESC-NEXT: movl %ecx, %eax
172+
; TLSDESC-NEXT: popq %rbx
173+
; TLSDESC-NEXT: popq %r14
174+
; TLSDESC-NEXT: popq %rbp
175+
; TLSDESC-NEXT: retq
176+
entry:
177+
%addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo_nonlocal)
178+
%load0 = load i32, ptr %addr, align 4
179+
%cond = icmp eq i32 %arg0, 0
180+
br i1 %cond, label %if.then, label %if.end
181+
182+
if.then:
183+
tail call void @effect()
184+
%x = add i64 %arg1, 42
185+
%addr1 = getelementptr inbounds i32, ptr %addr, i64 %x
186+
%load1 = load i32, ptr %addr1, align 4
187+
br label %if.end
188+
189+
if.end:
190+
%phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
191+
%ret = add i32 %phi, %load0
192+
ret i32 %ret
193+
}
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2+
; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' %s | FileCheck %s
3+
4+
target triple = "x86_64--linux-gnu"
5+
6+
@foo = dso_local thread_local(localexec) global i32 0, align 4
7+
8+
declare void @effect()
9+
declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
10+
11+
define i32 @func0(i32 %arg) {
12+
; CHECK-LABEL: define i32 @func0(
13+
; CHECK-SAME: i32 [[ARG:%.*]]) {
14+
; CHECK-NEXT: entry:
15+
; CHECK-NEXT: [[ADDR:%.*]] = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
16+
; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[ADDR]], align 4
17+
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[ARG]], 0
18+
; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
19+
; CHECK: if.then:
20+
; CHECK-NEXT: tail call void @effect()
21+
; CHECK-NEXT: [[TMP0:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo)
22+
; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[TMP0]], align 4
23+
; CHECK-NEXT: br label [[IF_END]]
24+
; CHECK: if.end:
25+
; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LOAD1]], [[IF_THEN]] ], [ [[LOAD0]], [[ENTRY:%.*]] ]
26+
; CHECK-NEXT: [[RET:%.*]] = add i32 [[PHI]], [[LOAD0]]
27+
; CHECK-NEXT: ret i32 [[RET]]
28+
;
29+
entry:
30+
%addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
31+
%load0 = load i32, ptr %addr, align 4
32+
%cond = icmp eq i32 %arg, 0
33+
br i1 %cond, label %if.then, label %if.end
34+
35+
if.then:
36+
tail call void @effect()
37+
%load1 = load i32, ptr %addr, align 4
38+
br label %if.end
39+
40+
if.end:
41+
%phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
42+
%ret = add i32 %phi, %load0
43+
ret i32 %ret
44+
}
45+
46+
define i32 @func1(i32 %arg0, i32 %arg1) {
47+
; CHECK-LABEL: define i32 @func1(
48+
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) {
49+
; CHECK-NEXT: entry:
50+
; CHECK-NEXT: [[ADDR:%.*]] = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
51+
; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[ADDR]], align 4
52+
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[ARG0]], 0
53+
; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
54+
; CHECK: if.then:
55+
; CHECK-NEXT: tail call void @effect()
56+
; CHECK-NEXT: [[X:%.*]] = add i32 [[ARG1]], 42
57+
; CHECK-NEXT: [[X64:%.*]] = sext i32 [[X]] to i64
58+
; CHECK-NEXT: [[TMP0:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo)
59+
; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[X64]], 4
60+
; CHECK-NEXT: [[ADDR1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[SUNKADDR]]
61+
; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[ADDR1]], align 4
62+
; CHECK-NEXT: br label [[IF_END]]
63+
; CHECK: if.end:
64+
; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LOAD1]], [[IF_THEN]] ], [ [[LOAD0]], [[ENTRY:%.*]] ]
65+
; CHECK-NEXT: [[RET:%.*]] = add i32 [[PHI]], [[LOAD0]]
66+
; CHECK-NEXT: ret i32 [[RET]]
67+
;
68+
entry:
69+
%addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
70+
%load0 = load i32, ptr %addr, align 4
71+
%cond = icmp eq i32 %arg0, 0
72+
br i1 %cond, label %if.then, label %if.end
73+
74+
if.then:
75+
tail call void @effect()
76+
%x = add i32 %arg1, 42
77+
%x64 = sext i32 %x to i64
78+
%addr1 = getelementptr inbounds i32, ptr %addr, i64 %x64
79+
%load1 = load i32, ptr %addr1, align 4
80+
br label %if.end
81+
82+
if.end:
83+
%phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
84+
%ret = add i32 %phi, %load0
85+
ret i32 %ret
86+
}

0 commit comments

Comments
 (0)