Skip to content

Commit a3efc53

Browse files
authored
[AIX][TLS] Produce a faster local-exec access sequence for the "aix-small-tls" global variable attribute (llvm#83053)
Similar to 3f46e54, this patch allows the backend to produce a faster access sequence for the local-exec TLS model, where loading from the TOC can be avoided, for local-exec TLS variables that are annotated with the "aix-small-tls" attribute. The expectation is for local-exec TLS variables to be set with this attribute through PGO. Furthermore, the optimized access sequence is generated for local-exec TLS variables annotated with "aix-small-tls" only if they are less than ~32KB in size.
1 parent eacda36 commit a3efc53

File tree

5 files changed

+416
-15
lines changed

5 files changed

+416
-15
lines changed

llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7558,6 +7558,16 @@ static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
75587558
DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0));
75597559
}
75607560

7561+
// Check if an SDValue has the 'aix-small-tls' global variable attribute.
// Returns true only when Val is a GlobalAddressSDNode whose referenced global
// is a GlobalVariable carrying the "aix-small-tls" attribute. Any other value
// (not a global address, or a global that is not a variable) yields false.
7562+
static bool hasAIXSmallTLSAttr(SDValue Val) {
7563+
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Val))
7564+
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(GA->getGlobal()))
7565+
if (GV->hasAttribute("aix-small-tls"))
7566+
return true;
7567+
7568+
return false;
7569+
}
7570+
75617571
// Is an ADDI eligible for folding for non-TOC-based local-exec accesses?
75627572
static bool isEligibleToFoldADDIForLocalExecAccesses(SelectionDAG *DAG,
75637573
SDValue ADDIToFold) {
@@ -7567,20 +7577,25 @@ static bool isEligibleToFoldADDIForLocalExecAccesses(SelectionDAG *DAG,
75677577
(ADDIToFold.getMachineOpcode() != PPC::ADDI8))
75687578
return false;
75697579

7580+
// Folding is only allowed for the AIX small-local-exec TLS target attribute
7581+
// or when the 'aix-small-tls' global variable attribute is present.
7582+
const PPCSubtarget &Subtarget =
7583+
DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
7584+
SDValue TLSVarNode = ADDIToFold.getOperand(1);
7585+
if (!(Subtarget.hasAIXSmallLocalExecTLS() || hasAIXSmallTLSAttr(TLSVarNode)))
7586+
return false;
7587+
75707588
// The first operand of the ADDIToFold should be the thread pointer.
75717589
// This transformation is only performed if the first operand of the
75727590
// addi is the thread pointer.
75737591
SDValue TPRegNode = ADDIToFold.getOperand(0);
75747592
RegisterSDNode *TPReg = dyn_cast<RegisterSDNode>(TPRegNode.getNode());
7575-
const PPCSubtarget &Subtarget =
7576-
DAG->getMachineFunction().getSubtarget<PPCSubtarget>();
75777593
if (!TPReg || (TPReg->getReg() != Subtarget.getThreadPointerRegister()))
75787594
return false;
75797595

75807596
// The second operand of the ADDIToFold should be the global TLS address
75817597
// (the local-exec TLS variable). We only perform the folding if the TLS
75827598
// variable is the second operand.
7583-
SDValue TLSVarNode = ADDIToFold.getOperand(1);
75847599
GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(TLSVarNode);
75857600
if (!GA)
75867601
return false;
@@ -7649,7 +7664,6 @@ static void foldADDIForLocalExecAccesses(SDNode *N, SelectionDAG *DAG) {
76497664

76507665
void PPCDAGToDAGISel::PeepholePPC64() {
76517666
SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
7652-
bool HasAIXSmallLocalExecTLS = Subtarget->hasAIXSmallLocalExecTLS();
76537667

76547668
while (Position != CurDAG->allnodes_begin()) {
76557669
SDNode *N = &*--Position;
@@ -7661,8 +7675,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
76617675
reduceVSXSwap(N, CurDAG);
76627676

76637677
// This optimization is performed for non-TOC-based local-exec accesses.
7664-
if (HasAIXSmallLocalExecTLS)
7665-
foldADDIForLocalExecAccesses(N, CurDAG);
7678+
foldADDIForLocalExecAccesses(N, CurDAG);
76667679

76677680
unsigned FirstOp;
76687681
unsigned StorageOpcode = N->getMachineOpcode();
@@ -7821,8 +7834,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
78217834
ImmOpnd.getValueType());
78227835
} else if (Offset != 0) {
78237836
// This optimization is performed for non-TOC-based local-exec accesses.
7824-
if (HasAIXSmallLocalExecTLS &&
7825-
isEligibleToFoldADDIForLocalExecAccesses(CurDAG, Base)) {
7837+
if (isEligibleToFoldADDIForLocalExecAccesses(CurDAG, Base)) {
78267838
// Add the non-zero offset information into the load or store
78277839
// instruction to be used for non-TOC-based local-exec accesses.
78287840
GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd);

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3367,15 +3367,21 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
33673367
const GlobalValue *GV = GA->getGlobal();
33683368
EVT PtrVT = getPointerTy(DAG.getDataLayout());
33693369
bool Is64Bit = Subtarget.isPPC64();
3370-
bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
33713370
TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
33723371
bool IsTLSLocalExecModel = Model == TLSModel::LocalExec;
33733372

33743373
if (IsTLSLocalExecModel || Model == TLSModel::InitialExec) {
3374+
bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
3375+
bool HasAIXSmallTLSGlobalAttr = false;
33753376
SDValue VariableOffsetTGA =
33763377
DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_FLAG);
33773378
SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);
33783379
SDValue TLSReg;
3380+
3381+
if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
3382+
if (GVar->hasAttribute("aix-small-tls"))
3383+
HasAIXSmallTLSGlobalAttr = true;
3384+
33793385
if (Is64Bit) {
33803386
// For local-exec and initial-exec on AIX (64-bit), the sequence generated
33813387
// involves a load of the variable offset (from the TOC), followed by an
@@ -3385,14 +3391,16 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
33853391
// add reg2, reg1, r13 // r13 contains the thread pointer
33863392
TLSReg = DAG.getRegister(PPC::X13, MVT::i64);
33873393

3388-
// With the -maix-small-local-exec-tls option, produce a faster access
3389-
// sequence for local-exec TLS variables where the offset from the TLS
3390-
// base is encoded as an immediate operand.
3394+
// With the -maix-small-local-exec-tls option, or with the "aix-small-tls"
3395+
// global variable attribute, produce a faster access sequence for
3396+
// local-exec TLS variables where the offset from the TLS base is encoded
3397+
// as an immediate operand.
33913398
//
33923399
// We only utilize the faster local-exec access sequence when the TLS
33933400
// variable has a size within the policy limit. We treat types that are
33943401
// not sized or are empty as being over the policy size limit.
3395-
if (HasAIXSmallLocalExecTLS && IsTLSLocalExecModel) {
3402+
if ((HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr) &&
3403+
IsTLSLocalExecModel) {
33963404
Type *GVType = GV->getValueType();
33973405
if (GVType->isSized() && !GVType->isEmptyTy() &&
33983406
GV->getParent()->getDataLayout().getTypeAllocSize(GVType) <=
@@ -3410,8 +3418,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
34103418
TLSReg = DAG.getNode(PPCISD::GET_TPOINTER, dl, PtrVT);
34113419

34123420
// We do not implement the 32-bit version of the faster access sequence
3413-
// for local-exec that is controlled by -maix-small-local-exec-tls.
3414-
if (HasAIXSmallLocalExecTLS)
3421+
// for local-exec that is controlled by the -maix-small-local-exec-tls
3422+
// option, or the "aix-small-tls" global variable attribute.
3423+
if (HasAIXSmallLocalExecTLS || HasAIXSmallTLSGlobalAttr)
34153424
report_fatal_error("The small-local-exec TLS access sequence is "
34163425
"currently only supported on AIX (64-bit mode).");
34173426
}
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
2+
; RUN: -mtriple powerpc64-ibm-aix-xcoff < %s \
3+
; RUN: | FileCheck %s --check-prefixes=COMMONCM,CHECK-SMALLCM64
4+
; RUN: llc -verify-machineinstrs -mcpu=pwr7 -ppc-asm-full-reg-names \
5+
; RUN: -mtriple powerpc64-ibm-aix-xcoff --code-model=large \
6+
; RUN: < %s | FileCheck %s --check-prefixes=COMMONCM,CHECK-LARGECM64
7+
8+
@mySmallTLS = thread_local(localexec) global [7800 x i64] zeroinitializer, align 8 #0
9+
@mySmallTLS2 = thread_local(localexec) global [3000 x i64] zeroinitializer, align 8 #0
10+
@mySmallTLS3 = thread_local(localexec) global [3000 x i64] zeroinitializer, align 8
11+
declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
12+
13+
; All accesses use a "faster" local-exec sequence directly off the thread pointer,
14+
; except for mySmallTLS, as this variable is over the 32KB size limit.
15+
define i64 @StoreLargeAccess1() #1 {
16+
; COMMONCM-LABEL: StoreLargeAccess1:
17+
; COMMONCM-NEXT: # %bb.0: # %entry
18+
; CHECK-SMALLCM64: ld r3, L..C0(r2) # target-flags(ppc-tprel) @mySmallTLS
19+
; CHECK-SMALLCM64-NEXT: li r4, 0
20+
; CHECK-SMALLCM64-NEXT: li r5, 23
21+
; CHECK-LARGECM64: addis r3, L..C0@u(r2)
22+
; CHECK-LARGECM64-NEXT: li r4, 0
23+
; CHECK-LARGECM64-NEXT: li r5, 23
24+
; CHECK-LARGECM64-NEXT: ld r3, L..C0@l(r3)
25+
; COMMONCM: ori r4, r4, 53328
26+
; COMMONCM-NEXT: add r3, r13, r3
27+
; COMMONCM-NEXT: stdx r5, r3, r4
28+
; COMMONCM-NEXT: li r3, 55
29+
; COMMONCM-NEXT: li r4, 64
30+
; COMMONCM-NEXT: std r3, (mySmallTLS2[TL]@le+696)-65536(r13)
31+
; COMMONCM-NEXT: li r3, 142
32+
; COMMONCM-NEXT: std r4, (mySmallTLS3[TL]@le+20000)-131072(r13)
33+
; COMMONCM-NEXT: blr
34+
entry:
35+
%tls0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallTLS)
36+
%arrayidx = getelementptr inbounds i8, ptr %tls0, i32 53328
37+
store i64 23, ptr %arrayidx, align 8
38+
%tls1 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallTLS2)
39+
%arrayidx1 = getelementptr inbounds i8, ptr %tls1, i32 696
40+
store i64 55, ptr %arrayidx1, align 8
41+
%tls2 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallTLS3)
42+
%arrayidx2 = getelementptr inbounds i8, ptr %tls2, i32 20000
43+
store i64 64, ptr %arrayidx2, align 8
44+
%load1 = load i64, ptr %arrayidx, align 8
45+
%load2 = load i64, ptr %arrayidx1, align 8
46+
%add1 = add i64 %load1, 64
47+
%add2 = add i64 %add1, %load2
48+
ret i64 %add2
49+
}
50+
51+
; Since this function does not have the 'aix-small-local-exec-tls' attribute,
52+
; only some local-exec variables should have the small-local-exec TLS access
53+
; sequence (as opposed to all of them).
54+
define i64 @StoreLargeAccess2() {
55+
; COMMONCM-LABEL: StoreLargeAccess2:
56+
; COMMONCM-NEXT: # %bb.0: # %entry
57+
; CHECK-SMALLCM64: ld r5, L..C0(r2) # target-flags(ppc-tprel) @mySmallTLS
58+
; CHECK-SMALLCM64-NEXT: li r3, 0
59+
; CHECK-SMALLCM64-NEXT: li r4, 23
60+
; CHECK-SMALLCM64-NEXT: ori r3, r3, 53328
61+
; CHECK-SMALLCM64-NEXT: add r5, r13, r5
62+
; CHECK-SMALLCM64-NEXT: stdx r4, r5, r3
63+
; CHECK-SMALLCM64-NEXT: ld r5, L..C1(r2) # target-flags(ppc-tprel) @mySmallTLS3
64+
; CHECK-SMALLCM64-NEXT: li r3, 55
65+
; CHECK-SMALLCM64-NEXT: li r4, 64
66+
; CHECK-SMALLCM64-NEXT: std r3, mySmallTLS2[TL]@le+696(r13)
67+
; CHECK-SMALLCM64-NEXT: li r3, 142
68+
; CHECK-SMALLCM64-NEXT: add r5, r13, r5
69+
; CHECK-SMALLCM64-NEXT: std r4, 20000(r5)
70+
; CHECK-LARGECM64: addis r3, L..C0@u(r2)
71+
; CHECK-LARGECM64-NEXT: li r4, 0
72+
; CHECK-LARGECM64-NEXT: li r5, 23
73+
; CHECK-LARGECM64-NEXT: ld r3, L..C0@l(r3)
74+
; CHECK-LARGECM64-NEXT: ori r4, r4, 53328
75+
; CHECK-LARGECM64-NEXT: add r3, r13, r3
76+
; CHECK-LARGECM64-NEXT: stdx r5, r3, r4
77+
; CHECK-LARGECM64-NEXT: addis r3, L..C1@u(r2)
78+
; CHECK-LARGECM64-NEXT: li r4, 55
79+
; CHECK-LARGECM64-NEXT: li r5, 64
80+
; CHECK-LARGECM64-NEXT: ld r3, L..C1@l(r3)
81+
; CHECK-LARGECM64-NEXT: std r4, mySmallTLS2[TL]@le+696(r13)
82+
; CHECK-LARGECM64-NEXT: add r3, r13, r3
83+
; CHECK-LARGECM64-NEXT: std r5, 20000(r3)
84+
; CHECK-LARGECM64-NEXT: li r3, 142
85+
; COMMONCM-NEXT: blr
86+
;
87+
entry:
88+
%tls0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallTLS)
89+
%arrayidx = getelementptr inbounds i8, ptr %tls0, i32 53328
90+
store i64 23, ptr %arrayidx, align 8
91+
%tls1 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallTLS2)
92+
%arrayidx1 = getelementptr inbounds i8, ptr %tls1, i32 696
93+
store i64 55, ptr %arrayidx1, align 8
94+
%tls2 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallTLS3)
95+
%arrayidx2 = getelementptr inbounds i8, ptr %tls2, i32 20000
96+
store i64 64, ptr %arrayidx2, align 8
97+
%load1 = load i64, ptr %arrayidx, align 8
98+
%load2 = load i64, ptr %arrayidx1, align 8
99+
%add1 = add i64 %load1, 64
100+
%add2 = add i64 %add1, %load2
101+
ret i64 %add2
102+
}
103+
104+
attributes #0 = { "aix-small-tls" }
105+
attributes #1 = { "target-features"="+aix-small-local-exec-tls" }

0 commit comments

Comments
 (0)