Skip to content

Commit 54bda79

Browse files
committed
AMDGPU: Simplify and improve sincos matching
The first trivial example I tried failed to merge because of the user-scan logic. Remove the complicated scan over users, along with its distance threshold and same-block restriction. The actual expansion of sincos is basically the same size as that of sin or cos individually. Copy the technique the generic optimization uses: insert the call immediately after the instruction that defines the input, or, when the input is not an instruction, at the start of the entry block (after the allocas). https://reviews.llvm.org/D156706
1 parent 660b740 commit 54bda79

File tree

5 files changed

+370
-317
lines changed

5 files changed

+370
-317
lines changed

llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp

Lines changed: 79 additions & 89 deletions
Original file line number | Diff line number | Diff line change
@@ -91,18 +91,16 @@ class AMDGPULibCalls {
9191
// sqrt
9292
bool fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
9393

94+
bool insertSinCos(CallInst *Sin, CallInst *Cos, IRBuilder<> &B,
95+
const FuncInfo &FInfo);
96+
9497
// sin/cos
9598
bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
9699

97100
// __read_pipe/__write_pipe
98101
bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
99102
const FuncInfo &FInfo);
100103

101-
// Get insertion point at entry.
102-
BasicBlock::iterator getEntryIns(CallInst * UI);
103-
// Insert an Alloc instruction.
104-
AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);
105-
106104
// Get a scalar native builtin single argument FP function
107105
FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);
108106

@@ -1153,6 +1151,71 @@ bool AMDGPULibCalls::fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B,
11531151
return false;
11541152
}
11551153

1154+
bool AMDGPULibCalls::insertSinCos(CallInst *Sin, CallInst *Cos, IRBuilder<> &B,
1155+
const FuncInfo &fInfo) {
1156+
Value *Arg = Sin->getOperand(0);
1157+
assert(Arg == Cos->getOperand(0));
1158+
1159+
Function *F = B.GetInsertBlock()->getParent();
1160+
Module *M = F->getParent();
1161+
// Merge the sin and cos.
1162+
1163+
// For OpenCL 2.0 we have only a generic implementation of the sincos
1164+
// function.
1165+
// FIXME: This is not true anymore
1166+
AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
1167+
nf.getLeads()[0].PtrKind =
1168+
AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
1169+
FunctionCallee Fsincos = getFunction(M, nf);
1170+
if (!Fsincos)
1171+
return false;
1172+
1173+
B.SetInsertPointPastAllocas(F);
1174+
1175+
DILocation *MergedDebugLoc =
1176+
DILocation::getMergedLocation(Sin->getDebugLoc(), Cos->getDebugLoc());
1177+
B.SetCurrentDebugLocation(MergedDebugLoc);
1178+
1179+
AllocaInst *Alloc = B.CreateAlloca(Sin->getType(), nullptr, "__sincos_");
1180+
1181+
if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
1182+
// If the argument is an instruction, it must dominate all uses so put our
1183+
// sincos call there. Otherwise, right after the allocas works well enough
1184+
// if it's an argument or constant.
1185+
1186+
B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());
1187+
B.SetCurrentDebugLocation(MergedDebugLoc);
1188+
}
1189+
1190+
Value *P = Alloc;
1191+
Type *PTy = Fsincos.getFunctionType()->getParamType(1);
1192+
// The allocaInst allocates the memory in private address space. This needs
1193+
// to be address-space cast to the address space of the cos pointer type.
1194+
// In OpenCL 2.0 this is generic, while in 1.2 that is private.
1195+
if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
1196+
P = B.CreateAddrSpaceCast(Alloc, PTy);
1197+
1198+
// Intersect the two sets of flags.
1199+
FastMathFlags FMF = cast<FPMathOperator>(Sin)->getFastMathFlags();
1200+
FMF &= cast<FPMathOperator>(Cos)->getFastMathFlags();
1201+
B.setFastMathFlags(FMF);
1202+
1203+
CallInst *Call = CreateCallEx2(B, Fsincos, Arg, P);
1204+
LoadInst *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
1205+
Reload->setDebugLoc(Cos->getDebugLoc());
1206+
1207+
LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *Sin << ", " << *Cos
1208+
<< ") with " << *Call << '\n');
1209+
1210+
Sin->replaceAllUsesWith(Call);
1211+
Sin->eraseFromParent();
1212+
1213+
Cos->replaceAllUsesWith(Reload);
1214+
Cos->eraseFromParent();
1215+
1216+
return true;
1217+
}
1218+
11561219
// fold sin, cos -> sincos.
11571220
bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
11581221
const FuncInfo &fInfo) {
@@ -1168,106 +1231,33 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
11681231

11691232
Value *CArgVal = FPOp->getOperand(0);
11701233
CallInst *CI = cast<CallInst>(FPOp);
1171-
BasicBlock * const CBB = CI->getParent();
1172-
1173-
int const MaxScan = 30;
11741234
bool Changed = false;
11751235

1176-
Module *M = CI->getModule();
11771236
FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
11781237
fInfo);
11791238
const std::string PairName = PartnerInfo.mangle();
11801239

11811240
CallInst *UI = nullptr;
1241+
1242+
// TODO: Handle repeated uses, the generic implementation does.
11821243
for (User* U : CArgVal->users()) {
1183-
CallInst *XI = dyn_cast_or_null<CallInst>(U);
1184-
if (!XI || XI == CI || XI->getParent() != CBB)
1244+
CallInst *XI = dyn_cast<CallInst>(U);
1245+
if (!XI || XI->isNoBuiltin())
11851246
continue;
11861247

11871248
Function *UCallee = XI->getCalledFunction();
1188-
if (!UCallee || !UCallee->getName().equals(PairName))
1189-
continue;
1190-
1191-
BasicBlock::iterator BBI = CI->getIterator();
1192-
if (BBI == CI->getParent()->begin())
1193-
break;
1194-
--BBI;
1195-
for (int I = MaxScan; I > 0 && BBI != CBB->begin(); --BBI, --I) {
1196-
if (cast<Instruction>(BBI) == XI) {
1197-
UI = XI;
1198-
break;
1199-
}
1200-
}
1201-
if (UI) break;
1249+
if (UCallee && UCallee->getName().equals(PairName))
1250+
UI = XI;
1251+
else if (UI)
1252+
return Changed;
12021253
}
12031254

12041255
if (!UI)
12051256
return Changed;
12061257

1207-
// Merge the sin and cos.
1208-
1209-
// for OpenCL 2.0 we have only generic implementation of sincos
1210-
// function.
1211-
AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
1212-
nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
1213-
FunctionCallee Fsincos = getFunction(M, nf);
1214-
if (!Fsincos)
1215-
return Changed;
1216-
1217-
BasicBlock::iterator ItOld = B.GetInsertPoint();
1218-
AllocaInst *Alloc = insertAlloca(UI, B, "__sincos_");
1219-
B.SetInsertPoint(UI);
1220-
1221-
Value *P = Alloc;
1222-
Type *PTy = Fsincos.getFunctionType()->getParamType(1);
1223-
// The allocaInst allocates the memory in private address space. This need
1224-
// to be bitcasted to point to the address space of cos pointer type.
1225-
// In OpenCL 2.0 this is generic, while in 1.2 that is private.
1226-
if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
1227-
P = B.CreateAddrSpaceCast(Alloc, PTy);
1228-
CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
1229-
1230-
LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI << ") with "
1231-
<< *Call << "\n");
1232-
1233-
if (!isSin) { // CI->cos, UI->sin
1234-
B.SetInsertPoint(&*ItOld);
1235-
UI->replaceAllUsesWith(&*Call);
1236-
Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
1237-
CI->replaceAllUsesWith(Reload);
1238-
UI->eraseFromParent();
1239-
CI->eraseFromParent();
1240-
} else { // CI->sin, UI->cos
1241-
Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
1242-
UI->replaceAllUsesWith(Reload);
1243-
CI->replaceAllUsesWith(Call);
1244-
UI->eraseFromParent();
1245-
CI->eraseFromParent();
1246-
}
1247-
return true;
1248-
}
1249-
1250-
// Get insertion point at entry.
1251-
BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
1252-
Function * Func = UI->getParent()->getParent();
1253-
BasicBlock * BB = &Func->getEntryBlock();
1254-
assert(BB && "Entry block not found!");
1255-
BasicBlock::iterator ItNew = BB->begin();
1256-
return ItNew;
1257-
}
1258-
1259-
// Insert a AllocsInst at the beginning of function entry block.
1260-
AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
1261-
const char *prefix) {
1262-
BasicBlock::iterator ItNew = getEntryIns(UI);
1263-
Function *UCallee = UI->getCalledFunction();
1264-
Type *RetType = UCallee->getReturnType();
1265-
B.SetInsertPoint(&*ItNew);
1266-
AllocaInst *Alloc =
1267-
B.CreateAlloca(RetType, nullptr, std::string(prefix) + UI->getName());
1268-
Alloc->setAlignment(
1269-
Align(UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType)));
1270-
return Alloc;
1258+
CallInst *Sin = isSin ? CI : UI;
1259+
CallInst *Cos = isSin ? UI : CI;
1260+
return insertSinCos(Sin, Cos, B, fInfo) || Changed;
12711261
}
12721262

12731263
bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,12 @@ define void @sincos_f32(float %x, ptr addrspace(1) nocapture writeonly %sin_out,
105105
; CHECK-LABEL: define void @sincos_f32
106106
; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
107107
; CHECK-NEXT: entry:
108-
; CHECK-NEXT: [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]])
109-
; CHECK-NEXT: store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
110-
; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
111-
; CHECK-NEXT: store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
108+
; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
109+
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
110+
; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
111+
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
112+
; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
113+
; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
112114
; CHECK-NEXT: ret void
113115
;
114116
entry:
@@ -123,10 +125,12 @@ define void @sincos_f32_value_is_same_constantfp(ptr addrspace(1) nocapture writ
123125
; CHECK-LABEL: define void @sincos_f32_value_is_same_constantfp
124126
; CHECK-SAME: (ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
125127
; CHECK-NEXT: entry:
126-
; CHECK-NEXT: [[CALL:%.*]] = tail call contract float @_Z3sinf(float 4.200000e+01)
127-
; CHECK-NEXT: store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
128-
; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float 4.200000e+01)
129-
; CHECK-NEXT: store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
128+
; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
129+
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
130+
; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float 4.200000e+01, ptr [[TMP0]])
131+
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
132+
; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
133+
; CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
130134
; CHECK-NEXT: ret void
131135
;
132136
entry:
@@ -150,10 +154,12 @@ define void @sincos_v2f32(<2 x float> %x, ptr addrspace(1) nocapture writeonly %
150154
; CHECK-LABEL: define void @sincos_v2f32
151155
; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
152156
; CHECK-NEXT: entry:
153-
; CHECK-NEXT: [[CALL:%.*]] = tail call contract <2 x float> @_Z3sinDv2_f(<2 x float> [[X]])
154-
; CHECK-NEXT: store <2 x float> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 8
155-
; CHECK-NEXT: [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> [[X]])
156-
; CHECK-NEXT: store <2 x float> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 8
157+
; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
158+
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
159+
; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
160+
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
161+
; CHECK-NEXT: store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
162+
; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
157163
; CHECK-NEXT: ret void
158164
;
159165
entry:

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,11 @@ define void @sincos_f32(float %x, ptr addrspace(1) nocapture writeonly %sin_out,
5656
; CHECK-LABEL: define void @sincos_f32
5757
; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
5858
; CHECK-NEXT: entry:
59-
; CHECK-NEXT: [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]])
60-
; CHECK-NEXT: store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
61-
; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
62-
; CHECK-NEXT: store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
59+
; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4
60+
; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[__SINCOS_]])
61+
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[__SINCOS_]], align 4
62+
; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
63+
; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
6364
; CHECK-NEXT: ret void
6465
;
6566
entry:

0 commit comments

Comments
 (0)