llvm
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
Lines changed: 79 additions & 89 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
Lines changed: 79 additions & 89 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll
Lines changed: 18 additions & 12 deletions b/‎llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll
Lines changed: 18 additions & 12 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll
Lines changed: 5 additions & 4 deletions b/‎llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll
Lines changed: 5 additions & 4 deletions
@@ -91,18 +91,16 @@ class AMDGPULibCalls {
   // sqrt
   bool fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
 
+  bool insertSinCos(CallInst *Sin, CallInst *Cos, IRBuilder<> &B,
+                    const FuncInfo &FInfo);
+
   // sin/cos
   bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
 
   // __read_pipe/__write_pipe
   bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                             const FuncInfo &FInfo);
 
-  // Get insertion point at entry.
-  BasicBlock::iterator getEntryIns(CallInst * UI);
-  // Insert an Alloc instruction.
-  AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);
-
   // Get a scalar native builtin single argument FP function
   FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);
 
@@ -1153,6 +1151,71 @@ bool AMDGPULibCalls::fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B,
   return false;
 }
 
+bool AMDGPULibCalls::insertSinCos(CallInst *Sin, CallInst *Cos, IRBuilder<> &B,
+                                  const FuncInfo &fInfo) {
+  Value *Arg = Sin->getOperand(0);
+  assert(Arg == Cos->getOperand(0));
+  // Returns true once both calls are replaced, false if sincos is missing.
+  Function *F = B.GetInsertBlock()->getParent();
+  Module *M = F->getParent();
+  // Merge the sin and cos calls into one sincos call: the call's result
+  // replaces Sin and the value stored through its pointer replaces Cos.
+  // Look up sincos with its pointer kind set to the flat address space:
+  // for OpenCL 2.0 we have only a generic implementation of sincos.
+  // FIXME: This is not true anymore
+  AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
+  nf.getLeads()[0].PtrKind =
+      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
+  FunctionCallee Fsincos = getFunction(M, nf);
+  if (!Fsincos)
+    return false;
+
+  B.SetInsertPointPastAllocas(F);
+
+  DILocation *MergedDebugLoc =
+      DILocation::getMergedLocation(Sin->getDebugLoc(), Cos->getDebugLoc());
+  B.SetCurrentDebugLocation(MergedDebugLoc);
+
+  AllocaInst *Alloc = B.CreateAlloca(Sin->getType(), nullptr, "__sincos_");
+
+  if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
+    // An instruction argument must dominate all of its uses, so emit the
+    // sincos call right after it. For an argument or constant, the point
+    // right after the allocas (set above) works well enough.
+    // NOTE(review): the slot after a PHI may not be a valid spot - confirm.
+    B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());
+    B.SetCurrentDebugLocation(MergedDebugLoc);
+  }
+
+  Value *P = Alloc;
+  Type *PTy = Fsincos.getFunctionType()->getParamType(1);
+  // The alloca allocates the memory in the private address space. It needs
+  // to be addrspacecast to the address space of sincos' pointer parameter.
+  // In OpenCL 2.0 this is generic, while in 1.2 it is private.
+  if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+    P = B.CreateAddrSpaceCast(Alloc, PTy);
+
+  // Keep only the fast-math flags that both calls have in common.
+  FastMathFlags FMF = cast<FPMathOperator>(Sin)->getFastMathFlags();
+  FMF &= cast<FPMathOperator>(Cos)->getFastMathFlags();
+  B.setFastMathFlags(FMF);
+
+  CallInst *Call = CreateCallEx2(B, Fsincos, Arg, P);
+  LoadInst *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
+  Reload->setDebugLoc(Cos->getDebugLoc());
+
+  LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *Sin << ", " << *Cos
+                    << ") with " << *Call << '\n');
+
+  Sin->replaceAllUsesWith(Call);
+  Sin->eraseFromParent();
+
+  Cos->replaceAllUsesWith(Reload);
+  Cos->eraseFromParent();
+
+  return true;
+}
+
 // fold sin, cos -> sincos.
 bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
                                  const FuncInfo &fInfo) {
@@ -1168,106 +1231,33 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
 
   Value *CArgVal = FPOp->getOperand(0);
   CallInst *CI = cast<CallInst>(FPOp);
-  BasicBlock * const CBB = CI->getParent();
-
-  int const MaxScan = 30;
   bool Changed = false;
 
-  Module *M = CI->getModule();
   FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
                        fInfo);
   const std::string PairName = PartnerInfo.mangle();
 
   CallInst *UI = nullptr;
+
+  // TODO: Handle repeated uses, the generic implementation does.
   for (User* U : CArgVal->users()) {
-    CallInst *XI = dyn_cast_or_null<CallInst>(U);
-    if (!XI || XI == CI || XI->getParent() != CBB)
+    CallInst *XI = dyn_cast<CallInst>(U);
+    if (!XI || XI->isNoBuiltin())
       continue;
 
     Function *UCallee = XI->getCalledFunction();
-    if (!UCallee || !UCallee->getName().equals(PairName))
-      continue;
-
-    BasicBlock::iterator BBI = CI->getIterator();
-    if (BBI == CI->getParent()->begin())
-      break;
-    --BBI;
-    for (int I = MaxScan; I > 0 && BBI != CBB->begin(); --BBI, --I) {
-      if (cast<Instruction>(BBI) == XI) {
-        UI = XI;
-        break;
-      }
-    }
-    if (UI) break;
+    if (UCallee && UCallee->getName().equals(PairName))
+      UI = XI;
+    else if (UI)
+      return Changed;
   }
 
   if (!UI)
     return Changed;
 
-  // Merge the sin and cos.
-
-  // for OpenCL 2.0 we have only generic implementation of sincos
-  // function.
-  AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
-  nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
-  FunctionCallee Fsincos = getFunction(M, nf);
-  if (!Fsincos)
-    return Changed;
-
-  BasicBlock::iterator ItOld = B.GetInsertPoint();
-  AllocaInst *Alloc = insertAlloca(UI, B, "__sincos_");
-  B.SetInsertPoint(UI);
-
-  Value *P = Alloc;
-  Type *PTy = Fsincos.getFunctionType()->getParamType(1);
-  // The allocaInst allocates the memory in private address space. This need
-  // to be bitcasted to point to the address space of cos pointer type.
-  // In OpenCL 2.0 this is generic, while in 1.2 that is private.
-  if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
-    P = B.CreateAddrSpaceCast(Alloc, PTy);
-  CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
-
-  LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI << ") with "
-                    << *Call << "\n");
-
-  if (!isSin) { // CI->cos, UI->sin
-    B.SetInsertPoint(&*ItOld);
-    UI->replaceAllUsesWith(&*Call);
-    Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
-    CI->replaceAllUsesWith(Reload);
-    UI->eraseFromParent();
-    CI->eraseFromParent();
-  } else { // CI->sin, UI->cos
-    Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
-    UI->replaceAllUsesWith(Reload);
-    CI->replaceAllUsesWith(Call);
-    UI->eraseFromParent();
-    CI->eraseFromParent();
-  }
-  return true;
-}
-
-// Get insertion point at entry.
-BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
-  Function * Func = UI->getParent()->getParent();
-  BasicBlock * BB = &Func->getEntryBlock();
-  assert(BB && "Entry block not found!");
-  BasicBlock::iterator ItNew = BB->begin();
-  return ItNew;
-}
-
-// Insert a AllocsInst at the beginning of function entry block.
-AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
-                                         const char *prefix) {
-  BasicBlock::iterator ItNew = getEntryIns(UI);
-  Function *UCallee = UI->getCalledFunction();
-  Type *RetType = UCallee->getReturnType();
-  B.SetInsertPoint(&*ItNew);
-  AllocaInst *Alloc =
-      B.CreateAlloca(RetType, nullptr, std::string(prefix) + UI->getName());
-  Alloc->setAlignment(
-      Align(UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType)));
-  return Alloc;
+  CallInst *Sin = isSin ? CI : UI;
+  CallInst *Cos = isSin ? UI : CI;
+  return insertSinCos(Sin, Cos, B, fInfo) || Changed;
 }
 
 bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
 
@@ -105,10 +105,12 @@ define void @sincos_f32(float %x, ptr addrspace(1) nocapture writeonly %sin_out,
 ; CHECK-LABEL: define void @sincos_f32
 ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]])
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -123,10 +125,12 @@ define void @sincos_f32_value_is_same_constantfp(ptr addrspace(1) nocapture writ
 ; CHECK-LABEL: define void @sincos_f32_value_is_same_constantfp
 ; CHECK-SAME: (ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float 4.200000e+01)
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float 4.200000e+01)
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float 4.200000e+01, ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -150,10 +154,12 @@ define void @sincos_v2f32(<2 x float> %x, ptr addrspace(1) nocapture writeonly %
 ; CHECK-LABEL: define void @sincos_v2f32
 ; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract <2 x float> @_Z3sinDv2_f(<2 x float> [[X]])
-; CHECK-NEXT:    store <2 x float> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 8
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> [[X]])
-; CHECK-NEXT:    store <2 x float> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 8
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
 
@@ -56,10 +56,11 @@ define void @sincos_f32(float %x, ptr addrspace(1) nocapture writeonly %sin_out,
 ; CHECK-LABEL: define void @sincos_f32
 ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]])
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[__SINCOS_]])
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry: