add amd_vrs16_tanhf support. remove sinh test declares as amdlibm doesn't support it

farzonl · farzonl · commit bc94a99b8804 · 2024-07-17T14:22:22.000-04:00
https://github.com/amd/aocl-libm-ose/blob/9c0b67293ba01e509a6308247d82a8f1adfbbc67/scripts/libalm.def#L224
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -1331,6 +1331,11 @@ TLI_DEFINE_VECFUNC("llvm.cosh.f32", "amd_vrs8_coshf", FIXED(8), NOMASK, "_ZGV_LL
 
 TLI_DEFINE_VECFUNC("tanhf", "amd_vrs4_tanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("tanhf", "amd_vrs8_tanhf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("tanhf", "amd_vrs16_tanhf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
+
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "amd_vrs4_tanhf",  FIXED(4),  NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "amd_vrs8_tanhf",  FIXED(8),  NOMASK, "_ZGV_LLVM_N8v")
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "amd_vrs16_tanhf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v")
 
 TLI_DEFINE_VECFUNC("cbrt", "amd_vrd2_cbrt", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("cbrtf", "amd_vrs4_cbrtf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll
@@ -39,14 +39,7 @@ declare float @coshf(float) #0
 declare double @llvm.cosh.f64(double) #0
 declare float @llvm.cosh.f32(float) #0
 
-declare double @sinh(double) #0
-declare float @sinhf(float) #0
-declare double @llvm.sinh.f64(double) #0
-declare float @llvm.sinh.f32(float) #0
-
-declare double @tanh(double) #0
 declare float @tanhf(float) #0
-declare double @llvm.tanh.f64(double) #0
 declare float @llvm.tanh.f32(float) #0
 
 declare double @pow(double, double) #0
@@ -303,6 +296,10 @@ define void @tan_f64(ptr nocapture %varray) {
 ; CHECK:    [[TMP5:%.*]] = call <4 x double> @amd_vrd4_tan(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
+; CHECK-AVX-VF2-LABEL: @tan_f64(
+; CHECK-AVX-VF2:    [[TMP5:%.*]] = call <2 x double> @amd_vrd2_tan(<2 x double> [[TMP4:%.*]])
+; CHECK-AVX-VF2:    ret void
+;
 ; CHECK-AVX512-VF8-LABEL: @tan_f64(
 ; CHECK-AVX512-VF8:    [[TMP5:%.*]] = call <8 x double> @amd_vrd8_tan(<8 x double> [[TMP4:%.*]])
 ; CHECK-AVX512-VF8:    ret void
@@ -357,6 +354,10 @@ define void @tan_f64_intrinsic(ptr nocapture %varray) {
 ; CHECK:    [[TMP5:%.*]] = call <4 x double> @amd_vrd4_tan(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
+; CHECK-AVX-VF2-LABEL: @tan_f64_intrinsic(
+; CHECK-AVX-VF2:    [[TMP5:%.*]] = call <2 x double> @amd_vrd2_tan(<2 x double> [[TMP4:%.*]])
+; CHECK-AVX-VF2:    ret void
+;
 ; CHECK-AVX512-VF8-LABEL: @tan_f64_intrinsic(
 ; CHECK-AVX512-VF8:    [[TMP5:%.*]] = call <8 x double> @amd_vrd8_tan(<8 x double> [[TMP4:%.*]])
 ; CHECK-AVX512-VF8:    ret void
@@ -565,6 +566,10 @@ define void @atan_f64(ptr nocapture %varray) {
 ; CHECK:    [[TMP5:%.*]] = call <4 x double> @amd_vrd4_atan(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
+; CHECK-AVX-VF2-LABEL: @atan_f64(
+; CHECK-AVX-VF2:    [[TMP5:%.*]] = call <2 x double> @amd_vrd2_atan(<2 x double> [[TMP4:%.*]])
+; CHECK-AVX-VF2:    ret void
+;
 ; CHECK-AVX512-VF8-LABEL: @atan_f64(
 ; CHECK-AVX512-VF8:    [[TMP5:%.*]] = call <8 x double> @amd_vrd8_atan(<8 x double> [[TMP4:%.*]])
 ; CHECK-AVX512-VF8:    ret void
@@ -619,6 +624,10 @@ define void @atan_f64_intrinsic(ptr nocapture %varray) {
 ; CHECK:    [[TMP5:%.*]] = call <4 x double> @amd_vrd4_atan(<4 x double> [[TMP4:%.*]])
 ; CHECK:    ret void
 ;
+; CHECK-AVX-VF2-LABEL: @atan_f64_intrinsic(
+; CHECK-AVX-VF2:    [[TMP5:%.*]] = call <2 x double> @amd_vrd2_atan(<2 x double> [[TMP4:%.*]])
+; CHECK-AVX-VF2:    ret void
+;
 ; CHECK-AVX512-VF8-LABEL: @atan_f64_intrinsic(
 ; CHECK-AVX512-VF8:    [[TMP5:%.*]] = call <8 x double> @amd_vrd8_atan(<8 x double> [[TMP4:%.*]])
 ; CHECK-AVX512-VF8:    ret void
@@ -760,6 +769,60 @@ for.end:
   ret void
 }
 
+define void @tanh_f32(ptr nocapture %varray) {
+; CHECK-LABEL: @tanh_f32(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanhf(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+; CHECK-AVX512-VF16-LABEL: @tanh_f32(
+; CHECK-AVX512-VF16:    [[TMP5:%.*]] = call <16 x float> @amd_vrs16_tanhf(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @tanhf(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @tanh_f32_intrinsic(ptr nocapture %varray) {
+; CHECK-LABEL: @tanh_f32_intrinsic(
+; CHECK:    [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanhf(<4 x float> [[TMP4:%.*]])
+; CHECK:    ret void
+;
+; CHECK-AVX512-VF16-LABEL: @tanh_f32_intrinsic(
+; CHECK-AVX512-VF16:    [[TMP5:%.*]] = call <16 x float> @amd_vrs16_tanhf(<16 x float> [[TMP4:%.*]])
+; CHECK-AVX512-VF16:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %tmp = trunc i64 %iv to i32
+  %conv = sitofp i32 %tmp to float
+  %call = tail call float @llvm.tanh.f32(float %conv)
+  %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv
+  store float %call, ptr %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) {
 ; CHECK-LABEL: @pow_f64(
 ; CHECK:    [[TMP8:%.*]] = call <4 x double> @amd_vrd4_pow(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])