Commit 1eb4f88

[x86] add test coverage for AMD Ryzen fast sqrt codegen; NFC
1 parent b8fbffc commit 1eb4f88
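
The new RUN lines cover AMD Zen 1 (znver1) and Zen 3 (znver3). With the tuning in place at this commit, both take the SLOW-SCALAR/ZN1/ZN3 paths shown below: instead of a single vsqrtss/vsqrtps, llc expands the fast-math sqrt into a vrsqrt* estimate refined by one Newton-Raphson step, followed by a compare-and-mask sequence that forces the result to 0.0 for zero inputs (and, in the no-DAZ functions, for denormal inputs as well). To inspect one of the checked outputs locally, the test's own RUN line can be replayed, e.g.:

  llc < llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll -mtriple=x86_64-- -mcpu=znver1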

File tree

1 file changed: +147 -0 lines changed


llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll

Lines changed: 147 additions & 0 deletions
@@ -3,6 +3,8 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s --check-prefixes=FAST-SCALAR,SNB
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=broadwell | FileCheck %s --check-prefixes=FAST-SCALAR,BDW
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=FAST-SCALAR,SKL
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=SLOW-SCALAR,ZN1
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=SLOW-SCALAR,ZN3
 
 define float @f32_no_daz(float %f) #0 {
 ; NHM-LABEL: f32_no_daz:
@@ -24,6 +26,19 @@ define float @f32_no_daz(float %f) #0 {
 ; FAST-SCALAR: # %bb.0:
 ; FAST-SCALAR-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
 ; FAST-SCALAR-NEXT: retq
+;
+; SLOW-SCALAR-LABEL: f32_no_daz:
+; SLOW-SCALAR: # %bb.0:
+; SLOW-SCALAR-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
+; SLOW-SCALAR-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
+; SLOW-SCALAR-NEXT: vmulss %xmm1, %xmm0, %xmm2
+; SLOW-SCALAR-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
+; SLOW-SCALAR-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; SLOW-SCALAR-NEXT: vandps %xmm3, %xmm0, %xmm0
+; SLOW-SCALAR-NEXT: vcmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; SLOW-SCALAR-NEXT: vmulss %xmm1, %xmm2, %xmm1
+; SLOW-SCALAR-NEXT: vandnps %xmm1, %xmm0, %xmm0
+; SLOW-SCALAR-NEXT: retq
   %call = tail call fast float @llvm.sqrt.f32(float %f) #2
   ret float %call
 }
@@ -80,6 +95,38 @@ define <4 x float> @v4f32_no_daz(<4 x float> %f) #0 {
 ; SKL: # %bb.0:
 ; SKL-NEXT: vsqrtps %xmm0, %xmm0
 ; SKL-NEXT: retq
+;
+; ZN1-LABEL: v4f32_no_daz:
+; ZN1: # %bb.0:
+; ZN1-NEXT: vrsqrtps %xmm0, %xmm1
+; ZN1-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; ZN1-NEXT: vbroadcastss {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN]
+; ZN1-NEXT: vmulps %xmm1, %xmm0, %xmm2
+; ZN1-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
+; ZN1-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; ZN1-NEXT: vandps %xmm4, %xmm0, %xmm0
+; ZN1-NEXT: vmulps %xmm1, %xmm2, %xmm1
+; ZN1-NEXT: vmulps %xmm3, %xmm1, %xmm1
+; ZN1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; ZN1-NEXT: vcmpleps %xmm0, %xmm3, %xmm0
+; ZN1-NEXT: vandps %xmm1, %xmm0, %xmm0
+; ZN1-NEXT: retq
+;
+; ZN3-LABEL: v4f32_no_daz:
+; ZN3: # %bb.0:
+; ZN3-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; ZN3-NEXT: vrsqrtps %xmm0, %xmm1
+; ZN3-NEXT: vbroadcastss {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN]
+; ZN3-NEXT: vmulps %xmm1, %xmm0, %xmm2
+; ZN3-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
+; ZN3-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; ZN3-NEXT: vandps %xmm4, %xmm0, %xmm0
+; ZN3-NEXT: vmulps %xmm1, %xmm2, %xmm1
+; ZN3-NEXT: vmulps %xmm3, %xmm1, %xmm1
+; ZN3-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; ZN3-NEXT: vcmpleps %xmm0, %xmm3, %xmm0
+; ZN3-NEXT: vandps %xmm1, %xmm0, %xmm0
+; ZN3-NEXT: retq
   %call = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #2
   ret <4 x float> %call
 }
@@ -151,6 +198,38 @@ define <8 x float> @v8f32_no_daz(<8 x float> %f) #0 {
 ; SKL: # %bb.0:
 ; SKL-NEXT: vsqrtps %ymm0, %ymm0
 ; SKL-NEXT: retq
+;
+; ZN1-LABEL: v8f32_no_daz:
+; ZN1: # %bb.0:
+; ZN1-NEXT: vrsqrtps %ymm0, %ymm1
+; ZN1-NEXT: vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; ZN1-NEXT: vbroadcastss {{.*#+}} ymm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; ZN1-NEXT: vmulps %ymm1, %ymm0, %ymm2
+; ZN1-NEXT: vandps %ymm4, %ymm0, %ymm0
+; ZN1-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3
+; ZN1-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; ZN1-NEXT: vmulps %ymm1, %ymm2, %ymm1
+; ZN1-NEXT: vmulps %ymm3, %ymm1, %ymm1
+; ZN1-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; ZN1-NEXT: vcmpleps %ymm0, %ymm3, %ymm0
+; ZN1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; ZN1-NEXT: retq
+;
+; ZN3-LABEL: v8f32_no_daz:
+; ZN3: # %bb.0:
+; ZN3-NEXT: vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; ZN3-NEXT: vrsqrtps %ymm0, %ymm1
+; ZN3-NEXT: vbroadcastss {{.*#+}} ymm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; ZN3-NEXT: vmulps %ymm1, %ymm0, %ymm2
+; ZN3-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3
+; ZN3-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; ZN3-NEXT: vandps %ymm4, %ymm0, %ymm0
+; ZN3-NEXT: vmulps %ymm1, %ymm2, %ymm1
+; ZN3-NEXT: vmulps %ymm3, %ymm1, %ymm1
+; ZN3-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
+; ZN3-NEXT: vcmpleps %ymm0, %ymm3, %ymm0
+; ZN3-NEXT: vandps %ymm1, %ymm0, %ymm0
+; ZN3-NEXT: retq
   %call = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #2
   ret <8 x float> %call
 }
@@ -177,6 +256,18 @@ define float @f32_daz(float %f) #1 {
 ; FAST-SCALAR: # %bb.0:
 ; FAST-SCALAR-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
 ; FAST-SCALAR-NEXT: retq
+;
+; SLOW-SCALAR-LABEL: f32_daz:
+; SLOW-SCALAR: # %bb.0:
+; SLOW-SCALAR-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
+; SLOW-SCALAR-NEXT: vmulss %xmm1, %xmm0, %xmm2
+; SLOW-SCALAR-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
+; SLOW-SCALAR-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; SLOW-SCALAR-NEXT: vmulss %xmm1, %xmm2, %xmm1
+; SLOW-SCALAR-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; SLOW-SCALAR-NEXT: vcmpeqss %xmm2, %xmm0, %xmm0
+; SLOW-SCALAR-NEXT: vandnps %xmm1, %xmm0, %xmm0
+; SLOW-SCALAR-NEXT: retq
   %call = tail call fast float @llvm.sqrt.f32(float %f) #2
   ret float %call
 }
@@ -228,6 +319,34 @@ define <4 x float> @v4f32_daz(<4 x float> %f) #1 {
 ; SKL: # %bb.0:
 ; SKL-NEXT: vsqrtps %xmm0, %xmm0
 ; SKL-NEXT: retq
+;
+; ZN1-LABEL: v4f32_daz:
+; ZN1: # %bb.0:
+; ZN1-NEXT: vrsqrtps %xmm0, %xmm1
+; ZN1-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; ZN1-NEXT: vmulps %xmm1, %xmm0, %xmm2
+; ZN1-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
+; ZN1-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; ZN1-NEXT: vmulps %xmm1, %xmm2, %xmm1
+; ZN1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ZN1-NEXT: vcmpneqps %xmm2, %xmm0, %xmm0
+; ZN1-NEXT: vmulps %xmm3, %xmm1, %xmm1
+; ZN1-NEXT: vandps %xmm1, %xmm0, %xmm0
+; ZN1-NEXT: retq
+;
+; ZN3-LABEL: v4f32_daz:
+; ZN3: # %bb.0:
+; ZN3-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; ZN3-NEXT: vrsqrtps %xmm0, %xmm1
+; ZN3-NEXT: vmulps %xmm1, %xmm0, %xmm2
+; ZN3-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
+; ZN3-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; ZN3-NEXT: vmulps %xmm1, %xmm2, %xmm1
+; ZN3-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ZN3-NEXT: vcmpneqps %xmm2, %xmm0, %xmm0
+; ZN3-NEXT: vmulps %xmm3, %xmm1, %xmm1
+; ZN3-NEXT: vandps %xmm1, %xmm0, %xmm0
+; ZN3-NEXT: retq
   %call = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #2
   ret <4 x float> %call
 }
@@ -290,6 +409,34 @@ define <8 x float> @v8f32_daz(<8 x float> %f) #1 {
 ; SKL: # %bb.0:
 ; SKL-NEXT: vsqrtps %ymm0, %ymm0
 ; SKL-NEXT: retq
+;
+; ZN1-LABEL: v8f32_daz:
+; ZN1: # %bb.0:
+; ZN1-NEXT: vrsqrtps %ymm0, %ymm1
+; ZN1-NEXT: vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; ZN1-NEXT: vmulps %ymm1, %ymm0, %ymm2
+; ZN1-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3
+; ZN1-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; ZN1-NEXT: vmulps %ymm1, %ymm2, %ymm1
+; ZN1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ZN1-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; ZN1-NEXT: vmulps %ymm3, %ymm1, %ymm1
+; ZN1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; ZN1-NEXT: retq
+;
+; ZN3-LABEL: v8f32_daz:
+; ZN3: # %bb.0:
+; ZN3-NEXT: vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
+; ZN3-NEXT: vrsqrtps %ymm0, %ymm1
+; ZN3-NEXT: vmulps %ymm1, %ymm0, %ymm2
+; ZN3-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3
+; ZN3-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; ZN3-NEXT: vmulps %ymm1, %ymm2, %ymm1
+; ZN3-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ZN3-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; ZN3-NEXT: vmulps %ymm3, %ymm1, %ymm1
+; ZN3-NEXT: vandps %ymm1, %ymm0, %ymm0
+; ZN3-NEXT: retq
   %call = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #2
   ret <8 x float> %call
 }
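
For readers of the checks above, here is a minimal scalar C sketch (not part of the commit; function and variable names are illustrative) of the computation the SLOW-SCALAR/ZN1/ZN3 blocks implement: one Newton-Raphson refinement of the hardware reciprocal-square-root estimate, arranged to match the -3.0 and -0.5 constants in the asm, plus the zero/denormal guard.

#include <float.h>
#include <math.h>
#include <stdio.h>

/* Placeholder standing in for the low-precision vrsqrtss/vrsqrtps
   hardware estimate; an exact value is used here for simplicity. */
static float rsqrt_est(float x) { return 1.0f / sqrtf(x); }

/* no-DAZ variant: zero and denormal inputs must yield 0.0
   (vandps with the NaN/0x7fffffff mask = fabs, vcmpltss vs
   FLT_MIN = 1.17549435E-38, then vandnps). */
static float fast_sqrt_no_daz(float x) {
    float r = rsqrt_est(x);      /* vrsqrtss                        */
    float y = x * r;             /* vmulss: y ~= sqrt(x)            */
    float t = y * r - 3.0f;      /* vfmadd213ss: x*r*r + (-3.0)     */
    float res = (y * -0.5f) * t; /* vmulss pair: -0.5*y*(x*r*r - 3) */
    return fabsf(x) < FLT_MIN ? 0.0f : res;
}

/* DAZ variant: denormals are already flushed to zero, so only an
   exact zero needs the guard (vxorps, vcmpeqss, vandnps). */
static float fast_sqrt_daz(float x) {
    float r = rsqrt_est(x);
    float y = x * r;
    float t = y * r - 3.0f;
    float res = (y * -0.5f) * t;
    return x == 0.0f ? 0.0f : res;
}

int main(void) {
    printf("%f %f\n", fast_sqrt_no_daz(2.0f), fast_sqrt_daz(2.0f));
    return 0;
}

The vector ZN1/ZN3 blocks compute the same expression per lane; the vcmpleps/vcmpneqps result ANDed with the refined value is the branch-free form of the ternary above.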
