1
1
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2
- ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
2
+ ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
3
+ ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s
3
4
4
5
define amdgpu_ps float @_amdgpu_ps_main () #0 {
5
- ; GCN-LABEL: _amdgpu_ps_main:
6
- ; GCN: ; %bb.0: ; %.entry
7
- ; GCN-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D
8
- ; GCN-NEXT: v_mov_b32_e32 v4, 0
9
- ; GCN-NEXT: s_waitcnt vmcnt(0)
10
- ; GCN-NEXT: s_clause 0x1
11
- ; GCN-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
12
- ; GCN-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
13
- ; GCN-NEXT: s_waitcnt vmcnt(0)
14
- ; GCN-NEXT: image_load_mip v4, v[2:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm
15
- ; GCN-NEXT: s_clause 0x3
16
- ; GCN-NEXT: s_buffer_load_dword s24, s[0:3], 0x5c
17
- ; GCN-NEXT: s_buffer_load_dword s28, s[0:3], 0x7c
18
- ; GCN-NEXT: s_buffer_load_dword s29, s[0:3], 0xc0
19
- ; GCN-NEXT: s_waitcnt_depctr 0xffe3
20
- ; GCN-NEXT: s_nop 0
21
- ; GCN-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x40
22
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
23
- ; GCN-NEXT: s_clause 0x1
24
- ; GCN-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x50
25
- ; GCN-NEXT: s_nop 0
26
- ; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x2c
27
- ; GCN-NEXT: v_sub_f32_e64 v5, s24, s28
28
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
29
- ; GCN-NEXT: s_clause 0x4
30
- ; GCN-NEXT: s_buffer_load_dwordx4 s[8:11], s[0:3], 0x60
31
- ; GCN-NEXT: s_buffer_load_dwordx4 s[12:15], s[0:3], 0x20
32
- ; GCN-NEXT: s_buffer_load_dwordx4 s[16:19], s[0:3], 0x0
33
- ; GCN-NEXT: s_buffer_load_dwordx4 s[20:23], s[0:3], 0x70
34
- ; GCN-NEXT: s_buffer_load_dwordx4 s[24:27], s[0:3], 0x10
35
- ; GCN-NEXT: v_fma_f32 v1, v1, v5, s28
36
- ; GCN-NEXT: v_max_f32_e64 v6, s0, s0 clamp
37
- ; GCN-NEXT: v_add_f32_e64 v5, s29, -1.0
38
- ; GCN-NEXT: v_sub_f32_e32 v8, s0, v1
39
- ; GCN-NEXT: v_fma_f32 v7, -s2, v6, s6
40
- ; GCN-NEXT: v_fma_f32 v5, v6, v5, 1.0
41
- ; GCN-NEXT: v_mad_f32 v10, s2, v6, v2
42
- ; GCN-NEXT: s_mov_b32 s0, 0x3c23d70a
43
- ; GCN-NEXT: v_fmac_f32_e32 v1, v6, v8
44
- ; GCN-NEXT: v_fmac_f32_e32 v10, v7, v6
45
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
46
- ; GCN-NEXT: v_mul_f32_e32 v9, s10, v0
47
- ; GCN-NEXT: v_fma_f32 v0, -v0, s10, s14
48
- ; GCN-NEXT: v_mul_f32_e32 v8, s18, v2
49
- ; GCN-NEXT: v_mul_f32_e32 v3, s22, v3
50
- ; GCN-NEXT: v_fmac_f32_e32 v9, v0, v6
51
- ; GCN-NEXT: v_sub_f32_e32 v0, v1, v5
52
- ; GCN-NEXT: v_mul_f32_e32 v1, v8, v6
53
- ; GCN-NEXT: v_mul_f32_e32 v7, v6, v3
54
- ; GCN-NEXT: v_fma_f32 v3, -v6, v3, v9
55
- ; GCN-NEXT: v_fmac_f32_e32 v5, v0, v6
56
- ; GCN-NEXT: v_fma_f32 v0, v2, s26, -v1
57
- ; GCN-NEXT: v_fmac_f32_e32 v7, v3, v6
58
- ; GCN-NEXT: v_fmac_f32_e32 v1, v0, v6
59
- ; GCN-NEXT: v_mul_f32_e32 v0, v2, v6
60
- ; GCN-NEXT: s_waitcnt vmcnt(0)
61
- ; GCN-NEXT: v_add_f32_e32 v4, v4, v10
62
- ; GCN-NEXT: v_mul_f32_e32 v3, v4, v6
63
- ; GCN-NEXT: v_fmaak_f32 v4, s0, v5, 0x3ca3d70a
64
- ; GCN-NEXT: v_mul_f32_e32 v1, v3, v1
65
- ; GCN-NEXT: v_mul_f32_e32 v2, v7, v4
66
- ; GCN-NEXT: v_fmac_f32_e32 v1, v2, v0
67
- ; GCN-NEXT: v_max_f32_e32 v0, 0, v1
68
- ; GCN-NEXT: ; return to shader part epilog
6
+ ; GFX10-LABEL: _amdgpu_ps_main:
7
+ ; GFX10: ; %bb.0: ; %.entry
8
+ ; GFX10-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D
9
+ ; GFX10-NEXT: v_mov_b32_e32 v4, 0
10
+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
11
+ ; GFX10-NEXT: s_clause 0x1
12
+ ; GFX10-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
13
+ ; GFX10-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
14
+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
15
+ ; GFX10-NEXT: image_load_mip v4, v[2:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm
16
+ ; GFX10-NEXT: s_clause 0x3
17
+ ; GFX10-NEXT: s_buffer_load_dword s24, s[0:3], 0x5c
18
+ ; GFX10-NEXT: s_buffer_load_dword s28, s[0:3], 0x7c
19
+ ; GFX10-NEXT: s_buffer_load_dword s29, s[0:3], 0xc0
20
+ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
21
+ ; GFX10-NEXT: s_nop 0
22
+ ; GFX10-NEXT: s_buffer_load_dwordx4 s[0:3], s[0:3], 0x40
23
+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
24
+ ; GFX10-NEXT: s_clause 0x1
25
+ ; GFX10-NEXT: s_buffer_load_dwordx4 s[4:7], s[0:3], 0x50
26
+ ; GFX10-NEXT: s_nop 0
27
+ ; GFX10-NEXT: s_buffer_load_dword s0, s[0:3], 0x2c
28
+ ; GFX10-NEXT: v_sub_f32_e64 v5, s24, s28
29
+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
30
+ ; GFX10-NEXT: s_clause 0x4
31
+ ; GFX10-NEXT: s_buffer_load_dwordx4 s[8:11], s[0:3], 0x60
32
+ ; GFX10-NEXT: s_buffer_load_dwordx4 s[12:15], s[0:3], 0x20
33
+ ; GFX10-NEXT: s_buffer_load_dwordx4 s[16:19], s[0:3], 0x0
34
+ ; GFX10-NEXT: s_buffer_load_dwordx4 s[20:23], s[0:3], 0x70
35
+ ; GFX10-NEXT: s_buffer_load_dwordx4 s[24:27], s[0:3], 0x10
36
+ ; GFX10-NEXT: v_fma_f32 v1, v1, v5, s28
37
+ ; GFX10-NEXT: v_max_f32_e64 v6, s0, s0 clamp
38
+ ; GFX10-NEXT: v_add_f32_e64 v5, s29, -1.0
39
+ ; GFX10-NEXT: v_sub_f32_e32 v8, s0, v1
40
+ ; GFX10-NEXT: v_fma_f32 v7, -s2, v6, s6
41
+ ; GFX10-NEXT: v_fma_f32 v5, v6, v5, 1.0
42
+ ; GFX10-NEXT: v_mad_f32 v10, s2, v6, v2
43
+ ; GFX10-NEXT: s_mov_b32 s0, 0x3c23d70a
44
+ ; GFX10-NEXT: v_fmac_f32_e32 v1, v6, v8
45
+ ; GFX10-NEXT: v_fmac_f32_e32 v10, v7, v6
46
+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
47
+ ; GFX10-NEXT: v_mul_f32_e32 v9, s10, v0
48
+ ; GFX10-NEXT: v_fma_f32 v0, -v0, s10, s14
49
+ ; GFX10-NEXT: v_mul_f32_e32 v8, s18, v2
50
+ ; GFX10-NEXT: v_mul_f32_e32 v3, s22, v3
51
+ ; GFX10-NEXT: v_fmac_f32_e32 v9, v0, v6
52
+ ; GFX10-NEXT: v_sub_f32_e32 v0, v1, v5
53
+ ; GFX10-NEXT: v_mul_f32_e32 v1, v8, v6
54
+ ; GFX10-NEXT: v_mul_f32_e32 v7, v6, v3
55
+ ; GFX10-NEXT: v_fma_f32 v3, -v6, v3, v9
56
+ ; GFX10-NEXT: v_fmac_f32_e32 v5, v0, v6
57
+ ; GFX10-NEXT: v_fma_f32 v0, v2, s26, -v1
58
+ ; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v6
59
+ ; GFX10-NEXT: v_fmac_f32_e32 v1, v0, v6
60
+ ; GFX10-NEXT: v_mul_f32_e32 v0, v2, v6
61
+ ; GFX10-NEXT: s_waitcnt vmcnt(0)
62
+ ; GFX10-NEXT: v_add_f32_e32 v4, v4, v10
63
+ ; GFX10-NEXT: v_mul_f32_e32 v3, v4, v6
64
+ ; GFX10-NEXT: v_fmaak_f32 v4, s0, v5, 0x3ca3d70a
65
+ ; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1
66
+ ; GFX10-NEXT: v_mul_f32_e32 v2, v7, v4
67
+ ; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v0
68
+ ; GFX10-NEXT: v_max_f32_e32 v0, 0, v1
69
+ ; GFX10-NEXT: ; return to shader part epilog
70
+ ;
71
+ ; GFX11-LABEL: _amdgpu_ps_main:
72
+ ; GFX11: ; %bb.0: ; %.entry
73
+ ; GFX11-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D
74
+ ; GFX11-NEXT: v_mov_b32_e32 v4, 0
75
+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
76
+ ; GFX11-NEXT: s_clause 0x1
77
+ ; GFX11-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
78
+ ; GFX11-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
79
+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
80
+ ; GFX11-NEXT: image_load_mip v4, v[2:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm
81
+ ; GFX11-NEXT: s_clause 0x3
82
+ ; GFX11-NEXT: s_buffer_load_b32 s24, s[0:3], 0x5c
83
+ ; GFX11-NEXT: s_buffer_load_b32 s28, s[0:3], 0x7c
84
+ ; GFX11-NEXT: s_buffer_load_b32 s29, s[0:3], 0xc0
85
+ ; GFX11-NEXT: s_buffer_load_b128 s[0:3], s[0:3], 0x40
86
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
87
+ ; GFX11-NEXT: s_clause 0x1
88
+ ; GFX11-NEXT: s_buffer_load_b128 s[4:7], s[0:3], 0x50
89
+ ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], 0x2c
90
+ ; GFX11-NEXT: v_sub_f32_e64 v5, s24, s28
91
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
92
+ ; GFX11-NEXT: s_clause 0x3
93
+ ; GFX11-NEXT: s_buffer_load_b128 s[8:11], s[0:3], 0x60
94
+ ; GFX11-NEXT: s_buffer_load_b128 s[12:15], s[0:3], 0x20
95
+ ; GFX11-NEXT: s_buffer_load_b128 s[16:19], s[0:3], 0x0
96
+ ; GFX11-NEXT: s_buffer_load_b128 s[20:23], s[0:3], 0x70
97
+ ; GFX11-NEXT: v_fma_f32 v1, v1, v5, s28
98
+ ; GFX11-NEXT: v_max_f32_e64 v6, s0, s0 clamp
99
+ ; GFX11-NEXT: s_buffer_load_b128 s[24:27], s[0:3], 0x10
100
+ ; GFX11-NEXT: v_add_f32_e64 v5, s29, -1.0
101
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
102
+ ; GFX11-NEXT: v_sub_f32_e32 v8, s0, v1
103
+ ; GFX11-NEXT: v_fma_f32 v7, -s2, v6, s6
104
+ ; GFX11-NEXT: v_fma_f32 v10, s2, v6, v2
105
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
106
+ ; GFX11-NEXT: v_fma_f32 v5, v6, v5, 1.0
107
+ ; GFX11-NEXT: s_mov_b32 s0, 0x3c23d70a
108
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
109
+ ; GFX11-NEXT: v_mul_f32_e32 v9, s10, v0
110
+ ; GFX11-NEXT: v_fma_f32 v0, -v0, s10, s14
111
+ ; GFX11-NEXT: v_mul_f32_e32 v3, s22, v3
112
+ ; GFX11-NEXT: v_dual_fmac_f32 v1, v6, v8 :: v_dual_mul_f32 v8, s18, v2
113
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
114
+ ; GFX11-NEXT: v_fmac_f32_e32 v9, v0, v6
115
+ ; GFX11-NEXT: v_dual_fmac_f32 v10, v7, v6 :: v_dual_mul_f32 v7, v6, v3
116
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
117
+ ; GFX11-NEXT: v_sub_f32_e32 v0, v1, v5
118
+ ; GFX11-NEXT: v_fma_f32 v3, -v6, v3, v9
119
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
120
+ ; GFX11-NEXT: v_fmac_f32_e32 v7, v3, v6
121
+ ; GFX11-NEXT: v_fmac_f32_e32 v5, v0, v6
122
+ ; GFX11-NEXT: v_mul_f32_e32 v1, v8, v6
123
+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
124
+ ; GFX11-NEXT: v_add_f32_e32 v4, v4, v10
125
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
126
+ ; GFX11-NEXT: v_dual_mul_f32 v3, v4, v6 :: v_dual_fmaak_f32 v4, s0, v5, 0x3ca3d70a
127
+ ; GFX11-NEXT: v_fma_f32 v0, v2, s26, -v1
128
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
129
+ ; GFX11-NEXT: v_fmac_f32_e32 v1, v0, v6
130
+ ; GFX11-NEXT: v_mul_f32_e32 v0, v2, v6
131
+ ; GFX11-NEXT: v_mul_f32_e32 v2, v7, v4
132
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
133
+ ; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1
134
+ ; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v0
135
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
136
+ ; GFX11-NEXT: v_max_f32_e32 v0, 0, v1
137
+ ; GFX11-NEXT: ; return to shader part epilog
69
138
.entry:
70
139
%0 = call <3 x float > @llvm.amdgcn.image.sample.2d.v3f32.f32 (i32 7 , float undef , float undef , <8 x i32 > undef , <4 x i32 > undef , i1 false , i32 0 , i32 0 )
71
140
%.i2243 = extractelement <3 x float > %0 , i32 2
@@ -168,13 +237,22 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
168
237
}
169
238
170
239
define float @fmac_sequence_simple (float %a , float %b , float %c , float %d , float %e ) #0 {
171
- ; GCN-LABEL: fmac_sequence_simple:
172
- ; GCN: ; %bb.0:
173
- ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
174
- ; GCN-NEXT: v_fma_f32 v2, v2, v3, v4
175
- ; GCN-NEXT: v_fmac_f32_e32 v2, v0, v1
176
- ; GCN-NEXT: v_mov_b32_e32 v0, v2
177
- ; GCN-NEXT: s_setpc_b64 s[30:31]
240
+ ; GFX10-LABEL: fmac_sequence_simple:
241
+ ; GFX10: ; %bb.0:
242
+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
243
+ ; GFX10-NEXT: v_fma_f32 v2, v2, v3, v4
244
+ ; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
245
+ ; GFX10-NEXT: v_mov_b32_e32 v0, v2
246
+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
247
+ ;
248
+ ; GFX11-LABEL: fmac_sequence_simple:
249
+ ; GFX11: ; %bb.0:
250
+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251
+ ; GFX11-NEXT: v_fma_f32 v2, v2, v3, v4
252
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
253
+ ; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1
254
+ ; GFX11-NEXT: v_mov_b32_e32 v0, v2
255
+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
178
256
%t0 = fmul fast float %a , %b
179
257
%t1 = fmul fast float %c , %d
180
258
%t2 = fadd fast float %t0 , %t1
@@ -183,14 +261,25 @@ define float @fmac_sequence_simple(float %a, float %b, float %c, float %d, float
183
261
}
184
262
185
263
define float @fmac_sequence_innermost_fmul (float %a , float %b , float %c , float %d , float %e , float %f , float %g ) #0 {
186
- ; GCN-LABEL: fmac_sequence_innermost_fmul:
187
- ; GCN: ; %bb.0:
188
- ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189
- ; GCN-NEXT: v_mad_f32 v2, v2, v3, v6
190
- ; GCN-NEXT: v_fmac_f32_e32 v2, v0, v1
191
- ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v5
192
- ; GCN-NEXT: v_mov_b32_e32 v0, v2
193
- ; GCN-NEXT: s_setpc_b64 s[30:31]
264
+ ; GFX10-LABEL: fmac_sequence_innermost_fmul:
265
+ ; GFX10: ; %bb.0:
266
+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
267
+ ; GFX10-NEXT: v_mad_f32 v2, v2, v3, v6
268
+ ; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
269
+ ; GFX10-NEXT: v_fmac_f32_e32 v2, v4, v5
270
+ ; GFX10-NEXT: v_mov_b32_e32 v0, v2
271
+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
272
+ ;
273
+ ; GFX11-LABEL: fmac_sequence_innermost_fmul:
274
+ ; GFX11: ; %bb.0:
275
+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276
+ ; GFX11-NEXT: v_fma_f32 v2, v2, v3, v6
277
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
278
+ ; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1
279
+ ; GFX11-NEXT: v_fmac_f32_e32 v2, v4, v5
280
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
281
+ ; GFX11-NEXT: v_mov_b32_e32 v0, v2
282
+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
194
283
%t0 = fmul fast float %a , %b
195
284
%t1 = fmul fast float %c , %d
196
285
%t2 = fadd fast float %t0 , %t1
@@ -201,14 +290,25 @@ define float @fmac_sequence_innermost_fmul(float %a, float %b, float %c, float %
201
290
}
202
291
203
292
define float @fmac_sequence_innermost_fmul_swapped_operands (float %a , float %b , float %c , float %d , float %e , float %f , float %g ) #0 {
204
- ; GCN-LABEL: fmac_sequence_innermost_fmul_swapped_operands:
205
- ; GCN: ; %bb.0:
206
- ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207
- ; GCN-NEXT: v_mad_f32 v2, v2, v3, v6
208
- ; GCN-NEXT: v_fmac_f32_e32 v2, v0, v1
209
- ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v5
210
- ; GCN-NEXT: v_mov_b32_e32 v0, v2
211
- ; GCN-NEXT: s_setpc_b64 s[30:31]
293
+ ; GFX10-LABEL: fmac_sequence_innermost_fmul_swapped_operands:
294
+ ; GFX10: ; %bb.0:
295
+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
296
+ ; GFX10-NEXT: v_mad_f32 v2, v2, v3, v6
297
+ ; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
298
+ ; GFX10-NEXT: v_fmac_f32_e32 v2, v4, v5
299
+ ; GFX10-NEXT: v_mov_b32_e32 v0, v2
300
+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
301
+ ;
302
+ ; GFX11-LABEL: fmac_sequence_innermost_fmul_swapped_operands:
303
+ ; GFX11: ; %bb.0:
304
+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305
+ ; GFX11-NEXT: v_fma_f32 v2, v2, v3, v6
306
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
307
+ ; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1
308
+ ; GFX11-NEXT: v_fmac_f32_e32 v2, v4, v5
309
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
310
+ ; GFX11-NEXT: v_mov_b32_e32 v0, v2
311
+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
212
312
%t0 = fmul fast float %a , %b
213
313
%t1 = fmul fast float %c , %d
214
314
%t2 = fadd fast float %t0 , %t1
@@ -219,12 +319,20 @@ define float @fmac_sequence_innermost_fmul_swapped_operands(float %a, float %b,
219
319
}
220
320
221
321
define amdgpu_ps float @fmac_sequence_innermost_fmul_sgpr (float inreg %a , float inreg %b , float inreg %c , float inreg %d , float inreg %e , float inreg %f , float %g ) #0 {
222
- ; GCN-LABEL: fmac_sequence_innermost_fmul_sgpr:
223
- ; GCN: ; %bb.0:
224
- ; GCN-NEXT: v_mac_f32_e64 v0, s2, s3
225
- ; GCN-NEXT: v_fmac_f32_e64 v0, s0, s1
226
- ; GCN-NEXT: v_fmac_f32_e64 v0, s4, s5
227
- ; GCN-NEXT: ; return to shader part epilog
322
+ ; GFX10-LABEL: fmac_sequence_innermost_fmul_sgpr:
323
+ ; GFX10: ; %bb.0:
324
+ ; GFX10-NEXT: v_mac_f32_e64 v0, s2, s3
325
+ ; GFX10-NEXT: v_fmac_f32_e64 v0, s0, s1
326
+ ; GFX10-NEXT: v_fmac_f32_e64 v0, s4, s5
327
+ ; GFX10-NEXT: ; return to shader part epilog
328
+ ;
329
+ ; GFX11-LABEL: fmac_sequence_innermost_fmul_sgpr:
330
+ ; GFX11: ; %bb.0:
331
+ ; GFX11-NEXT: v_fmac_f32_e64 v0, s2, s3
332
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
333
+ ; GFX11-NEXT: v_fmac_f32_e64 v0, s0, s1
334
+ ; GFX11-NEXT: v_fmac_f32_e64 v0, s4, s5
335
+ ; GFX11-NEXT: ; return to shader part epilog
228
336
%t0 = fmul fast float %a , %b
229
337
%t1 = fmul fast float %c , %d
230
338
%t2 = fadd fast float %t0 , %t1
@@ -235,14 +343,25 @@ define amdgpu_ps float @fmac_sequence_innermost_fmul_sgpr(float inreg %a, float
235
343
}
236
344
237
345
define amdgpu_ps float @fmac_sequence_innermost_fmul_multiple_use (float inreg %a , float inreg %b , float inreg %c , float inreg %d , float inreg %e , float inreg %f , float %g ) #0 {
238
- ; GCN-LABEL: fmac_sequence_innermost_fmul_multiple_use:
239
- ; GCN: ; %bb.0:
240
- ; GCN-NEXT: v_mul_f32_e64 v1, s2, s3
241
- ; GCN-NEXT: v_fmac_f32_e64 v1, s0, s1
242
- ; GCN-NEXT: v_fma_f32 v2, s5, s4, v1
243
- ; GCN-NEXT: v_fmac_f32_e32 v1, s5, v2
244
- ; GCN-NEXT: v_add_f32_e32 v0, v1, v0
245
- ; GCN-NEXT: ; return to shader part epilog
346
+ ; GFX10-LABEL: fmac_sequence_innermost_fmul_multiple_use:
347
+ ; GFX10: ; %bb.0:
348
+ ; GFX10-NEXT: v_mul_f32_e64 v1, s2, s3
349
+ ; GFX10-NEXT: v_fmac_f32_e64 v1, s0, s1
350
+ ; GFX10-NEXT: v_fma_f32 v2, s5, s4, v1
351
+ ; GFX10-NEXT: v_fmac_f32_e32 v1, s5, v2
352
+ ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
353
+ ; GFX10-NEXT: ; return to shader part epilog
354
+ ;
355
+ ; GFX11-LABEL: fmac_sequence_innermost_fmul_multiple_use:
356
+ ; GFX11: ; %bb.0:
357
+ ; GFX11-NEXT: v_mul_f32_e64 v1, s2, s3
358
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
359
+ ; GFX11-NEXT: v_fmac_f32_e64 v1, s0, s1
360
+ ; GFX11-NEXT: v_fma_f32 v2, s5, s4, v1
361
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
362
+ ; GFX11-NEXT: v_fmac_f32_e32 v1, s5, v2
363
+ ; GFX11-NEXT: v_add_f32_e32 v0, v1, v0
364
+ ; GFX11-NEXT: ; return to shader part epilog
246
365
%t0 = fmul fast float %a , %b
247
366
%t1 = fmul fast float %c , %d
248
367
%t2 = fadd fast float %t0 , %t1
@@ -258,12 +377,20 @@ define amdgpu_ps float @fmac_sequence_innermost_fmul_multiple_use(float inreg %a
258
377
; selecting it as a multiply. In some cases the multiply is better because
259
378
; SIFoldOperands can fold it into a previous instruction as an output modifier.
260
379
define amdgpu_ps float @fma_vs_output_modifier (float %x , i32 %n ) #0 {
261
- ; GCN-LABEL: fma_vs_output_modifier:
262
- ; GCN: ; %bb.0:
263
- ; GCN-NEXT: v_cvt_f32_i32_e64 v1, v1 mul:2
264
- ; GCN-NEXT: v_mul_f32_e32 v0, v0, v0
265
- ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
266
- ; GCN-NEXT: ; return to shader part epilog
380
+ ; GFX10-LABEL: fma_vs_output_modifier:
381
+ ; GFX10: ; %bb.0:
382
+ ; GFX10-NEXT: v_cvt_f32_i32_e64 v1, v1 mul:2
383
+ ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v0
384
+ ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
385
+ ; GFX10-NEXT: ; return to shader part epilog
386
+ ;
387
+ ; GFX11-LABEL: fma_vs_output_modifier:
388
+ ; GFX11: ; %bb.0:
389
+ ; GFX11-NEXT: v_cvt_f32_i32_e64 v1, v1 mul:2
390
+ ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0
391
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
392
+ ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
393
+ ; GFX11-NEXT: ; return to shader part epilog
267
394
%s = sitofp i32 %n to float
268
395
%m = fmul contract float %x , %x
269
396
%a = fmul contract float %m , 2 .0
0 commit comments