@@ -2024,4 +2024,217 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
2024
2024
ret <16 x i32 > %result
2025
2025
}
2026
2026
2027
+ ; --------------------------------------------------------------------
2028
+ ; llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8
2029
+ ; --------------------------------------------------------------------
2030
+
2031
; Intrinsic under test. As exercised by the tests in this section, the operands
; appear in the checked v_smfmac_f32_16x16x128_bf8_bf8 instruction as follows:
;   <4 x i32>   -> first source register tuple (4 registers)
;   <8 x i32>   -> second source register tuple (8 registers)
;   <4 x float> -> accumulator; it occupies the same registers as the result
;   i32         -> final register operand
;   i32 immarg  -> printed as the cbsz modifier (omitted when 0)
;   i32 immarg  -> printed as the abid modifier (omitted when 0)
+ declare <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 (<4 x i32 >, <8 x i32 >, <4 x float >, i32 , i32 immarg, i32 immarg)
2032
+
2033
; Kernel variant. The accumulator is loaded from %arg at the element selected
; by the workitem id; %a, %b and %idx come in as kernel arguments. The intrinsic
; is called with immediates 1 and 2, which the checks expect to be printed as
; cbsz:1 abid:2. The result is stored to the base of %arg (not to %gep).
; Both check groups expect the kernel arguments to be copied from SGPRs into
; VGPRs before the v_smfmac instruction, which accumulates in place in v[8:11].
+ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr (ptr addrspace (1 ) %arg , <4 x i32 > %a , <8 x i32 > %b , i32 %idx ) #0 {
2034
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
2035
+ ; SDAG: ; %bb.0: ; %bb
2036
+ ; SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2037
+ ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2038
+ ; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2039
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2040
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2041
+ ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2042
+ ; SDAG-NEXT: s_load_dword s16, s[0:1], 0x64
2043
+ ; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2044
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s4
2045
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s5
2046
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s6
2047
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s7
2048
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s8
2049
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s9
2050
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s10
2051
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s11
2052
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2053
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s12
2054
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s13
2055
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s14
2056
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s15
2057
+ ; SDAG-NEXT: v_mov_b32_e32 v17, s16
2058
+ ; SDAG-NEXT: s_waitcnt vmcnt(0)
2059
+ ; SDAG-NEXT: s_nop 0
2060
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2061
+ ; SDAG-NEXT: s_nop 6
2062
+ ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3]
2063
+ ; SDAG-NEXT: s_endpgm
2064
+ ;
2065
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
2066
+ ; GISEL: ; %bb.0: ; %bb
2067
+ ; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2068
+ ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2069
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2070
+ ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
2071
+ ; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
2072
+ ; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
2073
+ ; GISEL-NEXT: s_load_dword s16, s[0:1], 0x64
2074
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2075
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[6:7]
2076
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[4:5]
2077
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
2078
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
2079
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
2080
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
2081
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
2082
+ ; GISEL-NEXT: s_waitcnt vmcnt(0)
2083
+ ; GISEL-NEXT: s_nop 0
2084
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2085
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2086
+ ; GISEL-NEXT: s_nop 5
2087
+ ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[2:3]
2088
+ ; GISEL-NEXT: s_endpgm
2089
+ bb:
2090
+ %id = call i32 @llvm.amdgcn.workitem.id.x ()
2091
+ %gep = getelementptr <4 x float >, ptr addrspace (1 ) %arg , i32 %id
2092
+ %in.1 = load <4 x float >, ptr addrspace (1 ) %gep
2093
+ %mai.1 = tail call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 (<4 x i32 > %a , <8 x i32 > %b , <4 x float > %in.1 , i32 %idx , i32 1 , i32 2 )
2094
+ store <4 x float > %mai.1 , ptr addrspace (1 ) %arg
2095
+ ret void
2096
+ }
2097
+
2098
; Baseline function variant: all operands arrive as ordinary (VGPR) arguments
; and both immediates are 0, so the checks expect no cbsz/abid modifiers on the
; instruction. The SDAG checks expect the accumulator (v12-v15) to be copied
; into a[0:3] before the instruction and read back into v0-v3 afterwards; the
; GISEL checks expect it to accumulate in v[12:15] and then be moved to v0-v3.
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2099
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
2100
+ ; SDAG: ; %bb.0:
2101
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2102
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2103
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2104
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2105
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2106
+ ; SDAG-NEXT: s_nop 1
2107
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16
2108
+ ; SDAG-NEXT: s_nop 6
2109
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2110
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2111
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2112
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2113
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2114
+ ;
2115
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
2116
+ ; GISEL: ; %bb.0:
2117
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2118
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
2119
+ ; GISEL-NEXT: s_nop 6
2120
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2121
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2122
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2123
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2124
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2125
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2126
+ ret <4 x float > %result
2127
+ }
2128
+
2129
; Same shape as the baseline function variant, but the intrinsic is called with
; immediates 1 and 3. Both check groups expect them to be printed on the
; instruction as cbsz:1 abid:3; the surrounding register copies are unchanged.
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_bf8__flags0 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2130
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
2131
+ ; SDAG: ; %bb.0:
2132
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2133
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2134
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2135
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2136
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2137
+ ; SDAG-NEXT: s_nop 1
2138
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2139
+ ; SDAG-NEXT: s_nop 6
2140
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2141
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2142
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2143
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2144
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2145
+ ;
2146
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
2147
+ ; GISEL: ; %bb.0:
2148
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2149
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2150
+ ; GISEL-NEXT: s_nop 6
2151
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2152
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2153
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2154
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2155
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2156
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 1 , i32 immarg 3 )
2157
+ ret <4 x float > %result
2158
+ }
2159
+
2160
; Companion to the __flags0 test with the two immediates swapped: the intrinsic
; is called with 3 and 1, and both check groups expect cbsz:3 abid:1 on the
; instruction. Everything else matches the baseline function variant.
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_bf8__flags1 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2161
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
2162
+ ; SDAG: ; %bb.0:
2163
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2164
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2165
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2166
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2167
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2168
+ ; SDAG-NEXT: s_nop 1
2169
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2170
+ ; SDAG-NEXT: s_nop 6
2171
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2172
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2173
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2174
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2175
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2176
+ ;
2177
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
2178
+ ; GISEL: ; %bb.0:
2179
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2180
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2181
+ ; GISEL-NEXT: s_nop 6
2182
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2183
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2184
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2185
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2186
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2187
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 3 , i32 immarg 1 )
2188
+ ret <4 x float > %result
2189
+ }
2190
+
2191
; SGPR-argument variant: every parameter is marked inreg and the immediates are
; 0. Both check groups expect all operands to be moved out of s0-s16 before the
; instruction. In the SDAG checks the accumulator goes from s12-s15 into a[0:3]
; and is read back into v0-v3 after the instruction; in the GISEL checks it is
; placed directly in v[0:3], so no copy is expected between the instruction and
; the return.
+ define <4 x float > @test_smfmac_f32_16x16x128_bf8_bf8__sgpr (<4 x i32 > inreg %arg0 , <8 x i32 > inreg %arg1 , <4 x float > inreg %arg2 , i32 inreg %arg3 ) {
2192
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
2193
+ ; SDAG: ; %bb.0:
2194
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2195
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2196
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2197
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2198
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2199
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s4
2200
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s5
2201
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s6
2202
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s7
2203
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s8
2204
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s9
2205
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s10
2206
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s11
2207
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s12
2208
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s13
2209
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s14
2210
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s15
2211
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s16
2212
+ ; SDAG-NEXT: s_nop 1
2213
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[8:11], v[0:7], v12
2214
+ ; SDAG-NEXT: s_nop 6
2215
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2216
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2217
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2218
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2219
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2220
+ ;
2221
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
2222
+ ; GISEL: ; %bb.0:
2223
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2224
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2225
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2226
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
2227
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2228
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
2229
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
2230
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
2231
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2232
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s16
2233
+ ; GISEL-NEXT: s_nop 1
2234
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16
2235
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2236
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2237
+ ret <4 x float > %result
2238
+ }
2239
+
2027
2240
attributes #0 = { "amdgpu-flat-work-group-size" ="1,256" }
0 commit comments