1
+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
1
2
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
2
3
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
4
+ target triple = "nvptx64-nvidia-cuda"
3
5
4
6
5
- ; CHECK-LABEL: test_v2i8
6
- ; CHECK-DAG: ld.param.u16 [[A:%rs[0-9]+]], [test_v2i8_param_0];
7
- ; CHECK-DAG: cvt.s16.s8 [[E0:%rs[0-9]+]], [[A]];
8
- ; CHECK-DAG: shr.s16 [[E1:%rs[0-9]+]], [[A]], 8;
9
7
define i16 @test_v2i8 (i16 %a ) {
8
+ ; CHECK-LABEL: test_v2i8(
9
+ ; CHECK: {
10
+ ; CHECK-NEXT: .reg .b16 %rs<5>;
11
+ ; CHECK-NEXT: .reg .b32 %r<2>;
12
+ ; CHECK-EMPTY:
13
+ ; CHECK-NEXT: // %bb.0:
14
+ ; CHECK-NEXT: ld.param.u16 %rs1, [test_v2i8_param_0];
15
+ ; CHECK-NEXT: cvt.s16.s8 %rs2, %rs1;
16
+ ; CHECK-NEXT: shr.s16 %rs3, %rs1, 8;
17
+ ; CHECK-NEXT: add.s16 %rs4, %rs2, %rs3;
18
+ ; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
19
+ ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
20
+ ; CHECK-NEXT: ret;
10
21
%v = bitcast i16 %a to <2 x i8 >
11
22
%r0 = extractelement <2 x i8 > %v , i64 0
12
23
%r1 = extractelement <2 x i8 > %v , i64 1
@@ -16,17 +27,53 @@ define i16 @test_v2i8(i16 %a) {
16
27
ret i16 %r01
17
28
}
18
29
19
- ; CHECK-LABEL: test_v4i8
20
- ; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_param_0];
21
- ; CHECK-DAG: bfe.s32 [[R0:%r[0-9]+]], [[R]], 0, 8;
22
- ; CHECK-DAG: cvt.s8.s32 [[E0:%rs[0-9]+]], [[R0]];
23
- ; CHECK-DAG: bfe.s32 [[R1:%r[0-9]+]], [[R]], 8, 8;
24
- ; CHECK-DAG: cvt.s8.s32 [[E1:%rs[0-9]+]], [[R1]];
25
- ; CHECK-DAG: bfe.s32 [[R2:%r[0-9]+]], [[R]], 16, 8;
26
- ; CHECK-DAG: cvt.s8.s32 [[E2:%rs[0-9]+]], [[R2]];
27
- ; CHECK-DAG: bfe.s32 [[R3:%r[0-9]+]], [[R]], 24, 8;
28
- ; CHECK-DAG: cvt.s8.s32 [[E3:%rs[0-9]+]], [[R3]];
30
+ define i1 @test_v2i8_load (ptr %a ) {
31
+ ; CHECK-LABEL: test_v2i8_load(
32
+ ; CHECK: {
33
+ ; CHECK-NEXT: .reg .pred %p<2>;
34
+ ; CHECK-NEXT: .reg .b16 %rs<7>;
35
+ ; CHECK-NEXT: .reg .b32 %r<2>;
36
+ ; CHECK-NEXT: .reg .b64 %rd<2>;
37
+ ; CHECK-EMPTY:
38
+ ; CHECK-NEXT: // %bb.0:
39
+ ; CHECK-NEXT: ld.param.u64 %rd1, [test_v2i8_load_param_0];
40
+ ; CHECK-NEXT: ld.v2.u8 {%rs1, %rs2}, [%rd1];
41
+ ; CHECK-NEXT: or.b16 %rs5, %rs1, %rs2;
42
+ ; CHECK-NEXT: and.b16 %rs6, %rs5, 255;
43
+ ; CHECK-NEXT: setp.eq.s16 %p1, %rs6, 0;
44
+ ; CHECK-NEXT: selp.u32 %r1, 1, 0, %p1;
45
+ ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
46
+ ; CHECK-NEXT: ret;
47
+ %v = load <2 x i8 >, ptr %a , align 4
48
+ %r0 = extractelement <2 x i8 > %v , i64 0
49
+ %r1 = extractelement <2 x i8 > %v , i64 1
50
+ %icmp = icmp eq i8 %r0 , 0
51
+ %icmp3 = icmp eq i8 %r1 , 0
52
+ %select = select i1 %icmp , i1 %icmp3 , i1 false
53
+ ret i1 %select
54
+ }
29
55
define i16 @test_v4i8 (i32 %a ) {
56
+ ; CHECK-LABEL: test_v4i8(
57
+ ; CHECK: {
58
+ ; CHECK-NEXT: .reg .b16 %rs<8>;
59
+ ; CHECK-NEXT: .reg .b32 %r<7>;
60
+ ; CHECK-EMPTY:
61
+ ; CHECK-NEXT: // %bb.0:
62
+ ; CHECK-NEXT: ld.param.u32 %r1, [test_v4i8_param_0];
63
+ ; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8;
64
+ ; CHECK-NEXT: cvt.s8.s32 %rs1, %r2;
65
+ ; CHECK-NEXT: bfe.s32 %r3, %r1, 8, 8;
66
+ ; CHECK-NEXT: cvt.s8.s32 %rs2, %r3;
67
+ ; CHECK-NEXT: bfe.s32 %r4, %r1, 16, 8;
68
+ ; CHECK-NEXT: cvt.s8.s32 %rs3, %r4;
69
+ ; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8;
70
+ ; CHECK-NEXT: cvt.s8.s32 %rs4, %r5;
71
+ ; CHECK-NEXT: add.s16 %rs5, %rs1, %rs2;
72
+ ; CHECK-NEXT: add.s16 %rs6, %rs3, %rs4;
73
+ ; CHECK-NEXT: add.s16 %rs7, %rs5, %rs6;
74
+ ; CHECK-NEXT: cvt.u32.u16 %r6, %rs7;
75
+ ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r6;
76
+ ; CHECK-NEXT: ret;
30
77
%v = bitcast i32 %a to <4 x i8 >
31
78
%r0 = extractelement <4 x i8 > %v , i64 0
32
79
%r1 = extractelement <4 x i8 > %v , i64 1
@@ -42,16 +89,22 @@ define i16 @test_v4i8(i32 %a) {
42
89
ret i16 %r
43
90
}
44
91
45
- ; CHECK-LABEL: test_v4i8_s32
46
- ; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_s32_param_0];
47
- ; CHECK-DAG: bfe.s32 [[R0:%r[0-9]+]], [[R]], 0, 8;
48
- ; CHECK-DAG: bfe.s32 [[R1:%r[0-9]+]], [[R]], 8, 8;
49
- ; CHECK-DAG: bfe.s32 [[R2:%r[0-9]+]], [[R]], 16, 8;
50
- ; CHECK-DAG: bfe.s32 [[R3:%r[0-9]+]], [[R]], 24, 8;
51
- ; CHECK-DAG: add.s32 [[R01:%r[0-9]+]], [[R0]], [[R1]]
52
- ; CHECK-DAG: add.s32 [[R23:%r[0-9]+]], [[R2]], [[R3]]
53
- ; CHECK-DAG: add.s32 [[R0123:%r[0-9]+]], [[R01]], [[R23]]
54
92
define i32 @test_v4i8_s32 (i32 %a ) {
93
+ ; CHECK-LABEL: test_v4i8_s32(
94
+ ; CHECK: {
95
+ ; CHECK-NEXT: .reg .b32 %r<9>;
96
+ ; CHECK-EMPTY:
97
+ ; CHECK-NEXT: // %bb.0:
98
+ ; CHECK-NEXT: ld.param.u32 %r1, [test_v4i8_s32_param_0];
99
+ ; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8;
100
+ ; CHECK-NEXT: bfe.s32 %r3, %r1, 8, 8;
101
+ ; CHECK-NEXT: bfe.s32 %r4, %r1, 16, 8;
102
+ ; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8;
103
+ ; CHECK-NEXT: add.s32 %r6, %r2, %r3;
104
+ ; CHECK-NEXT: add.s32 %r7, %r4, %r5;
105
+ ; CHECK-NEXT: add.s32 %r8, %r6, %r7;
106
+ ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
107
+ ; CHECK-NEXT: ret;
55
108
%v = bitcast i32 %a to <4 x i8 >
56
109
%r0 = extractelement <4 x i8 > %v , i64 0
57
110
%r1 = extractelement <4 x i8 > %v , i64 1
@@ -67,16 +120,22 @@ define i32 @test_v4i8_s32(i32 %a) {
67
120
ret i32 %r
68
121
}
69
122
70
- ; CHECK-LABEL: test_v4i8_u32
71
- ; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_u32_param_0];
72
- ; CHECK-DAG: bfe.u32 [[R0:%r[0-9]+]], [[R]], 0, 8;
73
- ; CHECK-DAG: bfe.u32 [[R1:%r[0-9]+]], [[R]], 8, 8;
74
- ; CHECK-DAG: bfe.u32 [[R2:%r[0-9]+]], [[R]], 16, 8;
75
- ; CHECK-DAG: bfe.u32 [[R3:%r[0-9]+]], [[R]], 24, 8;
76
- ; CHECK-DAG: add.s32 [[R01:%r[0-9]+]], [[R0]], [[R1]]
77
- ; CHECK-DAG: add.s32 [[R23:%r[0-9]+]], [[R2]], [[R3]]
78
- ; CHECK-DAG: add.s32 [[R0123:%r[0-9]+]], [[R01]], [[R23]]
79
123
define i32 @test_v4i8_u32 (i32 %a ) {
124
+ ; CHECK-LABEL: test_v4i8_u32(
125
+ ; CHECK: {
126
+ ; CHECK-NEXT: .reg .b32 %r<9>;
127
+ ; CHECK-EMPTY:
128
+ ; CHECK-NEXT: // %bb.0:
129
+ ; CHECK-NEXT: ld.param.u32 %r1, [test_v4i8_u32_param_0];
130
+ ; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
131
+ ; CHECK-NEXT: bfe.u32 %r3, %r1, 8, 8;
132
+ ; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
133
+ ; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8;
134
+ ; CHECK-NEXT: add.s32 %r6, %r2, %r3;
135
+ ; CHECK-NEXT: add.s32 %r7, %r4, %r5;
136
+ ; CHECK-NEXT: add.s32 %r8, %r6, %r7;
137
+ ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
138
+ ; CHECK-NEXT: ret;
80
139
%v = bitcast i32 %a to <4 x i8 >
81
140
%r0 = extractelement <4 x i8 > %v , i64 0
82
141
%r1 = extractelement <4 x i8 > %v , i64 1
@@ -94,28 +153,43 @@ define i32 @test_v4i8_u32(i32 %a) {
94
153
95
154
96
155
97
- ; CHECK-LABEL: test_v8i8
98
- ; CHECK: ld.param.u64 [[R:%rd[0-9]+]], [test_v8i8_param_0];
99
- ; CHECK-DAG: cvt.u32.u64 [[R00:%r[0-9]+]], [[R]];
100
- ; CHECK-DAG: { .reg .b32 tmp; mov.b64 {tmp, [[R01:%r[0-9]+]]}, [[R]]; }
101
- ; CHECK-DAG: bfe.s32 [[R1:%r[0-9]+]], [[R00]], 0, 8;
102
- ; CHECK-DAG: cvt.s8.s32 [[E1:%rs[0-9]+]], [[R1]];
103
- ; CHECK-DAG: bfe.s32 [[R2:%r[0-9]+]], [[R00]], 8, 8;
104
- ; CHECK-DAG: cvt.s8.s32 [[E2:%rs[0-9]+]], [[R2]];
105
- ; CHECK-DAG: bfe.s32 [[R3:%r[0-9]+]], [[R00]], 16, 8;
106
- ; CHECK-DAG: cvt.s8.s32 [[E3:%rs[0-9]+]], [[R3]];
107
- ; CHECK-DAG: bfe.s32 [[R4:%r[0-9]+]], [[R00]], 24, 8;
108
- ; CHECK-DAG: cvt.s8.s32 [[E4:%rs[0-9]+]], [[R4]];
109
- ; CHECK-DAG: bfe.s32 [[R5:%r[0-9]+]], [[R01]], 0, 8;
110
- ; CHECK-DAG: cvt.s8.s32 [[E5:%rs[0-9]+]], [[R5]];
111
- ; CHECK-DAG: bfe.s32 [[R6:%r[0-9]+]], [[R01]], 8, 8;
112
- ; CHECK-DAG: cvt.s8.s32 [[E6:%rs[0-9]+]], [[R6]];
113
- ; CHECK-DAG: bfe.s32 [[R7:%r[0-9]+]], [[R01]], 16, 8;
114
- ; CHECK-DAG: cvt.s8.s32 [[E7:%rs[0-9]+]], [[R7]];
115
- ; CHECK-DAG: bfe.s32 [[R8:%r[0-9]+]], [[R01]], 24, 8;
116
- ; CHECK-DAG: cvt.s8.s32 [[E8:%rs[0-9]+]], [[R8]];
117
-
118
156
define i16 @test_v8i8 (i64 %a ) {
157
+ ; CHECK-LABEL: test_v8i8(
158
+ ; CHECK: {
159
+ ; CHECK-NEXT: .reg .b16 %rs<16>;
160
+ ; CHECK-NEXT: .reg .b32 %r<14>;
161
+ ; CHECK-NEXT: .reg .b64 %rd<2>;
162
+ ; CHECK-EMPTY:
163
+ ; CHECK-NEXT: // %bb.0:
164
+ ; CHECK-NEXT: ld.param.u64 %rd1, [test_v8i8_param_0];
165
+ ; CHECK-NEXT: cvt.u32.u64 %r1, %rd1;
166
+ ; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r2}, %rd1; }
167
+ ; CHECK-NEXT: bfe.s32 %r5, %r1, 0, 8;
168
+ ; CHECK-NEXT: cvt.s8.s32 %rs1, %r5;
169
+ ; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8;
170
+ ; CHECK-NEXT: cvt.s8.s32 %rs2, %r6;
171
+ ; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
172
+ ; CHECK-NEXT: cvt.s8.s32 %rs3, %r7;
173
+ ; CHECK-NEXT: bfe.s32 %r8, %r1, 24, 8;
174
+ ; CHECK-NEXT: cvt.s8.s32 %rs4, %r8;
175
+ ; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8;
176
+ ; CHECK-NEXT: cvt.s8.s32 %rs5, %r9;
177
+ ; CHECK-NEXT: bfe.s32 %r10, %r2, 8, 8;
178
+ ; CHECK-NEXT: cvt.s8.s32 %rs6, %r10;
179
+ ; CHECK-NEXT: bfe.s32 %r11, %r2, 16, 8;
180
+ ; CHECK-NEXT: cvt.s8.s32 %rs7, %r11;
181
+ ; CHECK-NEXT: bfe.s32 %r12, %r2, 24, 8;
182
+ ; CHECK-NEXT: cvt.s8.s32 %rs8, %r12;
183
+ ; CHECK-NEXT: add.s16 %rs9, %rs1, %rs2;
184
+ ; CHECK-NEXT: add.s16 %rs10, %rs3, %rs4;
185
+ ; CHECK-NEXT: add.s16 %rs11, %rs5, %rs6;
186
+ ; CHECK-NEXT: add.s16 %rs12, %rs7, %rs8;
187
+ ; CHECK-NEXT: add.s16 %rs13, %rs9, %rs10;
188
+ ; CHECK-NEXT: add.s16 %rs14, %rs11, %rs12;
189
+ ; CHECK-NEXT: add.s16 %rs15, %rs13, %rs14;
190
+ ; CHECK-NEXT: cvt.u32.u16 %r13, %rs15;
191
+ ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13;
192
+ ; CHECK-NEXT: ret;
119
193
%v = bitcast i64 %a to <8 x i8 >
120
194
%r0 = extractelement <8 x i8 > %v , i64 0
121
195
%r1 = extractelement <8 x i8 > %v , i64 1
0 commit comments