Skip to content

Commit fe7d284

Browse files
authored
[NVPTX] unbreak extract_elt lowering (llvm#102688)
LLVM has started using the `freeze` instruction, which unintentionally broke the lowering of some vector operations in NVPTX.
1 parent 4afa2de commit fe7d284

File tree

2 files changed

+129
-53
lines changed

2 files changed

+129
-53
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5920,6 +5920,8 @@ static SDValue PerformSETCCCombine(SDNode *N,
59205920
static SDValue PerformEXTRACTCombine(SDNode *N,
59215921
TargetLowering::DAGCombinerInfo &DCI) {
59225922
SDValue Vector = N->getOperand(0);
5923+
if (Vector->getOpcode() == ISD::FREEZE)
5924+
Vector = Vector->getOperand(0);
59235925
SDLoc DL(N);
59245926
EVT VectorVT = Vector.getValueType();
59255927
if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&

llvm/test/CodeGen/NVPTX/extractelement.ll

Lines changed: 127 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,23 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
12
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
23
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
4+
target triple = "nvptx64-nvidia-cuda"
35

46

5-
; CHECK-LABEL: test_v2i8
6-
; CHECK-DAG: ld.param.u16 [[A:%rs[0-9]+]], [test_v2i8_param_0];
7-
; CHECK-DAG: cvt.s16.s8 [[E0:%rs[0-9]+]], [[A]];
8-
; CHECK-DAG: shr.s16 [[E1:%rs[0-9]+]], [[A]], 8;
97
define i16 @test_v2i8(i16 %a) {
8+
; CHECK-LABEL: test_v2i8(
9+
; CHECK: {
10+
; CHECK-NEXT: .reg .b16 %rs<5>;
11+
; CHECK-NEXT: .reg .b32 %r<2>;
12+
; CHECK-EMPTY:
13+
; CHECK-NEXT: // %bb.0:
14+
; CHECK-NEXT: ld.param.u16 %rs1, [test_v2i8_param_0];
15+
; CHECK-NEXT: cvt.s16.s8 %rs2, %rs1;
16+
; CHECK-NEXT: shr.s16 %rs3, %rs1, 8;
17+
; CHECK-NEXT: add.s16 %rs4, %rs2, %rs3;
18+
; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
19+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
20+
; CHECK-NEXT: ret;
1021
%v = bitcast i16 %a to <2 x i8>
1122
%r0 = extractelement <2 x i8> %v, i64 0
1223
%r1 = extractelement <2 x i8> %v, i64 1
@@ -16,17 +27,53 @@ define i16 @test_v2i8(i16 %a) {
1627
ret i16 %r01
1728
}
1829

19-
; CHECK-LABEL: test_v4i8
20-
; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_param_0];
21-
; CHECK-DAG: bfe.s32 [[R0:%r[0-9]+]], [[R]], 0, 8;
22-
; CHECK-DAG: cvt.s8.s32 [[E0:%rs[0-9]+]], [[R0]];
23-
; CHECK-DAG: bfe.s32 [[R1:%r[0-9]+]], [[R]], 8, 8;
24-
; CHECK-DAG: cvt.s8.s32 [[E1:%rs[0-9]+]], [[R1]];
25-
; CHECK-DAG: bfe.s32 [[R2:%r[0-9]+]], [[R]], 16, 8;
26-
; CHECK-DAG: cvt.s8.s32 [[E2:%rs[0-9]+]], [[R2]];
27-
; CHECK-DAG: bfe.s32 [[R3:%r[0-9]+]], [[R]], 24, 8;
28-
; CHECK-DAG: cvt.s8.s32 [[E3:%rs[0-9]+]], [[R3]];
30+
define i1 @test_v2i8_load(ptr %a) {
31+
; CHECK-LABEL: test_v2i8_load(
32+
; CHECK: {
33+
; CHECK-NEXT: .reg .pred %p<2>;
34+
; CHECK-NEXT: .reg .b16 %rs<7>;
35+
; CHECK-NEXT: .reg .b32 %r<2>;
36+
; CHECK-NEXT: .reg .b64 %rd<2>;
37+
; CHECK-EMPTY:
38+
; CHECK-NEXT: // %bb.0:
39+
; CHECK-NEXT: ld.param.u64 %rd1, [test_v2i8_load_param_0];
40+
; CHECK-NEXT: ld.v2.u8 {%rs1, %rs2}, [%rd1];
41+
; CHECK-NEXT: or.b16 %rs5, %rs1, %rs2;
42+
; CHECK-NEXT: and.b16 %rs6, %rs5, 255;
43+
; CHECK-NEXT: setp.eq.s16 %p1, %rs6, 0;
44+
; CHECK-NEXT: selp.u32 %r1, 1, 0, %p1;
45+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
46+
; CHECK-NEXT: ret;
47+
%v = load <2 x i8>, ptr %a, align 4
48+
%r0 = extractelement <2 x i8> %v, i64 0
49+
%r1 = extractelement <2 x i8> %v, i64 1
50+
%icmp = icmp eq i8 %r0, 0
51+
%icmp3 = icmp eq i8 %r1, 0
52+
%select = select i1 %icmp, i1 %icmp3, i1 false
53+
ret i1 %select
54+
}
2955
define i16 @test_v4i8(i32 %a) {
56+
; CHECK-LABEL: test_v4i8(
57+
; CHECK: {
58+
; CHECK-NEXT: .reg .b16 %rs<8>;
59+
; CHECK-NEXT: .reg .b32 %r<7>;
60+
; CHECK-EMPTY:
61+
; CHECK-NEXT: // %bb.0:
62+
; CHECK-NEXT: ld.param.u32 %r1, [test_v4i8_param_0];
63+
; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8;
64+
; CHECK-NEXT: cvt.s8.s32 %rs1, %r2;
65+
; CHECK-NEXT: bfe.s32 %r3, %r1, 8, 8;
66+
; CHECK-NEXT: cvt.s8.s32 %rs2, %r3;
67+
; CHECK-NEXT: bfe.s32 %r4, %r1, 16, 8;
68+
; CHECK-NEXT: cvt.s8.s32 %rs3, %r4;
69+
; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8;
70+
; CHECK-NEXT: cvt.s8.s32 %rs4, %r5;
71+
; CHECK-NEXT: add.s16 %rs5, %rs1, %rs2;
72+
; CHECK-NEXT: add.s16 %rs6, %rs3, %rs4;
73+
; CHECK-NEXT: add.s16 %rs7, %rs5, %rs6;
74+
; CHECK-NEXT: cvt.u32.u16 %r6, %rs7;
75+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r6;
76+
; CHECK-NEXT: ret;
3077
%v = bitcast i32 %a to <4 x i8>
3178
%r0 = extractelement <4 x i8> %v, i64 0
3279
%r1 = extractelement <4 x i8> %v, i64 1
@@ -42,16 +89,22 @@ define i16 @test_v4i8(i32 %a) {
4289
ret i16 %r
4390
}
4491

45-
; CHECK-LABEL: test_v4i8_s32
46-
; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_s32_param_0];
47-
; CHECK-DAG: bfe.s32 [[R0:%r[0-9]+]], [[R]], 0, 8;
48-
; CHECK-DAG: bfe.s32 [[R1:%r[0-9]+]], [[R]], 8, 8;
49-
; CHECK-DAG: bfe.s32 [[R2:%r[0-9]+]], [[R]], 16, 8;
50-
; CHECK-DAG: bfe.s32 [[R3:%r[0-9]+]], [[R]], 24, 8;
51-
; CHECK-DAG: add.s32 [[R01:%r[0-9]+]], [[R0]], [[R1]]
52-
; CHECK-DAG: add.s32 [[R23:%r[0-9]+]], [[R2]], [[R3]]
53-
; CHECK-DAG: add.s32 [[R0123:%r[0-9]+]], [[R01]], [[R23]]
5492
define i32 @test_v4i8_s32(i32 %a) {
93+
; CHECK-LABEL: test_v4i8_s32(
94+
; CHECK: {
95+
; CHECK-NEXT: .reg .b32 %r<9>;
96+
; CHECK-EMPTY:
97+
; CHECK-NEXT: // %bb.0:
98+
; CHECK-NEXT: ld.param.u32 %r1, [test_v4i8_s32_param_0];
99+
; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8;
100+
; CHECK-NEXT: bfe.s32 %r3, %r1, 8, 8;
101+
; CHECK-NEXT: bfe.s32 %r4, %r1, 16, 8;
102+
; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8;
103+
; CHECK-NEXT: add.s32 %r6, %r2, %r3;
104+
; CHECK-NEXT: add.s32 %r7, %r4, %r5;
105+
; CHECK-NEXT: add.s32 %r8, %r6, %r7;
106+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
107+
; CHECK-NEXT: ret;
55108
%v = bitcast i32 %a to <4 x i8>
56109
%r0 = extractelement <4 x i8> %v, i64 0
57110
%r1 = extractelement <4 x i8> %v, i64 1
@@ -67,16 +120,22 @@ define i32 @test_v4i8_s32(i32 %a) {
67120
ret i32 %r
68121
}
69122

70-
; CHECK-LABEL: test_v4i8_u32
71-
; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_u32_param_0];
72-
; CHECK-DAG: bfe.u32 [[R0:%r[0-9]+]], [[R]], 0, 8;
73-
; CHECK-DAG: bfe.u32 [[R1:%r[0-9]+]], [[R]], 8, 8;
74-
; CHECK-DAG: bfe.u32 [[R2:%r[0-9]+]], [[R]], 16, 8;
75-
; CHECK-DAG: bfe.u32 [[R3:%r[0-9]+]], [[R]], 24, 8;
76-
; CHECK-DAG: add.s32 [[R01:%r[0-9]+]], [[R0]], [[R1]]
77-
; CHECK-DAG: add.s32 [[R23:%r[0-9]+]], [[R2]], [[R3]]
78-
; CHECK-DAG: add.s32 [[R0123:%r[0-9]+]], [[R01]], [[R23]]
79123
define i32 @test_v4i8_u32(i32 %a) {
124+
; CHECK-LABEL: test_v4i8_u32(
125+
; CHECK: {
126+
; CHECK-NEXT: .reg .b32 %r<9>;
127+
; CHECK-EMPTY:
128+
; CHECK-NEXT: // %bb.0:
129+
; CHECK-NEXT: ld.param.u32 %r1, [test_v4i8_u32_param_0];
130+
; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
131+
; CHECK-NEXT: bfe.u32 %r3, %r1, 8, 8;
132+
; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
133+
; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8;
134+
; CHECK-NEXT: add.s32 %r6, %r2, %r3;
135+
; CHECK-NEXT: add.s32 %r7, %r4, %r5;
136+
; CHECK-NEXT: add.s32 %r8, %r6, %r7;
137+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
138+
; CHECK-NEXT: ret;
80139
%v = bitcast i32 %a to <4 x i8>
81140
%r0 = extractelement <4 x i8> %v, i64 0
82141
%r1 = extractelement <4 x i8> %v, i64 1
@@ -94,28 +153,43 @@ define i32 @test_v4i8_u32(i32 %a) {
94153

95154

96155

97-
; CHECK-LABEL: test_v8i8
98-
; CHECK: ld.param.u64 [[R:%rd[0-9]+]], [test_v8i8_param_0];
99-
; CHECK-DAG: cvt.u32.u64 [[R00:%r[0-9]+]], [[R]];
100-
; CHECK-DAG: { .reg .b32 tmp; mov.b64 {tmp, [[R01:%r[0-9]+]]}, [[R]]; }
101-
; CHECK-DAG: bfe.s32 [[R1:%r[0-9]+]], [[R00]], 0, 8;
102-
; CHECK-DAG: cvt.s8.s32 [[E1:%rs[0-9]+]], [[R1]];
103-
; CHECK-DAG: bfe.s32 [[R2:%r[0-9]+]], [[R00]], 8, 8;
104-
; CHECK-DAG: cvt.s8.s32 [[E2:%rs[0-9]+]], [[R2]];
105-
; CHECK-DAG: bfe.s32 [[R3:%r[0-9]+]], [[R00]], 16, 8;
106-
; CHECK-DAG: cvt.s8.s32 [[E3:%rs[0-9]+]], [[R3]];
107-
; CHECK-DAG: bfe.s32 [[R4:%r[0-9]+]], [[R00]], 24, 8;
108-
; CHECK-DAG: cvt.s8.s32 [[E4:%rs[0-9]+]], [[R4]];
109-
; CHECK-DAG: bfe.s32 [[R5:%r[0-9]+]], [[R01]], 0, 8;
110-
; CHECK-DAG: cvt.s8.s32 [[E5:%rs[0-9]+]], [[R5]];
111-
; CHECK-DAG: bfe.s32 [[R6:%r[0-9]+]], [[R01]], 8, 8;
112-
; CHECK-DAG: cvt.s8.s32 [[E6:%rs[0-9]+]], [[R6]];
113-
; CHECK-DAG: bfe.s32 [[R7:%r[0-9]+]], [[R01]], 16, 8;
114-
; CHECK-DAG: cvt.s8.s32 [[E7:%rs[0-9]+]], [[R7]];
115-
; CHECK-DAG: bfe.s32 [[R8:%r[0-9]+]], [[R01]], 24, 8;
116-
; CHECK-DAG: cvt.s8.s32 [[E8:%rs[0-9]+]], [[R8]];
117-
118156
define i16 @test_v8i8(i64 %a) {
157+
; CHECK-LABEL: test_v8i8(
158+
; CHECK: {
159+
; CHECK-NEXT: .reg .b16 %rs<16>;
160+
; CHECK-NEXT: .reg .b32 %r<14>;
161+
; CHECK-NEXT: .reg .b64 %rd<2>;
162+
; CHECK-EMPTY:
163+
; CHECK-NEXT: // %bb.0:
164+
; CHECK-NEXT: ld.param.u64 %rd1, [test_v8i8_param_0];
165+
; CHECK-NEXT: cvt.u32.u64 %r1, %rd1;
166+
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r2}, %rd1; }
167+
; CHECK-NEXT: bfe.s32 %r5, %r1, 0, 8;
168+
; CHECK-NEXT: cvt.s8.s32 %rs1, %r5;
169+
; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8;
170+
; CHECK-NEXT: cvt.s8.s32 %rs2, %r6;
171+
; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
172+
; CHECK-NEXT: cvt.s8.s32 %rs3, %r7;
173+
; CHECK-NEXT: bfe.s32 %r8, %r1, 24, 8;
174+
; CHECK-NEXT: cvt.s8.s32 %rs4, %r8;
175+
; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8;
176+
; CHECK-NEXT: cvt.s8.s32 %rs5, %r9;
177+
; CHECK-NEXT: bfe.s32 %r10, %r2, 8, 8;
178+
; CHECK-NEXT: cvt.s8.s32 %rs6, %r10;
179+
; CHECK-NEXT: bfe.s32 %r11, %r2, 16, 8;
180+
; CHECK-NEXT: cvt.s8.s32 %rs7, %r11;
181+
; CHECK-NEXT: bfe.s32 %r12, %r2, 24, 8;
182+
; CHECK-NEXT: cvt.s8.s32 %rs8, %r12;
183+
; CHECK-NEXT: add.s16 %rs9, %rs1, %rs2;
184+
; CHECK-NEXT: add.s16 %rs10, %rs3, %rs4;
185+
; CHECK-NEXT: add.s16 %rs11, %rs5, %rs6;
186+
; CHECK-NEXT: add.s16 %rs12, %rs7, %rs8;
187+
; CHECK-NEXT: add.s16 %rs13, %rs9, %rs10;
188+
; CHECK-NEXT: add.s16 %rs14, %rs11, %rs12;
189+
; CHECK-NEXT: add.s16 %rs15, %rs13, %rs14;
190+
; CHECK-NEXT: cvt.u32.u16 %r13, %rs15;
191+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13;
192+
; CHECK-NEXT: ret;
119193
%v = bitcast i64 %a to <8 x i8>
120194
%r0 = extractelement <8 x i8> %v, i64 0
121195
%r1 = extractelement <8 x i8> %v, i64 1

0 commit comments

Comments
 (0)