Description
With the SLP Vectorizer, a hot loop with 6 xors + 2 fshls is reduced to 3 vector xors + 1 vector fshl; we vectorize with a VF of 2.
The SLP cost model gives it a cost of -8.
This is the loop in question (the scalar body is already unrolled by two, computing two consecutive elements per iteration):
%iv = phi i64 [ %add323, %vectorized_slp_bb ], [ 16, %bb2 ]
%add288 = add nsw i64 %iv, -3
%getelementptr289 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %add288
%load290 = load i32, ptr addrspace(1) %getelementptr289, align 4, !tbaa !28, !noundef !3
%add291 = add nsw i64 %iv, -8
%getelementptr292 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %add291
%load293 = load i32, ptr addrspace(1) %getelementptr292, align 4, !tbaa !28, !noundef !3
%xor294 = xor i32 %load293, %load290
%add295 = add nsw i64 %iv, -14
%getelementptr296 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %add295
%load297 = load i32, ptr addrspace(1) %getelementptr296, align 4, !tbaa !28, !noundef !3
%xor298 = xor i32 %xor294, %load297
%add299 = add nsw i64 %iv, -16
%getelementptr300 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %add299
%load301 = load i32, ptr addrspace(1) %getelementptr300, align 4, !tbaa !28, !noundef !3
%xor302 = xor i32 %xor298, %load301
%call303 = call i32 @llvm.fshl.i32(i32 %xor302, i32 %xor302, i32 1) #5
%getelementptr304 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %iv
store i32 %call303, ptr addrspace(1) %getelementptr304, align 4, !tbaa !28
%add305 = add nuw nsw i64 %iv, 1
%add306 = add nsw i64 %iv, -2
%getelementptr307 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %add306
%load308 = load i32, ptr addrspace(1) %getelementptr307, align 4, !tbaa !28, !noundef !3
%add309 = add nsw i64 %iv, -7
%getelementptr310 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %add309
%load311 = load i32, ptr addrspace(1) %getelementptr310, align 4, !tbaa !28, !noundef !3
%xor312 = xor i32 %load311, %load308
%add313 = add nsw i64 %iv, -13
%getelementptr314 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %add313
%load315 = load i32, ptr addrspace(1) %getelementptr314, align 4, !tbaa !28, !noundef !3
%xor316 = xor i32 %xor312, %load315
%add317 = add nsw i64 %iv, -15
%getelementptr318 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %add317
%load319 = load i32, ptr addrspace(1) %getelementptr318, align 4, !tbaa !28, !noundef !3
%xor320 = xor i32 %xor316, %load319
%call321 = call i32 @llvm.fshl.i32(i32 %xor320, i32 %xor320, i32 1) #5
%getelementptr322 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %add305
store i32 %call321, ptr addrspace(1) %getelementptr322, align 4, !tbaa !28
%add323 = add nuw nsw i64 %iv, 2
%icmp324 = icmp ugt i64 %add305, 78
br i1 %icmp324, label %6, label %vectorized_slp_bb
When we vectorize it, we get:
vectorized_slp_bb: ; preds = %vectorized_slp_bb, %bb2
%iv = phi i64 [ %add323, %vectorized_slp_bb ], [ 16, %bb2 ]
%add288 = add nsw i64 %iv, -3
%getelementptr289 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %add288
%add291 = add nsw i64 %iv, -8
%getelementptr292 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %add291
%add295 = add nsw i64 %iv, -14
%getelementptr296 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %add295
%add299 = add nsw i64 %iv, -16
%getelementptr300 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %add299
%getelementptr304 = getelementptr inbounds i32, ptr addrspace(1) %getelementptr4, i64 %iv
%add305 = add nuw nsw i64 %iv, 1
%25 = load <2 x i32>, ptr addrspace(1) %getelementptr289, align 4, !tbaa !28
%26 = load <2 x i32>, ptr addrspace(1) %getelementptr292, align 4, !tbaa !28
%27 = xor <2 x i32> %26, %25
%28 = load <2 x i32>, ptr addrspace(1) %getelementptr296, align 4, !tbaa !28
%29 = xor <2 x i32> %27, %28
%30 = load <2 x i32>, ptr addrspace(1) %getelementptr300, align 4, !tbaa !28
%31 = xor <2 x i32> %29, %30
%32 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 1, i32 1>)
store <2 x i32> %32, ptr addrspace(1) %getelementptr304, align 4, !tbaa !28
%add323 = add nuw nsw i64 %iv, 2
%icmp324 = icmp ugt i64 %add305, 78
br i1 %icmp324, label %6, label %vectorized_slp_bb
We see about a 40% regression on a benchmark that exercises this hot loop.
The assembly for this loop shows the vector xors lowered to three scalar xorq instructions instead of vpxor, with the fshl lowering going through xmm registers:
movq -12(%rax,%rcx,4), %rdx
xorq 8(%rax,%rcx,4), %rdx
xorq -36(%rax,%rcx,4), %rdx
xorq -44(%rax,%rcx,4), %rdx
vmovq %rdx, %xmm0
vpsrld $31, %xmm0, %xmm1
vpaddd %xmm0, %xmm0, %xmm0
vpor %xmm1, %xmm0, %xmm0
vmovq %xmm0, 20(%rax,%rcx,4)
addq $2, %rcx
cmpq $78, %rcx
While looking at the cost model for X86 arithmetic instructions, I do not see an entry for a v2i32 XOR. Should we actually be vectorizing this loop?
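For reference, a minimal sketch for checking what cost the X86 cost model reports for a <2 x i32> xor is to run the cost-model printer over a small function; the file name and the -mtriple/-mcpu values below are placeholders, not the benchmark's actual configuration:

; xor_v2i32_cost.ll -- ask the cost model what a <2 x i32> xor costs
define <2 x i32> @xor_v2i32(<2 x i32> %a, <2 x i32> %b) {
  %r = xor <2 x i32> %a, %b
  ret <2 x i32> %r
}

opt -passes='print<cost-model>' -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -disable-output xor_v2i32_cost.ll

If v2i32 has no entry in the X86 tables, the printed cost presumably comes from type legalization (widening to v4i32) rather than from a table hit, which would explain why the SLP tree looks cheap.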
I will attach the IR reproducer; with -slp-threshold=2 we only vectorize this tree and still see the 40% degradation.
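A minimal reproduction sketch on the attached IR could look like the following; reproducer.ll and -mcpu=skylake are placeholders for the attached file and the actual target:

opt -passes=slp-vectorizer -slp-threshold=2 -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -S reproducer.ll

With an assertions-enabled build, adding -debug-only=SLP also prints the per-tree cost the vectorizer computed (the -8 mentioned above).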