Skip to content

[RISCV] Unprofitable select vectorization/lowering #109466

Closed
@preames

Description

@preames

This was brought up in discussion on #108419. This is the root cause of the reported regression on leela from spec2017 in the LTO configuration.

We are failing to recognize shifts disguised as selects in at least two contexts:

  1. During vector lowering, as shown in test_vec4. In this case, the vector select is a disguised vector shift of the mask vector zero-extended to the working type. Note that the shift amounts are not uniform across lanes (19, 18, 17 and 16).
  2. During SLP vectorization, as shown in test_scalarized. If test_scalarized is passed to the SLP vectorizer (see the opt command below), we produce the form in test_vec4.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=riscv64 -mattr=+v,+zba,+zbb < %s | FileCheck %s

; Vector form of the pattern: four i16 arguments are compared against 1 lane by
; lane, each true lane contributes one distinct power-of-two constant, and the
; lanes are OR-reduced into a single i32.
; The autogenerated assertions below record the current lowering, which fetches
; the select constants with a masked vle32.v from the constant pool (.LCPI0_0)
; rather than shifting the zero-extended compare mask.
define i32 @test_vec4(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i16 zeroext %d) {
; CHECK-LABEL: test_vec4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    slli a2, a2, 32
; CHECK-NEXT:    slli a3, a3, 48
; CHECK-NEXT:    or a2, a3, a2
; CHECK-NEXT:    slli a1, a1, 16
; CHECK-NEXT:    or a0, a0, a1
; CHECK-NEXT:    or a0, a0, a2
; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v8, a0
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vmseq.vi v0, v8, 1
; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
; CHECK-NEXT:    vmv.v.i v8, 0
; CHECK-NEXT:    lui a0, %hi(.LCPI0_0)
; CHECK-NEXT:    addi a0, a0, %lo(.LCPI0_0)
; CHECK-NEXT:    vle32.v v8, (a0), v0.t
; CHECK-NEXT:    vredor.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  ; Assemble the four scalar arguments into one <4 x i16> vector (a, b, c, d).
  %t35 = insertelement <4 x i16> poison, i16 %a, i64 0
  %t36 = insertelement <4 x i16> %t35, i16 %b, i64 1
  %t37 = insertelement <4 x i16> %t36, i16 %c, i64 2
  %t38 = insertelement <4 x i16> %t37, i16 %d, i64 3
  ; Per-lane equality test against 1, giving a <4 x i1> mask.
  %t39 = icmp eq <4 x i16> %t38, <i16 1, i16 1, i16 1, i16 1>
  ; The selected constants are single set bits (1<<19, 1<<18, 1<<17, 1<<16) and
  ; the false value is zero, so this select equals zext(%t39) shifted left by
  ; <19, 18, 17, 16>.
  %t40 = select <4 x i1> %t39, <4 x i32> <i32 524288, i32 262144, i32 131072, i32 65536>, <4 x i32> zeroinitializer
  ; OR all four lanes together to form the scalar result.
  %t41 = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %t40)
  ret i32 %t41
}

; Scalar form of the same computation as test_vec4: each argument is compared
; against 1, a true compare contributes one power-of-two constant, and the four
; values are combined with a tree of ORs.
; The autogenerated assertions below show the scalar lowering: one addi/seqz
; pair per argument, then a shift by 19, 18, 17 or 16, then three ORs.
define i32 @test_scalarized(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i16 zeroext %d) {
; CHECK-LABEL: test_scalarized:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, a0, -1
; CHECK-NEXT:    seqz a0, a0
; CHECK-NEXT:    addi a1, a1, -1
; CHECK-NEXT:    seqz a1, a1
; CHECK-NEXT:    addi a2, a2, -1
; CHECK-NEXT:    seqz a2, a2
; CHECK-NEXT:    addi a3, a3, -1
; CHECK-NEXT:    seqz a3, a3
; CHECK-NEXT:    slli a0, a0, 19
; CHECK-NEXT:    slli a1, a1, 18
; CHECK-NEXT:    slli a2, a2, 17
; CHECK-NEXT:    slli a3, a3, 16
; CHECK-NEXT:    or a0, a0, a1
; CHECK-NEXT:    or a2, a2, a3
; CHECK-NEXT:    or a0, a0, a2
; CHECK-NEXT:    ret
  ; One equality test against 1 per argument.
  %t39.i0 = icmp eq i16 %a, 1
  %t39.i1 = icmp eq i16 %b, 1
  %t39.i2 = icmp eq i16 %c, 1
  %t39.i3 = icmp eq i16 %d, 1
  ; Each select yields a single set bit (1<<19, 1<<18, 1<<17, 1<<16) or zero,
  ; i.e. the zero-extended compare result shifted left by a per-value amount.
  %t40.i0 = select i1 %t39.i0, i32 524288, i32 0
  %t40.i1 = select i1 %t39.i1, i32 262144, i32 0
  %t40.i2 = select i1 %t39.i2, i32 131072, i32 0
  %t40.i3 = select i1 %t39.i3, i32 65536, i32 0
  ; Pairwise OR tree combining the four values into the result.
  %or.rdx0 = or i32 %t40.i0, %t40.i1
  %or.rdx1 = or i32 %t40.i2, %t40.i3
  %or.rdx2 = or i32 %or.rdx0, %or.rdx1
  ret i32 %or.rdx2
}

./opt -S example.ll -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions