[RISCV] Merge shuffle sources if lanes are disjoint #119401
Changes from all commits: e6bda96, 862bffd, 99e1f12, d074e75, 7a83095
@@ -395,3 +395,76 @@ define <4 x half> @vrgather_shuffle_vx_v4f16_load(ptr %p) {
  %s = shufflevector <4 x half> %v, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x half> %s
}

define <16 x float> @shuffle_disjoint_lanes(<16 x float> %v, <16 x float> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI30_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vle8.v v16, (a0)
; CHECK-NEXT: lui a0, 11
; CHECK-NEXT: addi a0, a0, -1366
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vsext.vf2 v18, v16
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
; CHECK-NEXT: ret
  %out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
  ret <16 x float> %out
}

define <16 x float> @shuffle_disjoint_lanes_one_identity(<16 x float> %v, <16 x float> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes_one_identity:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI31_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI31_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; CHECK-NEXT: vle16.v v16, (a0)
; CHECK-NEXT: li a0, -272
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
; CHECK-NEXT: ret
  %out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 26, i32 30, i32 22, i32 20, i32 8, i32 31, i32 29, i32 28, i32 27, i32 23, i32 25, i32 22>
  ret <16 x float> %out
}

define <16 x float> @shuffle_disjoint_lanes_one_broadcast(<16 x float> %v, <16 x float> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes_one_broadcast:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI32_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI32_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; CHECK-NEXT: vle16.v v20, (a0)
; CHECK-NEXT: lui a0, 15
; CHECK-NEXT: addi a0, a0, 240
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vrgather.vi v16, v8, 7
; CHECK-NEXT: vrgatherei16.vv v16, v12, v20, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
  %out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 26, i32 30, i32 22, i32 18, i32 7, i32 7, i32 7, i32 7, i32 24, i32 28, i32 20, i32 16>
  ret <16 x float> %out
}

Review comment: This test isn't exactly convincing that we want to avoid the splat case. I think this hints at the difference between a splat value (from a scalar) and a splat shuffle (from the mask): the latter requires a vrgather.vi, whereas the former is a vmerge.vxm. Maybe rework the tests to cover both cases separately?

Reply: Here's a diff showing a regression if we remove the isSplatValue guard:

; CHECK-LABEL: shuffle_vx_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v0, 6
-; CHECK-NEXT: vmerge.vim v8, v8, 5, v0
+; CHECK-NEXT: vmv.v.i v0, 9
+; CHECK-NEXT: vmv.v.i v9, 5
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: ret
%s = shufflevector <4 x i16> %x, <4 x i16> <i16 5, i16 5, i16 5, i16 5>, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  ret <4 x i16> %s

And here's one if we remove the isSplatMask guard:

--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -364,12 +364,14 @@ define <8 x i8> @splat_ve4_ins_i1ve3(<8 x i8> %v) {
define <8 x i8> @splat_ve2_we0(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: splat_ve2_we0:
; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 4
; CHECK-NEXT: li a0, 66
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0
; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vrgather.vi v10, v8, 2
-; CHECK-NEXT: vrgather.vi v10, v9, 0, v0.t
-; CHECK-NEXT: vmv1r.v v8, v10
+; CHECK-NEXT: vmv.v.i v8, 2
+; CHECK-NEXT: vmerge.vim v10, v8, 0, v0
+; CHECK-NEXT: vrgather.vv v8, v9, v10
; CHECK-NEXT: ret
%shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 2, i32 2, i32 2, i32 2, i32 8, i32 2>
  ret <8 x i8> %shuff

I think ideally we would only want to do this lowering when we know we're going to end up with at least a vrgather.vv. Maybe in a follow-up this could be reworked as a late combine on VRGATHER_VV_VL instead? It might be a bit trickier because you would need to reconstruct the original mask.
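As a rough illustration of the isSplatMask-style guard discussed above (the helper's name and shape are assumptions for this sketch, not the patch's actual code), a per-source splat check consistent with the splat_ve2_we0 regression could look like:

#include <vector>

// True if every index that a two-source shuffle draws from source Src
// (0 or 1) refers to the same element, i.e. that source is only ever
// splatted. A masked vrgather.vi already covers that case, so merging
// the sources would regress it, as the splat_ve2_we0 diff shows.
bool sourceOnlySplatted(const std::vector<int> &Mask, int N, int Src) {
  int Splat = -1;
  for (int M : Mask) {
    if (M < 0 || (M >= N) != (Src == 1))
      continue; // undef, or an index into the other source
    int Idx = M % N;
    if (Splat < 0)
      Splat = Idx;   // first index seen from this source
    else if (Idx != Splat)
      return false;  // two different elements: not a splat
  }
  return true;
}

Skipping the merge whenever either source is only ever splatted would keep the vrgather.vi-based lowering that the removed (-) lines of the diff above produce.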

define <16 x float> @shuffle_disjoint_lanes_one_splat(float %v, <16 x float> %w) {
; CHECK-LABEL: shuffle_disjoint_lanes_one_splat:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI33_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI33_0)
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
; CHECK-NEXT: vle16.v v16, (a0)
; CHECK-NEXT: lui a0, 15
; CHECK-NEXT: addi a0, a0, 240
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vfmv.v.f v12, fa0
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
  %head = insertelement <16 x float> poison, float %v, i32 0
  %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
  %out = shufflevector <16 x float> %splat, <16 x float> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
  ret <16 x float> %out
}

Review comment: Can you reverse the select here so that it uses the same order as the generic fallthrough below? From prior experience, our vmerge.vxm matching is oddly fragile. I'd like to remove this unrelated change if possible.

Reply: Good point, I've reversed it to match the generic fallthrough when SwapOps is false. It's worth pointing out, though, that this simultaneously removes some mask diffs and introduces others.
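The identity behind the requested reversal: swapping a vector select's operands while inverting its mask produces the same result, so the merge can emit its select in whichever operand order the existing vmerge patterns expect. A tiny self-contained demonstration of that identity (purely illustrative, modeling 32 one-bit lanes):

#include <cassert>
#include <cstdint>

// Element-wise select over 32 one-bit lanes: a set mask bit takes the
// corresponding bit of A, a clear bit takes B.
uint32_t vselect(uint32_t Mask, uint32_t A, uint32_t B) {
  return (A & Mask) | (B & ~Mask);
}

int main() {
  uint32_t Mask = 0x0000AAAA, A = 0x12345678, B = 0x9ABCDEF0;
  // Inverting the mask and swapping the operands is the same select.
  assert(vselect(Mask, A, B) == vselect(~Mask, B, A));
  return 0;
}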