-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[AArch64] Make use of byte FPR stores for bytes extracted from vectors #131793
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -338,6 +338,8 @@ def amdgpuBufferFatPointer : ValueType<160, 234>; | |
// FIXME: Remove this and the getPointerType() override if MVT::i82 is added. | ||
def amdgpuBufferStridedPointer : ValueType<192, 235>; | ||
|
||
def vi8 : ValueType<8, 236>; // 8-bit integer in FPR (AArch64) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would it be possible to use the existing … There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I tried using … There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Basically, I added … |
||
|
||
let isNormalValueType = false in { | ||
def token : ValueType<0, 504>; // TokenTy | ||
def MetadataVT : ValueType<0, 505> { // Metadata | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTREAMING | ||
; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT | ||
; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT | ||
; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s | ||
; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s | ||
; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s | ||
|
||
target triple = "aarch64-unknown-linux-gnu" | ||
|
||
|
@@ -106,18 +106,11 @@ entry: | |
} | ||
|
||
define void @test_str_lane_s8(ptr %a, <vscale x 16 x i8> %b) { | ||
; CHECK-NONSTREAMING-LABEL: test_str_lane_s8: | ||
; CHECK-NONSTREAMING: // %bb.0: // %entry | ||
; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7] | ||
; CHECK-NONSTREAMING-NEXT: strb w8, [x0] | ||
; CHECK-NONSTREAMING-NEXT: ret | ||
; | ||
; STREAMING-COMPAT-LABEL: test_str_lane_s8: | ||
; STREAMING-COMPAT: // %bb.0: // %entry | ||
; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7] | ||
; STREAMING-COMPAT-NEXT: fmov w8, s0 | ||
; STREAMING-COMPAT-NEXT: strb w8, [x0] | ||
; STREAMING-COMPAT-NEXT: ret | ||
; CHECK-LABEL: test_str_lane_s8: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: mov z0.b, z0.b[7] | ||
; CHECK-NEXT: str b0, [x0] | ||
; CHECK-NEXT: ret | ||
|
||
entry: | ||
%0 = extractelement <vscale x 16 x i8> %b, i32 7 | ||
|
@@ -128,8 +121,7 @@ entry: | |
define void @test_str_lane0_s8(ptr %a, <vscale x 16 x i8> %b) { | ||
; CHECK-LABEL: test_str_lane0_s8: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: fmov w8, s0 | ||
; CHECK-NEXT: strb w8, [x0] | ||
; CHECK-NEXT: str b0, [x0] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Beautiful! Avoiding a cross-regclass move. :) |
||
; CHECK-NEXT: ret | ||
|
||
entry: | ||
|
@@ -201,6 +193,19 @@ define void @test_str_reduction_i32_to_i16(ptr %ptr, <vscale x 4 x i1> %p0, <vsc | |
ret void | ||
} | ||
|
||
define void @test_str_reduction_i32_to_i8(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) { | ||
; CHECK-LABEL: test_str_reduction_i32_to_i8: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: uaddv d0, p0, z0.s | ||
; CHECK-NEXT: str b0, [x0] | ||
; CHECK-NEXT: ret | ||
|
||
%reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) | ||
%trunc = trunc i64 %reduce to i8 | ||
store i8 %trunc, ptr %ptr, align 1 | ||
ret void | ||
} | ||
|
||
define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) { | ||
; CHECK-LABEL: test_str_reduction_i32_to_i32_negative_offset: | ||
; CHECK: // %bb.0: | ||
|
@@ -242,6 +247,20 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4 | |
ret void | ||
} | ||
|
||
define void @test_str_reduction_i32_to_i8_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) { | ||
; CHECK-LABEL: test_str_reduction_i32_to_i8_negative_offset: | ||
; CHECK: // %bb.0: | ||
; CHECK-NEXT: uaddv d0, p0, z0.s | ||
; CHECK-NEXT: stur b0, [x0, #-8] | ||
; CHECK-NEXT: ret | ||
|
||
%reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) | ||
%trunc = trunc i64 %reduce to i8 | ||
%out_ptr = getelementptr inbounds i8, ptr %ptr, i64 -8 | ||
store i8 %trunc, ptr %out_ptr, align 1 | ||
ret void | ||
} | ||
|
||
define void @test_str_lane_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) { | ||
; CHECK-LABEL: test_str_lane_s32_negative_offset: | ||
; CHECK: // %bb.0: // %entry | ||
|
@@ -297,18 +316,11 @@ entry: | |
} | ||
|
||
define void @test_str_lane_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) { | ||
; CHECK-NONSTREAMING-LABEL: test_str_lane_s8_negative_offset: | ||
; CHECK-NONSTREAMING: // %bb.0: // %entry | ||
; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7] | ||
; CHECK-NONSTREAMING-NEXT: sturb w8, [x0, #-8] | ||
; CHECK-NONSTREAMING-NEXT: ret | ||
; | ||
; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset: | ||
; STREAMING-COMPAT: // %bb.0: // %entry | ||
; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7] | ||
; STREAMING-COMPAT-NEXT: fmov w8, s0 | ||
; STREAMING-COMPAT-NEXT: sturb w8, [x0, #-8] | ||
; STREAMING-COMPAT-NEXT: ret | ||
; CHECK-LABEL: test_str_lane_s8_negative_offset: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: mov z0.b, z0.b[7] | ||
; CHECK-NEXT: stur b0, [x0, #-8] | ||
; CHECK-NEXT: ret | ||
|
||
entry: | ||
%0 = extractelement <vscale x 16 x i8> %b, i32 7 | ||
|
@@ -320,8 +332,7 @@ entry: | |
define void @test_str_lane0_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) { | ||
; CHECK-LABEL: test_str_lane0_s8_negative_offset: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: fmov w8, s0 | ||
; CHECK-NEXT: sturb w8, [x0, #-8] | ||
; CHECK-NEXT: stur b0, [x0, #-8] | ||
; CHECK-NEXT: ret | ||
|
||
entry: | ||
|
@@ -385,6 +396,48 @@ entry: | |
ret void | ||
} | ||
|
||
|
||
define void @test_str_trunc_lane_s32_to_s8(ptr %a, <vscale x 4 x i32> %b) { | ||
; CHECK-LABEL: test_str_trunc_lane_s32_to_s8: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: mov z0.s, z0.s[3] | ||
; CHECK-NEXT: str b0, [x0] | ||
; CHECK-NEXT: ret | ||
|
||
entry: | ||
%0 = extractelement <vscale x 4 x i32> %b, i32 3 | ||
%trunc = trunc i32 %0 to i8 | ||
store i8 %trunc, ptr %a, align 1 | ||
ret void | ||
} | ||
|
||
define void @test_str_trunc_lane0_s32_to_s8(ptr %a, <vscale x 4 x i32> %b) { | ||
; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: str b0, [x0] | ||
; CHECK-NEXT: ret | ||
|
||
entry: | ||
%0 = extractelement <vscale x 4 x i32> %b, i32 0 | ||
%trunc = trunc i32 %0 to i8 | ||
store i8 %trunc, ptr %a, align 1 | ||
ret void | ||
} | ||
|
||
define void @test_str_trunc_lane_s64_to_s8(ptr %a, <vscale x 2 x i64> %b) { | ||
; CHECK-LABEL: test_str_trunc_lane_s64_to_s8: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: mov z0.d, z0.d[3] | ||
; CHECK-NEXT: str b0, [x0] | ||
; CHECK-NEXT: ret | ||
|
||
entry: | ||
%0 = extractelement <vscale x 2 x i64> %b, i32 3 | ||
%trunc = trunc i64 %0 to i8 | ||
store i8 %trunc, ptr %a, align 1 | ||
ret void | ||
} | ||
|
||
define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, <vscale x 4 x i32> %b) { | ||
; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset: | ||
; CHECK: // %bb.0: // %entry | ||
|
@@ -413,3 +466,47 @@ entry: | |
store i16 %trunc, ptr %out_ptr, align 2 | ||
ret void | ||
} | ||
|
||
define void @test_str_trunc_lane_s32_to_s8_negative_offset(ptr %a, <vscale x 4 x i32> %b) { | ||
; CHECK-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: mov z0.s, z0.s[3] | ||
; CHECK-NEXT: stur b0, [x0, #-8] | ||
; CHECK-NEXT: ret | ||
|
||
entry: | ||
%0 = extractelement <vscale x 4 x i32> %b, i32 3 | ||
%trunc = trunc i32 %0 to i8 | ||
%out_ptr = getelementptr inbounds i8, ptr %a, i64 -8 | ||
store i8 %trunc, ptr %out_ptr, align 1 | ||
ret void | ||
} | ||
|
||
define void @test_str_trunc_lane0_s32_to_s8_negative_offset(ptr %a, <vscale x 4 x i32> %b) { | ||
; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: stur b0, [x0, #-8] | ||
; CHECK-NEXT: ret | ||
|
||
entry: | ||
%0 = extractelement <vscale x 4 x i32> %b, i32 0 | ||
%trunc = trunc i32 %0 to i8 | ||
%out_ptr = getelementptr inbounds i8, ptr %a, i64 -8 | ||
store i8 %trunc, ptr %out_ptr, align 1 | ||
ret void | ||
} | ||
|
||
define void @test_str_trunc_lane_s64_to_s8_negative_offset(ptr %a, <vscale x 2 x i64> %b) { | ||
; CHECK-LABEL: test_str_trunc_lane_s64_to_s8_negative_offset: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: mov z0.d, z0.d[3] | ||
; CHECK-NEXT: stur b0, [x0, #-8] | ||
; CHECK-NEXT: ret | ||
|
||
entry: | ||
%0 = extractelement <vscale x 2 x i64> %b, i32 3 | ||
%trunc = trunc i64 %0 to i8 | ||
%out_ptr = getelementptr inbounds i8, ptr %a, i64 -8 | ||
store i8 %trunc, ptr %out_ptr, align 1 | ||
ret void | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Curious: Why are 8-bit FPRs considered illegal if the SIMD/FP registers can be addressed as B registers?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know the full history here, but for the other sizes (16-, 32-, and 64-bit) there's a legal scalar floating-point type (half, float, double) that can be mapped to an FPR. I don't think that's the case for 8-bit: there are some FP8 extensions, but I believe they only cover vectors of FP8 values, so they reuse the existing integer vector types in IR. I think addressing B registers directly is only done in a few places (such as loads or stores).