[AArch64] Make use of byte FPR stores for bytes extracted from vectors #131793
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

This helps avoid some pointless `fmovs` in some cases. Currently, this is done in ISEL, as FPR bytes are problematic in SDAG (as neither GPR nor FPR bytes are a legal type).

Follow on from: #129756

Patch is 42.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131793.diff

32 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index fc1a95e33380b..42c4830e94220 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -338,6 +338,8 @@ def amdgpuBufferFatPointer : ValueType<160, 234>;
// FIXME: Remove this and the getPointerType() override if MVT::i82 is added.
def amdgpuBufferStridedPointer : ValueType<192, 235>;
+def vi8 : ValueType<8, 236>; // 8-bit integer in FPR (AArch64)
+
let isNormalValueType = false in {
def token : ValueType<0, 504>; // TokenTy
def MetadataVT : ValueType<0, 505> { // Metadata
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index 0554b6387c5e6..c769568253b12 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -198,6 +198,8 @@ std::string EVT::getEVTString() const {
return "amdgpuBufferFatPointer";
case MVT::amdgpuBufferStridedPointer:
return "amdgpuBufferStridedPointer";
+ case MVT::vi8:
+ return "vi8";
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 879b83f94b79a..a339c11420602 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -401,6 +401,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
if (Subtarget->hasFPARMv8()) {
+ addRegisterClass(MVT::vi8, &AArch64::FPR8RegClass);
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6c61e3a613f6f..1c1ff656db910 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3575,7 +3575,7 @@ defm LDRW : LoadUI<0b10, 0, 0b01, GPR32z, uimm12s4, "ldr",
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
let Predicates = [HasFPARMv8] in {
defm LDRB : LoadUI<0b00, 1, 0b01, FPR8Op, uimm12s1, "ldr",
- [(set FPR8Op:$Rt,
+ [(set (i8 FPR8Op:$Rt),
(load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
defm LDRH : LoadUI<0b01, 1, 0b01, FPR16Op, uimm12s2, "ldr",
[(set (f16 FPR16Op:$Rt),
@@ -3763,7 +3763,7 @@ defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32z, "ldur",
(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
let Predicates = [HasFPARMv8] in {
defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur",
- [(set FPR8Op:$Rt,
+ [(set (i8 FPR8Op:$Rt),
(load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur",
[(set (f16 FPR16Op:$Rt),
@@ -4333,7 +4333,7 @@ defm STRW : StoreUIz<0b10, 0, 0b00, GPR32z, uimm12s4, "str",
(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
let Predicates = [HasFPARMv8] in {
defm STRB : StoreUI<0b00, 1, 0b00, FPR8Op, uimm12s1, "str",
- [(store FPR8Op:$Rt,
+ [(store (i8 FPR8Op:$Rt),
(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
defm STRH : StoreUI<0b01, 1, 0b00, FPR16Op, uimm12s2, "str",
[(store (f16 FPR16Op:$Rt),
@@ -4451,6 +4451,8 @@ multiclass VecStoreLane0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop
}
let AddedComplexity = 19 in {
+ defm : VecStoreLane0Pat<am_indexed8, truncstorei8, v16i8, i32, vi8, bsub, uimm12s2, STRBui>;
+ defm : VecStoreLane0Pat<am_indexed8, truncstorei8, v4i32, i32, vi8, bsub, uimm12s2, STRBui>;
defm : VecStoreLane0Pat<am_indexed16, truncstorei16, v8i16, i32, f16, hsub, uimm12s2, STRHui>;
defm : VecStoreLane0Pat<am_indexed16, store, v8f16, f16, f16, hsub, uimm12s2, STRHui>;
defm : VecStoreLane0Pat<am_indexed32, store, v4i32, i32, i32, ssub, uimm12s4, STRSui>;
@@ -4469,7 +4471,7 @@ defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32z, "stur",
(am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
let Predicates = [HasFPARMv8] in {
defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8Op, "stur",
- [(store FPR8Op:$Rt,
+ [(store (i8 FPR8Op:$Rt),
(am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16Op, "stur",
[(store (f16 FPR16Op:$Rt),
@@ -4598,6 +4600,7 @@ multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
}
let AddedComplexity = 19 in {
+ defm : VecStoreULane0Pat<truncstorei8, v16i8, i32, vi8, bsub, STURBi>;
defm : VecStoreULane0Pat<truncstorei16, v8i16, i32, f16, hsub, STURHi>;
defm : VecStoreULane0Pat<store, v8f16, f16, f16, hsub, STURHi>;
defm : VecStoreULane0Pat<store, v4i32, i32, i32, ssub, STURSi>;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index fed9b7b173e9c..42ba1451650ed 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -497,7 +497,7 @@ def Q30 : AArch64Reg<30, "q30", [D30, D30_HI], ["v30", ""]>, DwarfRegAlias<B30
def Q31 : AArch64Reg<31, "q31", [D31, D31_HI], ["v31", ""]>, DwarfRegAlias<B31>;
}
-def FPR8 : RegisterClass<"AArch64", [i8], 8, (sequence "B%u", 0, 31)> {
+def FPR8 : RegisterClass<"AArch64", [i8, vi8], 8, (sequence "B%u", 0, 31)> {
let Size = 8;
let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::FPR8RegClassID, 0, 32>";
}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3ee71c14c6bd4..1884a90828acb 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1827,6 +1827,43 @@ let Predicates = [HasSVE] in {
defm : adrXtwShiftPat<nxv2i64, nxv2i1, 3>;
} // End HasSVE
+multiclass SVEVecStoreLanePat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
+ ValueType VTy, ValueType STy,
+ ValueType SubRegTy,
+ SubRegIndex SubRegIdx, Operand IndexType,
+ Instruction STR,
+ Instruction DUP, AsmVectorIndexOpnd DUPIdxTy> {
+ let Predicates = [HasSVE_or_SME] in {
+ // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
+ def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
+ GPR64sp:$Rn, IndexType:$offset)>;
+ }
+
+ // Non-zero immediate index:
+ def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)),
+ GPR64sp:$Rn, IndexType:$offset)>;
+}
+
+// Note: Types other than i8 are handled in performSTORECombine -- i8 is tricky
+// to handle before ISEL as it is not really a legal type in many places, nor
+// is its equivalently sized FP variant.
+let AddedComplexity = 19 in {
+ // Lane 0 truncating stores
+ // i32 -> i8
+ defm : SVEVecStoreLanePat<am_indexed8, truncstorei8, nxv4i32, i32, vi8, bsub, uimm12s4, STRBui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+ defm : SVEVecStoreLanePat<am_unscaled8, truncstorei8, nxv4i32, i32, vi8, bsub, simm9, STURBi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+ // i64 -> i8
+ defm : SVEVecStoreLanePat<am_indexed8, truncstorei8, nxv2i64, i64, vi8, bsub, uimm12s4, STRBui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ defm : SVEVecStoreLanePat<am_unscaled8, truncstorei8, nxv2i64, i64, vi8, bsub, simm9, STURBi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ // i8 -> i8 (technically a truncate as the extracted type is i32)
+ defm : SVEVecStoreLanePat<am_indexed8, truncstorei8, nxv16i8, i32, vi8, bsub, uimm12s4, STRBui, DUP_ZZI_B, sve_elm_idx_extdup_b>;
+ defm : SVEVecStoreLanePat<am_unscaled8, truncstorei8, nxv16i8, i32, vi8, bsub, simm9, STURBi, DUP_ZZI_B, sve_elm_idx_extdup_b>;
+}
+
let Predicates = [HasSVE_or_SME] in {
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
index d39c9bf760621..b91cb872a9e0a 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTREAMING
-; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT
-; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
@@ -106,18 +106,11 @@ entry:
}
define void @test_str_lane_s8(ptr %a, <vscale x 16 x i8> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s8:
-; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7]
-; CHECK-NONSTREAMING-NEXT: strb w8, [x0]
-; CHECK-NONSTREAMING-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s8:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7]
-; STREAMING-COMPAT-NEXT: fmov w8, s0
-; STREAMING-COMPAT-NEXT: strb w8, [x0]
-; STREAMING-COMPAT-NEXT: ret
+; CHECK-LABEL: test_str_lane_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.b, z0.b[7]
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 7
@@ -128,8 +121,7 @@ entry:
define void @test_str_lane0_s8(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: test_str_lane0_s8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
entry:
@@ -201,6 +193,19 @@ define void @test_str_reduction_i32_to_i16(ptr %ptr, <vscale x 4 x i1> %p0, <vsc
ret void
}
+define void @test_str_reduction_i32_to_i8(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i8
+ store i8 %trunc, ptr %ptr, align 1
+ ret void
+}
+
define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
; CHECK-LABEL: test_str_reduction_i32_to_i32_negative_offset:
; CHECK: // %bb.0:
@@ -242,6 +247,20 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
ret void
}
+define void @test_str_reduction_i32_to_i8_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i8_negative_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i8
+ %out_ptr = getelementptr inbounds i8, ptr %ptr, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
+
define void @test_str_lane_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: test_str_lane_s32_negative_offset:
; CHECK: // %bb.0: // %entry
@@ -297,18 +316,11 @@ entry:
}
define void @test_str_lane_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s8_negative_offset:
-; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7]
-; CHECK-NONSTREAMING-NEXT: sturb w8, [x0, #-8]
-; CHECK-NONSTREAMING-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7]
-; STREAMING-COMPAT-NEXT: fmov w8, s0
-; STREAMING-COMPAT-NEXT: sturb w8, [x0, #-8]
-; STREAMING-COMPAT-NEXT: ret
+; CHECK-LABEL: test_str_lane_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.b, z0.b[7]
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 7
@@ -320,8 +332,7 @@ entry:
define void @test_str_lane0_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: test_str_lane0_s8_negative_offset:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: sturb w8, [x0, #-8]
+; CHECK-NEXT: stur b0, [x0, #-8]
; CHECK-NEXT: ret
entry:
@@ -385,6 +396,48 @@ entry:
ret void
}
+
+define void @test_str_trunc_lane_s32_to_s8(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s32_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 3
+ %trunc = trunc i32 %0 to i8
+ store i8 %trunc, ptr %a, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane0_s32_to_s8(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ %trunc = trunc i32 %0 to i8
+ store i8 %trunc, ptr %a, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane_s64_to_s8(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s64_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, z0.d[3]
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 3
+ %trunc = trunc i64 %0 to i8
+ store i8 %trunc, ptr %a, align 1
+ ret void
+}
+
define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
; CHECK: // %bb.0: // %entry
@@ -413,3 +466,47 @@ entry:
store i16 %trunc, ptr %out_ptr, align 2
ret void
}
+
+define void @test_str_trunc_lane_s32_to_s8_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 3
+ %trunc = trunc i32 %0 to i8
+ %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane0_s32_to_s8_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ %trunc = trunc i32 %0 to i8
+ %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane_s64_to_s8_negative_offset(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s64_to_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, z0.d[3]
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 3
+ %trunc = trunc i64 %0 to i8
+ %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index fc0ba336b21cc..cdde359d09d7b 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -64,8 +64,7 @@ define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: strb w8, [x0, #1]
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index 24f2549cce785..03c7bad9efc22 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -184,8 +184,7 @@ define void @and_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: strb w8, [x0, #1]
; CHECK-SD-NEXT: ret
;
@@ -221,8 +220,7 @@ define void @or_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: strb w8, [x0, #1]
; CHECK-SD-NEXT: ret
;
@@ -258,8 +256,7 @@ define void @xor_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: strb w8, [x0, #1]
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
index 2c065e0051cd7..7f2bebf584d8f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -615,11 +615,10 @@ define <1 x i8> @getL() {
; CHECK-NEXT: ; kill
; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _L@GOTPAGEOFF]
-; Ultimately we should generate str b0, but right now, we match the vector
-; variant which does not allow to fold the immediate into the store.
-; CHECK-NEXT: st1.b { v0 }[0], [x[[LDRGOT_REG]]]
+; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: str b0, [x[[LDRGOT_REG]]]
; CHECK-NEXT: ret
-; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]]
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]]
define void @setL(<1 x i8> %t) {
store <1 x i8> %t, ptr @L, align 4
ret void
@@ -678,6 +677,6 @@ if.end.i:
call void (ptr, ...) @callee(ptr @.str.89, ptr @.str.90, double %sub)
unreachable
}
-declare void @callee(ptr nocapture readonly, ...)
+declare void @callee(ptr nocapture readonly, ...)
attributes #0 = { "target-cpu"="cyclone" }
diff --git a/llvm/test/CodeGen/AArch64/arm64-st1.ll b/llvm/test/CodeGen/AArch64/arm64-st1.ll
index 6f87c66c87345..c63d66c4e7706 100644
--- a/llvm/test/CodeGen/AArch64/arm64-st1.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-st1.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,SD-CHECK
+; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,GI-CHECK
; The instruction latencies of Exynos-M3 trigger the transform we see under the Exynos ch...
[truncated]
@@ -338,6 +338,8 @@ def amdgpuBufferFatPointer : ValueType<160, 234>;
// FIXME: Remove this and the getPointerType() override if MVT::i82 is added.
def amdgpuBufferStridedPointer : ValueType<192, 235>;

def vi8 : ValueType<8, 236>; // 8-bit integer in FPR (AArch64)
Curious: Why are 8-bit FPR considered illegal if the SIMD/FP registers can be addressed as B registers?
I don't know the full history here, but for the other sizes (16, 32, and 64-bit) there's a legal scalar floating-point type (half, float, double) that can be mapped to an FPR register. I don't think that's the case for 8-bit: there are some FP8 extensions, but I believe they are only for vectors of FP8 types, so they reuse the existing integer vector types in IR. I think addressing b-registers directly is only used in a few places (such as loads and stores).
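For reference, here is a minimal sketch of the register-class mappings in `AArch64TargetLowering` that this discussion refers to; the `f16`/`bf16`/`f32` lines already exist, and the `vi8` line is what this patch adds:

```cpp
// Inside AArch64TargetLowering's constructor, guarded by Subtarget->hasFPARMv8().
// 16- and 32-bit scalar FP types map naturally onto FPR16/FPR32; there is no
// byte-sized scalar FP type, hence the new vi8 -> FPR8 mapping below.
addRegisterClass(MVT::vi8,  &AArch64::FPR8RegClass);  // added by this patch
addRegisterClass(MVT::f16,  &AArch64::FPR16RegClass);
addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::f32,  &AArch64::FPR32RegClass);
```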
@@ -128,8 +121,7 @@ entry:
define void @test_str_lane0_s8(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: test_str_lane0_s8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
Beautiful! Avoiding a cross-regclass move. :)
This helps avoid some pointless `fmovs` in some cases. Currently, this is done in ISEL, as FPR bytes are problematic in SDAG (as neither GPR nor FPR bytes are a legal type).
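As a concrete illustration, taken from the tests touched by this patch, the pattern being improved is an extract-lane plus truncating byte store. For lane 0, the existing `test_str_lane0_s8` test previously checked for `fmov w8, s0` followed by `strb w8, [x0]`, and now checks for a single `str b0, [x0]`:

```llvm
; New test added in llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll.
define void @test_str_trunc_lane0_s32_to_s8(ptr %a, <vscale x 4 x i32> %b) {
entry:
  %0 = extractelement <vscale x 4 x i32> %b, i32 0  ; extract lane 0 of the vector
  %trunc = trunc i32 %0 to i8                       ; truncate the element to a byte
  store i8 %trunc, ptr %a, align 1                  ; byte store, now a single "str b0, [x0]"
  ret void
}
```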
Ping
@@ -338,6 +338,8 @@ def amdgpuBufferFatPointer : ValueType<160, 234>;
// FIXME: Remove this and the getPointerType() override if MVT::i82 is added.
def amdgpuBufferStridedPointer : ValueType<192, 235>;

def vi8 : ValueType<8, 236>; // 8-bit integer in FPR (AArch64)
Would it be possible to use the existing `v1i8`?
I tried using `v1i8` first, but you need to map the value type to the register class with `addRegisterClass(MVT::vi8, &AArch64::FPR8RegClass)`, and `v1i8` already has a meaning and uses within the backend (and is an illegal type, which is scalarized). So adding `addRegisterClass(MVT::v1i8, &AArch64::FPR8RegClass)` breaks existing lowerings.
Basically, I added `vi8` so I could have a legal type for an 8-bit value in FPR, since both `v1i8` and `i8` are illegal types.
I'll throw in my usual "can this be done as a combine" comment? I'm thinking that you could change the combine so that instead of casting to floating point and doing the extract and store, you could instead extract an …
I don't believe there's any existing ISEL lowering that will emit bsub stores (current lowerings of vector_extract + truncstore will result in an …
to:
…
Which is not ideal as that instruction is not available in streaming-mode (and has more restrictive addressing).
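The contrast being referred to shows up in the `arm64-collect-loh.ll` change earlier in this patch, where the lane-0 byte store switches from a NEON lane store to a plain byte FPR store (the register below is an illustrative stand-in for the test's pattern variable):

```asm
// Before this patch (removed CHECK line): lane store, cannot fold the immediate
// offset and is not available in streaming mode.
st1.b { v0 }[0], [x8]
// After this patch (added CHECK line): plain byte store, which also enables the
// AdrpLdrGotStr linker optimization hint.
str b0, [x8]
```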
Sure, but I would rather implement isel for a truncating store that might be generically useful (perhaps even allowing some cleanup of existing isel patterns to make streaming and non-streaming consistent, assuming there are no performance compromises) than go down the route of needing to create a dedicated MVT for this one use case.
I've created an alternative patch that lowers things to …
This rewrites the fold from llvm#129756 to apply to all types, including stores of i8s. This required adding a new `aarch64mfp8` MVT to represent FPR8 types on AArch64, which can be used to extract and store 8-bit values using b sub-registers.

Follow on from: llvm#129756
Closes: llvm#131793