@@ -16775,6 +16775,106 @@ pub unsafe fn vst4q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x4_t) {
16775
16775
vst4q_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
16776
16776
}
16777
16777
16778
+ /// Dot product vector form with unsigned and signed integers. Each 32-bit lane of the result is the matching lane of `a` plus the sum of the four `u8 * i8` products taken from the matching 4-byte group of `b` and `c`.
16779
+ ///
16780
+ /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_s32)
16781
+ #[inline]
16782
+ #[target_feature(enable = "neon,i8mm")] // unsafe contract: the caller must make sure the running CPU supports these features
16783
+ #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] // 32-bit Arm builds additionally enable `v8`
16784
+ #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot))]
16785
+ #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot))]
16786
+ pub unsafe fn vusdot_s32(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t {
16787
+ #[allow(improper_ctypes)] // NOTE(review): presumably needed because the lint does not treat the SIMD vector types as FFI-safe - confirm
16788
+ extern "unadjusted" { // binds the LLVM intrinsic; the `link_name` below is chosen per target architecture
16789
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usdot.v2i32.v8i8")]
16790
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.usdot.v2i32.v8i8")]
16791
+ fn vusdot_s32_(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t;
16792
+ }
16793
+ vusdot_s32_(a, b, c) // thin wrapper: the three operands are forwarded unchanged
16794
+ }
16795
+
16796
+ /// Dot product vector form with unsigned and signed integers (128-bit variant). Each of the four 32-bit lanes of the result is the matching lane of `a` plus the sum of the four `u8 * i8` products taken from the matching 4-byte group of `b` and `c`.
16797
+ ///
16798
+ /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_s32)
16799
+ #[inline]
16800
+ #[target_feature(enable = "neon,i8mm")] // unsafe contract: the caller must make sure the running CPU supports these features
16801
+ #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] // 32-bit Arm builds additionally enable `v8`
16802
+ #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot))]
16803
+ #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot))]
16804
+ pub unsafe fn vusdotq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t {
16805
+ #[allow(improper_ctypes)] // NOTE(review): presumably needed because the lint does not treat the SIMD vector types as FFI-safe - confirm
16806
+ extern "unadjusted" { // binds the LLVM intrinsic; the `link_name` below is chosen per target architecture
16807
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usdot.v4i32.v16i8")]
16808
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.usdot.v4i32.v16i8")]
16809
+ fn vusdotq_s32_(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t;
16810
+ }
16811
+ vusdotq_s32_(a, b, c) // thin wrapper: the three operands are forwarded unchanged
16812
+ }
16813
+
16814
+ /// Dot product index form with unsigned and signed integers. The 4-byte group number `LANE` of `c` is reused for every lane: result lane `i` is `a[i]` plus the sum over `j` in `0..4` of `b[4*i + j] * c[4*LANE + j]` (`b` unsigned, `c` signed).
16815
+ ///
16816
+ /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_lane_s32)
16817
+ #[inline]
16818
+ #[target_feature(enable = "neon,i8mm")] // unsafe contract: the caller must make sure the running CPU supports these features
16819
+ #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] // 32-bit Arm builds additionally enable `v8`
16820
+ #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot, LANE = 0))]
16821
+ #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot, LANE = 0))]
16822
+ #[rustc_legacy_const_generics(3)] // lets `LANE` also be written as a fourth positional argument (index 3)
16823
+ pub unsafe fn vusdot_lane_s32<const LANE: i32>(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t {
16824
+ static_assert_uimm_bits!(LANE, 1); // NOTE(review): macro defined elsewhere; presumably rejects any LANE that does not fit in 1 unsigned bit (i.e. other than 0 or 1) at compile time
16825
+ let c: int32x2_t = transmute(c); // view the eight bytes as two 32-bit groups so one group can be selected as a unit
16826
+ let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); // broadcast group `LANE` into both lanes
16827
+ vusdot_s32(a, b, transmute(c)) // back to bytes, then the plain vector-form dot product does the arithmetic
16828
+ }
16829
+
16830
+ /// Dot product index form with unsigned and signed integers (128-bit accumulator, 64-bit `c`). The 4-byte group number `LANE` of `c` is reused for all four lanes: result lane `i` is `a[i]` plus the sum over `j` in `0..4` of `b[4*i + j] * c[4*LANE + j]` (`b` unsigned, `c` signed).
16831
+ ///
16832
+ /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_lane_s32)
16833
+ #[inline]
16834
+ #[target_feature(enable = "neon,i8mm")] // unsafe contract: the caller must make sure the running CPU supports these features
16835
+ #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] // 32-bit Arm builds additionally enable `v8`
16836
+ #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot, LANE = 0))]
16837
+ #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot, LANE = 0))]
16838
+ #[rustc_legacy_const_generics(3)] // lets `LANE` also be written as a fourth positional argument (index 3)
16839
+ pub unsafe fn vusdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: uint8x16_t, c: int8x8_t) -> int32x4_t {
16840
+ static_assert_uimm_bits!(LANE, 1); // NOTE(review): macro defined elsewhere; presumably rejects any LANE that does not fit in 1 unsigned bit (i.e. other than 0 or 1) at compile time
16841
+ let c: int32x2_t = transmute(c); // view the eight bytes as two 32-bit groups so one group can be selected as a unit
16842
+ let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); // widen to four lanes, each a copy of group `LANE`
16843
+ vusdotq_s32(a, b, transmute(c)) // back to sixteen bytes, then the plain vector-form dot product does the arithmetic
16844
+ }
16845
+
16846
+ /// Dot product index form with signed and unsigned integers. The 4-byte group number `LANE` of the unsigned `c` is reused for every lane: result lane `i` is `a[i]` plus the sum over `j` in `0..4` of `b[4*i + j] * c[4*LANE + j]` (`b` signed, `c` unsigned).
16847
+ ///
16848
+ /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudot_lane_s32)
16849
+ #[inline]
16850
+ #[target_feature(enable = "neon,i8mm")] // unsafe contract: the caller must make sure the running CPU supports these features
16851
+ #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] // 32-bit Arm builds additionally enable `v8`
16852
+ #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsudot, LANE = 0))]
16853
+ #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sudot, LANE = 0))]
16854
+ #[rustc_legacy_const_generics(3)] // lets `LANE` also be written as a fourth positional argument (index 3)
16855
+ pub unsafe fn vsudot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: uint8x8_t) -> int32x2_t {
16856
+ static_assert_uimm_bits!(LANE, 1); // NOTE(review): macro defined elsewhere; presumably rejects any LANE that does not fit in 1 unsigned bit (i.e. other than 0 or 1) at compile time
16857
+ let c: uint32x2_t = transmute(c); // view the eight unsigned bytes as two 32-bit groups so one group can be selected as a unit
16858
+ let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]); // broadcast group `LANE` into both lanes
16859
+ vusdot_s32(a, transmute(c), b) // operands swapped on purpose: the broadcast unsigned bytes fill vusdot_s32's unsigned slot and the signed `b` its signed slot
16860
+ }
16861
+
16862
+ /// Dot product index form with signed and unsigned integers (128-bit accumulator, 64-bit `c`). The 4-byte group number `LANE` of the unsigned `c` is reused for all four lanes: result lane `i` is `a[i]` plus the sum over `j` in `0..4` of `b[4*i + j] * c[4*LANE + j]` (`b` signed, `c` unsigned).
16863
+ ///
16864
+ /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudotq_lane_s32)
16865
+ #[inline]
16866
+ #[target_feature(enable = "neon,i8mm")] // unsafe contract: the caller must make sure the running CPU supports these features
16867
+ #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] // 32-bit Arm builds additionally enable `v8`
16868
+ #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsudot, LANE = 0))]
16869
+ #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sudot, LANE = 0))]
16870
+ #[rustc_legacy_const_generics(3)] // lets `LANE` also be written as a fourth positional argument (index 3)
16871
+ pub unsafe fn vsudotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: uint8x8_t) -> int32x4_t {
16872
+ static_assert_uimm_bits!(LANE, 1); // NOTE(review): macro defined elsewhere; presumably rejects any LANE that does not fit in 1 unsigned bit (i.e. other than 0 or 1) at compile time
16873
+ let c: uint32x2_t = transmute(c); // view the eight unsigned bytes as two 32-bit groups so one group can be selected as a unit
16874
+ let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]); // widen to four lanes, each a copy of group `LANE`
16875
+ vusdotq_s32(a, transmute(c), b) // operands swapped on purpose: the broadcast unsigned bytes fill vusdotq_s32's unsigned slot and the signed `b` its signed slot
16876
+ }
16877
+
16778
16878
/// Multiply
16779
16879
///
16780
16880
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_s8)
@@ -37823,6 +37923,94 @@ mod test {
37823
37923
assert_eq!(r, e);
37824
37924
}
37825
37925
37926
+ #[simd_test(enable = "neon,i8mm")]
37927
+ unsafe fn test_vusdot_s32() { // checks both result lanes against hand-computed u8 * i8 dot products
37928
+ let a: i32x2 = i32x2::new(1000, -4200); // accumulator lanes
37929
+ let b: u8x8 = u8x8::new(100, 205, 110, 195, 120, 185, 130, 175); // unsigned operand; several values exceed i8::MAX
37930
+ let c: i8x8 = i8x8::new(0, 1, 2, 3, -1, -2, -3, -4); // signed operand; the second 4-byte group is all negative
37931
+ let e: i32x2 = i32x2::new(2010, -5780); // 1000 + (0 + 205 + 220 + 585), -4200 + (-120 - 370 - 390 - 700)
37932
+ let r: i32x2 = transmute(vusdot_s32(transmute(a), transmute(b), transmute(c)));
37933
+ assert_eq!(r, e);
37934
+ }
37935
+
37936
+ #[simd_test(enable = "neon,i8mm")]
37937
+ unsafe fn test_vusdotq_s32() { // checks all four result lanes against hand-computed u8 * i8 dot products
37938
+ let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000); // accumulator lanes
37939
+ let b: u8x16 = u8x16::new(100, 205, 110, 195, 120, 185, 130, 175, 140, 165, 150, 155, 160, 145, 170, 135); // unsigned operand; most values exceed i8::MAX
37940
+ let c: i8x16 = i8x16::new(0, 1, 2, 3, -1, -2, -3, -4, 4, 5, 6, 7, -5, -6, -7, -8); // signed operand; 4-byte groups alternate in sign
37941
+ let e: i32x4 = i32x4::new(2010, -5780, 2370, -1940); // 1000 + 1010, -4200 - 1580, -1000 + 3370, 2000 - 3940
37942
+ let r: i32x4 = transmute(vusdotq_s32(transmute(a), transmute(b), transmute(c)));
37943
+ assert_eq!(r, e);
37944
+ }
37945
+
37946
+ #[simd_test(enable = "neon,i8mm")]
37947
+ unsafe fn test_vusdot_lane_s32() { // same inputs run with LANE = 0 and LANE = 1; only the selected group of `c` changes
37948
+ let a: i32x2 = i32x2::new(1000, -4200); // accumulator lanes
37949
+ let b: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); // unsigned operand
37950
+ let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); // group 0 = (4, 3, 2, 1), group 1 = (0, -1, -2, -3)
37951
+ let e: i32x2 = i32x2::new(2100, -2700); // group 0 on both lanes: 1000 + 1100, -4200 + 1500
37952
+ let r: i32x2 = transmute(vusdot_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
37953
+ assert_eq!(r, e);
37954
+
37955
+ let a: i32x2 = i32x2::new(1000, -4200);
37956
+ let b: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
37957
+ let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
37958
+ let e: i32x2 = i32x2::new(260, -5180); // group 1 on both lanes: 1000 - 740, -4200 - 980
37959
+ let r: i32x2 = transmute(vusdot_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
37960
+ assert_eq!(r, e);
37961
+ }
37962
+
37963
+ #[simd_test(enable = "neon,i8mm")]
37964
+ unsafe fn test_vusdotq_lane_s32() { // same inputs run with LANE = 0 and LANE = 1; only the selected group of `c` changes
37965
+ let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000); // accumulator lanes
37966
+ let b: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250); // unsigned operand
37967
+ let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); // group 0 = (4, 3, 2, 1), group 1 = (0, -1, -2, -3)
37968
+ let e: i32x4 = i32x4::new(2100, -2700, 900, 4300); // group 0 on every lane: 1000 + 1100, -4200 + 1500, -1000 + 1900, 2000 + 2300
37969
+ let r: i32x4 = transmute(vusdotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
37970
+ assert_eq!(r, e);
37971
+
37972
+ let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000);
37973
+ let b: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250);
37974
+ let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
37975
+ let e: i32x4 = i32x4::new(260, -5180, -2220, 540); // group 1 on every lane: 1000 - 740, -4200 - 980, -1000 - 1220, 2000 - 1460
37976
+ let r: i32x4 = transmute(vusdotq_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
37977
+ assert_eq!(r, e);
37978
+ }
37979
+
37980
+ #[simd_test(enable = "neon,i8mm")]
37981
+ unsafe fn test_vsudot_lane_s32() { // same inputs run with LANE = 0 and LANE = 1; here `b` is signed and the indexed `c` is unsigned
37982
+ let a: i32x2 = i32x2::new(-2000, 4200); // accumulator lanes
37983
+ let b: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3); // signed operand
37984
+ let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); // group 0 = (100, 110, 120, 130), group 1 = (140, 150, 160, 170)
37985
+ let e: i32x2 = i32x2::new(-900, 3460); // group 0 on both lanes: -2000 + 1100, 4200 - 740
37986
+ let r: i32x2 = transmute(vsudot_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
37987
+ assert_eq!(r, e);
37988
+
37989
+ let a: i32x2 = i32x2::new(-2000, 4200);
37990
+ let b: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
37991
+ let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
37992
+ let e: i32x2 = i32x2::new(-500, 3220); // group 1 on both lanes: -2000 + 1500, 4200 - 980
37993
+ let r: i32x2 = transmute(vsudot_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
37994
+ assert_eq!(r, e);
37995
+ }
37996
+
37997
+ #[simd_test(enable = "neon,i8mm")]
37998
+ unsafe fn test_vsudotq_lane_s32() { // same inputs run with LANE = 0 and LANE = 1; here `b` is signed and the indexed `c` is unsigned
37999
+ let a: i32x4 = i32x4::new(-2000, 4200, -1000, 2000); // accumulator lanes
38000
+ let b: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11); // signed operand
38001
+ let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170); // group 0 = (100, 110, 120, 130), group 1 = (140, 150, 160, 170)
38002
+ let e: i32x4 = i32x4::new(-900, 3460, -3580, -2420); // group 0 on every lane: -2000 + 1100, 4200 - 740, -1000 - 2580, 2000 - 4420
38003
+ let r: i32x4 = transmute(vsudotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
38004
+ assert_eq!(r, e);
38005
+
38006
+ let a: i32x4 = i32x4::new(-2000, 4200, -1000, 2000);
38007
+ let b: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11);
38008
+ let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
38009
+ let e: i32x4 = i32x4::new(-500, 3220, -4460, -3940); // group 1 on every lane: -2000 + 1500, 4200 - 980, -1000 - 3460, 2000 - 5940
38010
+ let r: i32x4 = transmute(vsudotq_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
38011
+ assert_eq!(r, e);
38012
+ }
38013
+
37826
38014
#[simd_test(enable = "neon")]
37827
38015
unsafe fn test_vmul_s8() {
37828
38016
let a: i8x8 = i8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
0 commit comments