@@ -10557,80 +10557,7 @@ pub unsafe fn vcmlaq_rot270_laneq_f32<const LANE: i32>(a: float32x4_t, b: float3
10557
10557
vcmlaq_rot270_f32(a, b, c)
10558
10558
}
10559
10559
10560
- /// Dot product arithmetic
10561
- ///
10562
- /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_s32)
10563
- #[inline]
10564
- #[target_feature(enable = "neon,dotprod")]
10565
- #[cfg_attr(test, assert_instr(sdot))]
10566
- pub unsafe fn vdot_s32(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t {
10567
- #[allow(improper_ctypes)]
10568
- extern "unadjusted" {
10569
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v2i32.v8i8")]
10570
- fn vdot_s32_(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t;
10571
- }
10572
- vdot_s32_(a, b, c)
10573
- }
10574
-
10575
- /// Dot product arithmetic
10576
- ///
10577
- /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_s32)
10578
- #[inline]
10579
- #[target_feature(enable = "neon,dotprod")]
10580
- #[cfg_attr(test, assert_instr(sdot))]
10581
- pub unsafe fn vdotq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t {
10582
- #[allow(improper_ctypes)]
10583
- extern "unadjusted" {
10584
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sdot.v4i32.v16i8")]
10585
- fn vdotq_s32_(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t;
10586
- }
10587
- vdotq_s32_(a, b, c)
10588
- }
10589
-
10590
- /// Dot product arithmetic
10591
- ///
10592
- /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_u32)
10593
- #[inline]
10594
- #[target_feature(enable = "neon,dotprod")]
10595
- #[cfg_attr(test, assert_instr(udot))]
10596
- pub unsafe fn vdot_u32(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t {
10597
- #[allow(improper_ctypes)]
10598
- extern "unadjusted" {
10599
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v2i32.v8i8")]
10600
- fn vdot_u32_(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t;
10601
- }
10602
- vdot_u32_(a, b, c)
10603
- }
10604
-
10605
- /// Dot product arithmetic
10606
- ///
10607
- /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_u32)
10608
- #[inline]
10609
- #[target_feature(enable = "neon,dotprod")]
10610
- #[cfg_attr(test, assert_instr(udot))]
10611
- pub unsafe fn vdotq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t {
10612
- #[allow(improper_ctypes)]
10613
- extern "unadjusted" {
10614
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.udot.v4i32.v16i8")]
10615
- fn vdotq_u32_(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t;
10616
- }
10617
- vdotq_u32_(a, b, c)
10618
- }
10619
-
10620
- /// Dot product arithmetic
10621
- ///
10622
- /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_s32)
10623
- #[inline]
10624
- #[target_feature(enable = "neon,dotprod")]
10625
- #[cfg_attr(test, assert_instr(sdot, LANE = 0))]
10626
- #[rustc_legacy_const_generics(3)]
10627
- pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x8_t) -> int32x2_t {
10628
- static_assert_uimm_bits!(LANE, 1);
10629
- let c: int8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
10630
- vdot_s32(a, b, c)
10631
- }
10632
-
10633
- /// Dot product arithmetic
10560
+ /// Dot product arithmetic (indexed)
10634
10561
///
10635
10562
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_laneq_s32)
10636
10563
#[inline]
@@ -10639,24 +10566,12 @@ pub unsafe fn vdot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x
10639
10566
#[rustc_legacy_const_generics(3)]
10640
10567
pub unsafe fn vdot_laneq_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: int8x16_t) -> int32x2_t {
10641
10568
static_assert_uimm_bits!(LANE, 2);
10642
- let c: int8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
10643
- vdot_s32(a, b, c)
10644
- }
10645
-
10646
- /// Dot product arithmetic
10647
- ///
10648
- /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_s32)
10649
- #[inline]
10650
- #[target_feature(enable = "neon,dotprod")]
10651
- #[cfg_attr(test, assert_instr(sdot, LANE = 0))]
10652
- #[rustc_legacy_const_generics(3)]
10653
- pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x8_t) -> int32x4_t {
10654
- static_assert_uimm_bits!(LANE, 1);
10655
- let c: int8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
10656
- vdotq_s32(a, b, c)
10569
+ let c: int32x4_t = transmute(c);
10570
+ let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
10571
+ vdot_s32(a, b, transmute(c))
10657
10572
}
10658
10573
10659
- /// Dot product arithmetic
10574
+ /// Dot product arithmetic (indexed)
10660
10575
///
10661
10576
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_laneq_s32)
10662
10577
#[inline]
@@ -10665,24 +10580,12 @@ pub unsafe fn vdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int
10665
10580
#[rustc_legacy_const_generics(3)]
10666
10581
pub unsafe fn vdotq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t {
10667
10582
static_assert_uimm_bits!(LANE, 2);
10668
- let c: int8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
10669
- vdotq_s32(a, b, c)
10670
- }
10671
-
10672
- /// Dot product arithmetic
10673
- ///
10674
- /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_lane_u32)
10675
- #[inline]
10676
- #[target_feature(enable = "neon,dotprod")]
10677
- #[cfg_attr(test, assert_instr(udot, LANE = 0))]
10678
- #[rustc_legacy_const_generics(3)]
10679
- pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x8_t) -> uint32x2_t {
10680
- static_assert_uimm_bits!(LANE, 1);
10681
- let c: uint8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
10682
- vdot_u32(a, b, c)
10583
+ let c: int32x4_t = transmute(c);
10584
+ let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
10585
+ vdotq_s32(a, b, transmute(c))
10683
10586
}
10684
10587
10685
- /// Dot product arithmetic
10588
+ /// Dot product arithmetic (indexed)
10686
10589
///
10687
10590
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdot_laneq_u32)
10688
10591
#[inline]
@@ -10691,24 +10594,12 @@ pub unsafe fn vdot_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uin
10691
10594
#[rustc_legacy_const_generics(3)]
10692
10595
pub unsafe fn vdot_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint8x8_t, c: uint8x16_t) -> uint32x2_t {
10693
10596
static_assert_uimm_bits!(LANE, 2);
10694
- let c: uint8x8_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
10695
- vdot_u32(a, b, c)
10696
- }
10697
-
10698
- /// Dot product arithmetic
10699
- ///
10700
- /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_lane_u32)
10701
- #[inline]
10702
- #[target_feature(enable = "neon,dotprod")]
10703
- #[cfg_attr(test, assert_instr(udot, LANE = 0))]
10704
- #[rustc_legacy_const_generics(3)]
10705
- pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x8_t) -> uint32x4_t {
10706
- static_assert_uimm_bits!(LANE, 1);
10707
- let c: uint8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
10708
- vdotq_u32(a, b, c)
10597
+ let c: uint32x4_t = transmute(c);
10598
+ let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
10599
+ vdot_u32(a, b, transmute(c))
10709
10600
}
10710
10601
10711
- /// Dot product arithmetic
10602
+ /// Dot product arithmetic (indexed)
10712
10603
///
10713
10604
/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vdotq_laneq_u32)
10714
10605
#[inline]
@@ -10717,8 +10608,9 @@ pub unsafe fn vdotq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: u
10717
10608
#[rustc_legacy_const_generics(3)]
10718
10609
pub unsafe fn vdotq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t {
10719
10610
static_assert_uimm_bits!(LANE, 2);
10720
- let c: uint8x16_t = simd_shuffle!(c, c, [4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3, 4 * LANE as u32, 4 * LANE as u32 + 1, 4 * LANE as u32 + 2, 4 * LANE as u32 + 3]);
10721
- vdotq_u32(a, b, c)
10611
+ let c: uint32x4_t = transmute(c);
10612
+ let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
10613
+ vdotq_u32(a, b, transmute(c))
10722
10614
}
10723
10615
10724
10616
/// Maximum (vector)
@@ -23759,122 +23651,42 @@ mod test {
23759
23651
assert_eq!(r, e);
23760
23652
}
23761
23653
23762
- #[simd_test(enable = "neon,dotprod")]
23763
- unsafe fn test_vdot_s32() {
23764
- let a: i32x2 = i32x2::new(1, 2);
23765
- let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
23766
- let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
23767
- let e: i32x2 = i32x2::new(31, 176);
23768
- let r: i32x2 = transmute(vdot_s32(transmute(a), transmute(b), transmute(c)));
23769
- assert_eq!(r, e);
23770
- }
23771
-
23772
- #[simd_test(enable = "neon,dotprod")]
23773
- unsafe fn test_vdotq_s32() {
23774
- let a: i32x4 = i32x4::new(1, 2, 1, 2);
23775
- let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
23776
- let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
23777
- let e: i32x4 = i32x4::new(31, 176, 31, 176);
23778
- let r: i32x4 = transmute(vdotq_s32(transmute(a), transmute(b), transmute(c)));
23779
- assert_eq!(r, e);
23780
- }
23781
-
23782
- #[simd_test(enable = "neon,dotprod")]
23783
- unsafe fn test_vdot_u32() {
23784
- let a: u32x2 = u32x2::new(1, 2);
23785
- let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
23786
- let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
23787
- let e: u32x2 = u32x2::new(31, 176);
23788
- let r: u32x2 = transmute(vdot_u32(transmute(a), transmute(b), transmute(c)));
23789
- assert_eq!(r, e);
23790
- }
23791
-
23792
- #[simd_test(enable = "neon,dotprod")]
23793
- unsafe fn test_vdotq_u32() {
23794
- let a: u32x4 = u32x4::new(1, 2, 1, 2);
23795
- let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
23796
- let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
23797
- let e: u32x4 = u32x4::new(31, 176, 31, 176);
23798
- let r: u32x4 = transmute(vdotq_u32(transmute(a), transmute(b), transmute(c)));
23799
- assert_eq!(r, e);
23800
- }
23801
-
23802
- #[simd_test(enable = "neon,dotprod")]
23803
- unsafe fn test_vdot_lane_s32() {
23804
- let a: i32x2 = i32x2::new(1, 2);
23805
- let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
23806
- let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
23807
- let e: i32x2 = i32x2::new(31, 72);
23808
- let r: i32x2 = transmute(vdot_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
23809
- assert_eq!(r, e);
23810
- }
23811
-
23812
23654
#[simd_test(enable = "neon,dotprod")]
23813
23655
unsafe fn test_vdot_laneq_s32() {
23814
23656
let a: i32x2 = i32x2::new(1, 2);
23815
- let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
23657
+ let b: i8x8 = i8x8::new(- 1, 2, 3, 4, 5, 6, 7, 8);
23816
23658
let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
23817
- let e: i32x2 = i32x2::new(31 , 72);
23659
+ let e: i32x2 = i32x2::new(29 , 72);
23818
23660
let r: i32x2 = transmute(vdot_laneq_s32::<0>(transmute(a), transmute(b), transmute(c)));
23819
23661
assert_eq!(r, e);
23820
23662
}
23821
23663
23822
- #[simd_test(enable = "neon,dotprod")]
23823
- unsafe fn test_vdotq_lane_s32() {
23824
- let a: i32x4 = i32x4::new(1, 2, 1, 2);
23825
- let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
23826
- let c: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
23827
- let e: i32x4 = i32x4::new(31, 72, 31, 72);
23828
- let r: i32x4 = transmute(vdotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
23829
- assert_eq!(r, e);
23830
- }
23831
-
23832
23664
#[simd_test(enable = "neon,dotprod")]
23833
23665
unsafe fn test_vdotq_laneq_s32() {
23834
23666
let a: i32x4 = i32x4::new(1, 2, 1, 2);
23835
- let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
23667
+ let b: i8x16 = i8x16::new(- 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
23836
23668
let c: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
23837
- let e: i32x4 = i32x4::new(31 , 72, 31, 72);
23669
+ let e: i32x4 = i32x4::new(29 , 72, 31, 72);
23838
23670
let r: i32x4 = transmute(vdotq_laneq_s32::<0>(transmute(a), transmute(b), transmute(c)));
23839
23671
assert_eq!(r, e);
23840
23672
}
23841
23673
23842
- #[simd_test(enable = "neon,dotprod")]
23843
- unsafe fn test_vdot_lane_u32() {
23844
- let a: u32x2 = u32x2::new(1, 2);
23845
- let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
23846
- let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
23847
- let e: u32x2 = u32x2::new(31, 72);
23848
- let r: u32x2 = transmute(vdot_lane_u32::<0>(transmute(a), transmute(b), transmute(c)));
23849
- assert_eq!(r, e);
23850
- }
23851
-
23852
23674
#[simd_test(enable = "neon,dotprod")]
23853
23675
unsafe fn test_vdot_laneq_u32() {
23854
23676
let a: u32x2 = u32x2::new(1, 2);
23855
- let b: u8x8 = u8x8::new(1 , 2, 3, 4, 5, 6, 7, 8);
23677
+ let b: u8x8 = u8x8::new(255 , 2, 3, 4, 5, 6, 7, 8);
23856
23678
let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
23857
- let e: u32x2 = u32x2::new(31 , 72);
23679
+ let e: u32x2 = u32x2::new(285 , 72);
23858
23680
let r: u32x2 = transmute(vdot_laneq_u32::<0>(transmute(a), transmute(b), transmute(c)));
23859
23681
assert_eq!(r, e);
23860
23682
}
23861
23683
23862
- #[simd_test(enable = "neon,dotprod")]
23863
- unsafe fn test_vdotq_lane_u32() {
23864
- let a: u32x4 = u32x4::new(1, 2, 1, 2);
23865
- let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
23866
- let c: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
23867
- let e: u32x4 = u32x4::new(31, 72, 31, 72);
23868
- let r: u32x4 = transmute(vdotq_lane_u32::<0>(transmute(a), transmute(b), transmute(c)));
23869
- assert_eq!(r, e);
23870
- }
23871
-
23872
23684
#[simd_test(enable = "neon,dotprod")]
23873
23685
unsafe fn test_vdotq_laneq_u32() {
23874
23686
let a: u32x4 = u32x4::new(1, 2, 1, 2);
23875
- let b: u8x16 = u8x16::new(1 , 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
23687
+ let b: u8x16 = u8x16::new(255 , 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
23876
23688
let c: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8);
23877
- let e: u32x4 = u32x4::new(31 , 72, 31, 72);
23689
+ let e: u32x4 = u32x4::new(285 , 72, 31, 72);
23878
23690
let r: u32x4 = transmute(vdotq_laneq_u32::<0>(transmute(a), transmute(b), transmute(c)));
23879
23691
assert_eq!(r, e);
23880
23692
}
0 commit comments