
Commit cb4fdae

jacobbramley authored and Amanieu committed
Add support for AArch64 i8mm *dot intrinsics.
This includes vsudot and vusdot, which perform mixed-signedness dot product operations.
1 parent 536212d commit cb4fdae
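For orientation: USDOT accumulates, into each 32-bit lane of the destination, the dot product of four unsigned bytes from one source with four signed bytes from the other; SUDOT is the same operation with the signedness of the two byte sources swapped. The plain-Rust sketch below is only an illustration of that semantics (the helper name usdot_ref is hypothetical, not part of the commit); it reproduces the values used in test_vusdot_s32 further down.

// Scalar model of the USDOT operation on 64-bit vectors: two i32 accumulator
// lanes, each updated with the dot product of four u8 x i8 pairs.
fn usdot_ref(acc: [i32; 2], u: [u8; 8], s: [i8; 8]) -> [i32; 2] {
    let mut out = acc;
    for lane in 0..2 {
        for i in 0..4 {
            out[lane] += u[4 * lane + i] as i32 * s[4 * lane + i] as i32;
        }
    }
    out
}

fn main() {
    // Inputs and expected result taken from test_vusdot_s32 in this commit.
    let a = [1000, -4200];
    let b: [u8; 8] = [100, 205, 110, 195, 120, 185, 130, 175];
    let c: [i8; 8] = [0, 1, 2, 3, -1, -2, -3, -4];
    assert_eq!(usdot_ref(a, b, c), [2010, -5780]);
}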

6 files changed: 427 additions & 33 deletions


crates/core_arch/src/aarch64/neon/generated.rs

Lines changed: 96 additions & 0 deletions
@@ -8353,6 +8353,62 @@ pub unsafe fn vst4q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2x4_t) {
     vst4q_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
 }

+/// Dot product index form with unsigned and signed integers
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_laneq_s32)
+#[inline]
+#[target_feature(enable = "neon,i8mm")]
+#[cfg_attr(test, assert_instr(usdot, LANE = 3))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vusdot_laneq_s32<const LANE: i32>(a: int32x2_t, b: uint8x8_t, c: int8x16_t) -> int32x2_t {
+    static_assert_uimm_bits!(LANE, 2);
+    let c: int32x4_t = transmute(c);
+    let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
+    vusdot_s32(a, b, transmute(c))
+}
+
+/// Dot product index form with unsigned and signed integers
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_laneq_s32)
+#[inline]
+#[target_feature(enable = "neon,i8mm")]
+#[cfg_attr(test, assert_instr(usdot, LANE = 3))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vusdotq_laneq_s32<const LANE: i32>(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t {
+    static_assert_uimm_bits!(LANE, 2);
+    let c: int32x4_t = transmute(c);
+    let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    vusdotq_s32(a, b, transmute(c))
+}
+
+/// Dot product index form with signed and unsigned integers
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudot_laneq_s32)
+#[inline]
+#[target_feature(enable = "neon,i8mm")]
+#[cfg_attr(test, assert_instr(sudot, LANE = 3))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vsudot_laneq_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: uint8x16_t) -> int32x2_t {
+    static_assert_uimm_bits!(LANE, 2);
+    let c: uint32x4_t = transmute(c);
+    let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
+    vusdot_s32(a, transmute(c), b)
+}
+
+/// Dot product index form with signed and unsigned integers
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudotq_laneq_s32)
+#[inline]
+#[target_feature(enable = "neon,i8mm")]
+#[cfg_attr(test, assert_instr(sudot, LANE = 3))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vsudotq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: uint8x16_t) -> int32x4_t {
+    static_assert_uimm_bits!(LANE, 2);
+    let c: uint32x4_t = transmute(c);
+    let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    vusdotq_s32(a, transmute(c), b)
+}
+
 /// Multiply
 ///
 /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_f64)
@@ -22184,6 +22240,46 @@ mod test {
         assert_eq!(r, e);
     }

+    #[simd_test(enable = "neon,i8mm")]
+    unsafe fn test_vusdot_laneq_s32() {
+        let a: i32x2 = i32x2::new(1000, -4200);
+        let b: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
+        let c: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11);
+        let e: i32x2 = i32x2::new(-3420, -10140);
+        let r: i32x2 = transmute(vusdot_laneq_s32::<3>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,i8mm")]
+    unsafe fn test_vusdotq_laneq_s32() {
+        let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000);
+        let b: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250);
+        let c: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11);
+        let e: i32x4 = i32x4::new(-3420, -10140, -8460, -6980);
+        let r: i32x4 = transmute(vusdotq_laneq_s32::<3>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,i8mm")]
+    unsafe fn test_vsudot_laneq_s32() {
+        let a: i32x2 = i32x2::new(-2000, 4200);
+        let b: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
+        let c: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250);
+        let e: i32x2 = i32x2::new(300, 2740);
+        let r: i32x2 = transmute(vsudot_laneq_s32::<3>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,i8mm")]
+    unsafe fn test_vsudotq_laneq_s32() {
+        let a: i32x4 = i32x4::new(-2000, 4200, -1000, 2000);
+        let b: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11);
+        let c: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250);
+        let e: i32x4 = i32x4::new(300, 2740, -6220, -6980);
+        let r: i32x4 = transmute(vsudotq_laneq_s32::<3>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vmul_f64() {
         let a: f64 = 1.0;
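The _laneq_ forms above reinterpret the 16-byte c operand as four 32-bit lanes, use simd_shuffle! to broadcast lane LANE (bounded to 0..=3 by static_assert_uimm_bits!(LANE, 2)), and then defer to the plain vector form, so every accumulator lane is dotted against the same group of four bytes. Below is a plain-Rust model of that behaviour, checked against the LANE = 3 values from test_vusdot_laneq_s32; the helper name usdot_laneq_ref is illustrative, not from the commit.

// Scalar model of vusdot_laneq_s32: group `lane` of the signed bytes is
// broadcast and dotted against each group of four unsigned bytes.
fn usdot_laneq_ref(acc: [i32; 2], u: [u8; 8], s: [i8; 16], lane: usize) -> [i32; 2] {
    let group = &s[4 * lane..4 * lane + 4]; // what the transmute + simd_shuffle! selects
    let mut out = acc;
    for l in 0..2 {
        for i in 0..4 {
            out[l] += u[4 * l + i] as i32 * group[i] as i32;
        }
    }
    out
}

fn main() {
    // Inputs and expected result taken from test_vusdot_laneq_s32 (LANE = 3).
    let a = [1000, -4200];
    let b: [u8; 8] = [100, 110, 120, 130, 140, 150, 160, 170];
    let c: [i8; 16] = [4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11];
    assert_eq!(usdot_laneq_ref(a, b, c, 3), [-3420, -10140]);
}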

crates/core_arch/src/arm_shared/neon/generated.rs

Lines changed: 188 additions & 0 deletions
@@ -16775,6 +16775,106 @@ pub unsafe fn vst4q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x4_t) {
     vst4q_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a as _)
 }

+/// Dot product vector form with unsigned and signed integers
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_s32)
+#[inline]
+#[target_feature(enable = "neon,i8mm")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot))]
+pub unsafe fn vusdot_s32(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usdot.v2i32.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.usdot.v2i32.v8i8")]
+        fn vusdot_s32_(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t;
+    }
+    vusdot_s32_(a, b, c)
+}
+
+/// Dot product vector form with unsigned and signed integers
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_s32)
+#[inline]
+#[target_feature(enable = "neon,i8mm")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot))]
+pub unsafe fn vusdotq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usdot.v4i32.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.usdot.v4i32.v16i8")]
+        fn vusdotq_s32_(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t;
+    }
+    vusdotq_s32_(a, b, c)
+}
+
+/// Dot product index form with unsigned and signed integers
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdot_lane_s32)
+#[inline]
+#[target_feature(enable = "neon,i8mm")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vusdot_lane_s32<const LANE: i32>(a: int32x2_t, b: uint8x8_t, c: int8x8_t) -> int32x2_t {
+    static_assert_uimm_bits!(LANE, 1);
+    let c: int32x2_t = transmute(c);
+    let c: int32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
+    vusdot_s32(a, b, transmute(c))
+}
+
+/// Dot product index form with unsigned and signed integers
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vusdotq_lane_s32)
+#[inline]
+#[target_feature(enable = "neon,i8mm")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusdot, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usdot, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vusdotq_lane_s32<const LANE: i32>(a: int32x4_t, b: uint8x16_t, c: int8x8_t) -> int32x4_t {
+    static_assert_uimm_bits!(LANE, 1);
+    let c: int32x2_t = transmute(c);
+    let c: int32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    vusdotq_s32(a, b, transmute(c))
+}
+
+/// Dot product index form with signed and unsigned integers
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudot_lane_s32)
+#[inline]
+#[target_feature(enable = "neon,i8mm")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsudot, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sudot, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vsudot_lane_s32<const LANE: i32>(a: int32x2_t, b: int8x8_t, c: uint8x8_t) -> int32x2_t {
+    static_assert_uimm_bits!(LANE, 1);
+    let c: uint32x2_t = transmute(c);
+    let c: uint32x2_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32]);
+    vusdot_s32(a, transmute(c), b)
+}
+
+/// Dot product index form with signed and unsigned integers
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsudotq_lane_s32)
+#[inline]
+#[target_feature(enable = "neon,i8mm")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsudot, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sudot, LANE = 0))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vsudotq_lane_s32<const LANE: i32>(a: int32x4_t, b: int8x16_t, c: uint8x8_t) -> int32x4_t {
+    static_assert_uimm_bits!(LANE, 1);
+    let c: uint32x2_t = transmute(c);
+    let c: uint32x4_t = simd_shuffle!(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]);
+    vusdotq_s32(a, transmute(c), b)
+}
+
 /// Multiply
 ///
 /// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmul_s8)
@@ -37823,6 +37923,94 @@ mod test {
         assert_eq!(r, e);
     }

+    #[simd_test(enable = "neon,i8mm")]
+    unsafe fn test_vusdot_s32() {
+        let a: i32x2 = i32x2::new(1000, -4200);
+        let b: u8x8 = u8x8::new(100, 205, 110, 195, 120, 185, 130, 175);
+        let c: i8x8 = i8x8::new(0, 1, 2, 3, -1, -2, -3, -4);
+        let e: i32x2 = i32x2::new(2010, -5780);
+        let r: i32x2 = transmute(vusdot_s32(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,i8mm")]
+    unsafe fn test_vusdotq_s32() {
+        let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000);
+        let b: u8x16 = u8x16::new(100, 205, 110, 195, 120, 185, 130, 175, 140, 165, 150, 155, 160, 145, 170, 135);
+        let c: i8x16 = i8x16::new(0, 1, 2, 3, -1, -2, -3, -4, 4, 5, 6, 7, -5, -6, -7, -8);
+        let e: i32x4 = i32x4::new(2010, -5780, 2370, -1940);
+        let r: i32x4 = transmute(vusdotq_s32(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,i8mm")]
+    unsafe fn test_vusdot_lane_s32() {
+        let a: i32x2 = i32x2::new(1000, -4200);
+        let b: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
+        let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
+        let e: i32x2 = i32x2::new(2100, -2700);
+        let r: i32x2 = transmute(vusdot_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+
+        let a: i32x2 = i32x2::new(1000, -4200);
+        let b: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
+        let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
+        let e: i32x2 = i32x2::new(260, -5180);
+        let r: i32x2 = transmute(vusdot_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,i8mm")]
+    unsafe fn test_vusdotq_lane_s32() {
+        let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000);
+        let b: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250);
+        let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
+        let e: i32x4 = i32x4::new(2100, -2700, 900, 4300);
+        let r: i32x4 = transmute(vusdotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+
+        let a: i32x4 = i32x4::new(1000, -4200, -1000, 2000);
+        let b: u8x16 = u8x16::new(100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250);
+        let c: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
+        let e: i32x4 = i32x4::new(260, -5180, -2220, 540);
+        let r: i32x4 = transmute(vusdotq_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,i8mm")]
+    unsafe fn test_vsudot_lane_s32() {
+        let a: i32x2 = i32x2::new(-2000, 4200);
+        let b: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
+        let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
+        let e: i32x2 = i32x2::new(-900, 3460);
+        let r: i32x2 = transmute(vsudot_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+
+        let a: i32x2 = i32x2::new(-2000, 4200);
+        let b: i8x8 = i8x8::new(4, 3, 2, 1, 0, -1, -2, -3);
+        let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
+        let e: i32x2 = i32x2::new(-500, 3220);
+        let r: i32x2 = transmute(vsudot_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,i8mm")]
+    unsafe fn test_vsudotq_lane_s32() {
+        let a: i32x4 = i32x4::new(-2000, 4200, -1000, 2000);
+        let b: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11);
+        let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
+        let e: i32x4 = i32x4::new(-900, 3460, -3580, -2420);
+        let r: i32x4 = transmute(vsudotq_lane_s32::<0>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+
+        let a: i32x4 = i32x4::new(-2000, 4200, -1000, 2000);
+        let b: i8x16 = i8x16::new(4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11);
+        let c: u8x8 = u8x8::new(100, 110, 120, 130, 140, 150, 160, 170);
+        let e: i32x4 = i32x4::new(-500, 3220, -4460, -3940);
+        let r: i32x4 = transmute(vsudotq_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vmul_s8() {
         let a: i8x8 = i8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
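Note that the vsudot* forms above are lowered onto the vusdot* kernel with the byte operands swapped (vusdot_s32(a, transmute(c), b)); this is sound because each u8 × i8 product has the same value whichever operand the instruction treats as unsigned. Below is a small plain-Rust check of that identity against the LANE = 0 values from test_vsudot_lane_s32; the helper names are illustrative, not part of the commit.

// Same scalar USDOT model as sketched earlier on this page.
fn usdot_ref(acc: [i32; 2], u: [u8; 8], s: [i8; 8]) -> [i32; 2] {
    let mut out = acc;
    for lane in 0..2 {
        for i in 0..4 {
            out[lane] += u[4 * lane + i] as i32 * s[4 * lane + i] as i32;
        }
    }
    out
}

fn main() {
    // Inputs from test_vsudot_lane_s32 with LANE = 0: lane 0 of c, i.e. bytes
    // [100, 110, 120, 130], is broadcast to both four-byte groups.
    let a = [-2000, 4200];
    let b: [i8; 8] = [4, 3, 2, 1, 0, -1, -2, -3];
    let c_broadcast: [u8; 8] = [100, 110, 120, 130, 100, 110, 120, 130];
    // sudot(a, b, c) is computed as usdot(a, c, b): the unsigned bytes go first.
    assert_eq!(usdot_ref(a, c_broadcast, b), [-900, 3460]);
}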

crates/intrinsic-test/missing_aarch64.txt

Lines changed: 0 additions & 10 deletions
@@ -12,16 +12,6 @@ vbfmlaltq_f32
 vbfmlaltq_lane_f32
 vbfmlaltq_laneq_f32
 vbfmmlaq_f32
-vsudot_laneq_s32
-vsudot_lane_s32
-vsudotq_laneq_s32
-vsudotq_lane_s32
-vusdot_laneq_s32
-vusdot_lane_s32
-vusdotq_laneq_s32
-vusdotq_lane_s32
-vusdotq_s32
-vusdot_s32


 # Missing from both Clang and stdarch

crates/intrinsic-test/missing_arm.txt

Lines changed: 0 additions & 10 deletions
@@ -12,16 +12,6 @@ vbfmlaltq_f32
 vbfmlaltq_lane_f32
 vbfmlaltq_laneq_f32
 vbfmmlaq_f32
-vsudot_laneq_s32
-vsudot_lane_s32
-vsudotq_laneq_s32
-vsudotq_lane_s32
-vusdot_laneq_s32
-vusdot_lane_s32
-vusdotq_laneq_s32
-vusdotq_lane_s32
-vusdotq_s32
-vusdot_s32

 # Implemented in Clang and stdarch for A64 only even though CSV claims A32 support
 __crc32d
