Skip to content

Commit 796bfdf

Browse files
authored
add neon instruction vfma_n_* (#1122)
1 parent 3fd6dd1 commit 796bfdf

File tree

4 files changed

+138
-2
lines changed

4 files changed

+138
-2
lines changed

crates/core_arch/src/aarch64/neon/generated.rs

+38
Original file line numberDiff line numberDiff line change
@@ -2810,6 +2810,24 @@ pub unsafe fn vfmaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float
28102810
vfmaq_f64_(a, b, c)
28112811
}
28122812

2813+
/// Floating-point fused Multiply-Add to accumulator (vector)
///
/// Scalar-multiplier (`_n`) form: `c` is placed in a one-lane `f64x1`,
/// reinterpreted as `float64x1_t`, and the call is forwarded as
/// `vfma_f64(b, d, a)`. The accompanying test expects `a + b * c`
/// (`2.0 + 6.0 * 8.0 == 50.0`).
///
/// `d` is already a `float64x1_t`, so the `transmute(d)` in the forwarding
/// call is an identity conversion; it mirrors the `transmute(d)` written in
/// the `neon.spec` entry this function is generated from.
2814+
#[inline]
2815+
#[target_feature(enable = "neon")]
2816+
#[cfg_attr(test, assert_instr(fmadd))]
2817+
pub unsafe fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t {
2818+
let d: float64x1_t = transmute(f64x1::new(c));
2819+
vfma_f64(b, transmute(d), a)
2820+
}
2821+
2822+
/// Floating-point fused Multiply-Add to accumulator (vector)
///
/// Scalar-multiplier (`_n`) form: `c` is copied into both lanes of an
/// `f64x2`, reinterpreted as `float64x2_t`, and the call is forwarded as
/// `vfmaq_f64(b, d, a)`. The accompanying test expects `a + b * c` in each
/// lane (`2.0 + 6.0 * 8.0 == 50.0`, `3.0 + 4.0 * 8.0 == 35.0`).
2823+
#[inline]
2824+
#[target_feature(enable = "neon")]
2825+
#[cfg_attr(test, assert_instr(fmla))]
2826+
pub unsafe fn vfmaq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t {
2827+
let d: float64x2_t = transmute(f64x2::new(c, c));
2828+
vfmaq_f64(b, d, a)
2829+
}
2830+
28132831
/// Divide
28142832
#[inline]
28152833
#[target_feature(enable = "neon")]
@@ -8232,6 +8250,26 @@ mod test {
82328250
assert_eq!(r, e);
82338251
}
82348252

8253+
// Checks `vfma_n_f64` on a single lane: expected `a + b * c`,
// i.e. 2.0 + 6.0 * 8.0 == 50.0.
// `c` is already an `f64`, so `transmute(c)` is an identity conversion here.
#[simd_test(enable = "neon")]
8254+
unsafe fn test_vfma_n_f64() {
8255+
let a: f64 = 2.0;
8256+
let b: f64 = 6.0;
8257+
let c: f64 = 8.0;
8258+
let e: f64 = 50.0;
8259+
let r: f64 = transmute(vfma_n_f64(transmute(a), transmute(b), transmute(c)));
8260+
assert_eq!(r, e);
8261+
}
8262+
8263+
// Checks `vfmaq_n_f64` lane by lane: expected `a[i] + b[i] * c`,
// i.e. (2.0 + 6.0 * 8.0, 3.0 + 4.0 * 8.0) == (50.0, 35.0).
// `c` is already an `f64`, so `transmute(c)` is an identity conversion here.
#[simd_test(enable = "neon")]
8264+
unsafe fn test_vfmaq_n_f64() {
8265+
let a: f64x2 = f64x2::new(2.0, 3.0);
8266+
let b: f64x2 = f64x2::new(6.0, 4.0);
8267+
let c: f64 = 8.0;
8268+
let e: f64x2 = f64x2::new(50.0, 35.0);
8269+
let r: f64x2 = transmute(vfmaq_n_f64(transmute(a), transmute(b), transmute(c)));
8270+
assert_eq!(r, e);
8271+
}
8272+
82358273
#[simd_test(enable = "neon")]
82368274
unsafe fn test_vdiv_f32() {
82378275
let a: f32x2 = f32x2::new(2.0, 6.0);

crates/core_arch/src/arm_shared/neon/generated.rs

+42
Original file line numberDiff line numberDiff line change
@@ -4738,6 +4738,28 @@ pub unsafe fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float
47384738
vfmaq_f32_(a, b, c)
47394739
}
47404740

4741+
/// Floating-point fused Multiply-Add to accumulator (vector)
///
/// Scalar-multiplier (`_n`) form: `c` is copied into both lanes of an
/// `f32x2`, reinterpreted as `float32x2_t`, and the call is forwarded as
/// `vfma_f32(b, d, a)`. The accompanying test expects `a + b * c` in each
/// lane (`2.0 + 6.0 * 8.0 == 50.0`, `3.0 + 4.0 * 8.0 == 35.0`).
///
/// When `target_arch = "arm"`, the `fp-armv8` and `v8` target features are
/// enabled in addition to `neon`. The test-only `assert_instr` attributes
/// name `vfma` (arm) and `fmla` (aarch64) as the expected instruction.
4742+
#[inline]
4743+
#[target_feature(enable = "neon")]
4744+
#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
4745+
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
4746+
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
4747+
pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
4748+
let d: float32x2_t = transmute(f32x2::new(c, c));
4749+
vfma_f32(b, d, a)
4750+
}
4751+
4752+
/// Floating-point fused Multiply-Add to accumulator (vector)
///
/// Scalar-multiplier (`_n`) form: `c` is copied into all four lanes of an
/// `f32x4`, reinterpreted as `float32x4_t`, and the call is forwarded as
/// `vfmaq_f32(b, d, a)`. The accompanying test expects `a + b * c` in each
/// lane (e.g. `2.0 + 6.0 * 8.0 == 50.0`).
///
/// When `target_arch = "arm"`, the `fp-armv8` and `v8` target features are
/// enabled in addition to `neon`. The test-only `assert_instr` attributes
/// name `vfma` (arm) and `fmla` (aarch64) as the expected instruction.
4753+
#[inline]
4754+
#[target_feature(enable = "neon")]
4755+
#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
4756+
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
4757+
#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
4758+
pub unsafe fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
4759+
let d: float32x4_t = transmute(f32x4::new(c, c, c, c));
4760+
vfmaq_f32(b, d, a)
4761+
}
4762+
47414763
/// Subtract
47424764
#[inline]
47434765
#[target_feature(enable = "neon")]
@@ -13974,6 +13996,26 @@ mod test {
1397413996
assert_eq!(r, e);
1397513997
}
1397613998

13999+
// Checks `vfma_n_f32` lane by lane: expected `a[i] + b[i] * c`,
// i.e. (2.0 + 6.0 * 8.0, 3.0 + 4.0 * 8.0) == (50.0, 35.0).
// `c` is already an `f32`, so `transmute(c)` is an identity conversion here.
#[simd_test(enable = "neon")]
14000+
unsafe fn test_vfma_n_f32() {
14001+
let a: f32x2 = f32x2::new(2.0, 3.0);
14002+
let b: f32x2 = f32x2::new(6.0, 4.0);
14003+
let c: f32 = 8.0;
14004+
let e: f32x2 = f32x2::new(50.0, 35.0);
14005+
let r: f32x2 = transmute(vfma_n_f32(transmute(a), transmute(b), transmute(c)));
14006+
assert_eq!(r, e);
14007+
}
14008+
14009+
// Checks `vfmaq_n_f32` lane by lane: expected `a[i] + b[i] * c`,
// i.e. (2 + 6*8, 3 + 4*8, 4 + 7*8, 5 + 8*8) == (50.0, 35.0, 60.0, 69.0).
// `c` is already an `f32`, so `transmute(c)` is an identity conversion here.
#[simd_test(enable = "neon")]
14010+
unsafe fn test_vfmaq_n_f32() {
14011+
let a: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
14012+
let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
14013+
let c: f32 = 8.0;
14014+
let e: f32x4 = f32x4::new(50.0, 35.0, 60.0, 69.0);
14015+
let r: f32x4 = transmute(vfmaq_n_f32(transmute(a), transmute(b), transmute(c)));
14016+
assert_eq!(r, e);
14017+
}
14018+
1397714019
#[simd_test(enable = "neon")]
1397814020
unsafe fn test_vsub_s8() {
1397914021
let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);

crates/stdarch-gen/neon.spec

+56
Original file line numberDiff line numberDiff line change
@@ -1566,6 +1566,62 @@ link-arm = llvm.fma._EXT_
15661566
link-aarch64 = llvm.fma._EXT_
15671567
generate float*_t
15681568

1569+
/// Floating-point fused Multiply-Add to accumulator (vector)
1570+
name = vfma
1571+
n-suffix
1572+
multi_fn = transmute, d:in_t, {f64x1::new, c}
1573+
multi_fn = vfma-self-noext, b, transmute(d), a
1574+
a = 2.0, 3.0, 4.0, 5.0
1575+
b = 6.0, 4.0, 7.0, 8.0
1576+
c = 8.0
1577+
validate 50.0, 35.0, 60.0, 69.0
1578+
1579+
aarch64 = fmadd
1580+
generate float64x1_t:float64x1_t:f64:float64x1_t
1581+
1582+
/// Floating-point fused Multiply-Add to accumulator (vector)
1583+
name = vfma
1584+
n-suffix
1585+
multi_fn = transmute, d:in_t, {f64x2::new, c, c}
1586+
multi_fn = vfma-self-noext, b, d, a
1587+
a = 2.0, 3.0, 4.0, 5.0
1588+
b = 6.0, 4.0, 7.0, 8.0
1589+
c = 8.0
1590+
validate 50.0, 35.0, 60.0, 69.0
1591+
1592+
aarch64 = fmla
1593+
generate float64x2_t:float64x2_t:f64:float64x2_t
1594+
1595+
/// Floating-point fused Multiply-Add to accumulator (vector)
1596+
name = vfma
1597+
n-suffix
1598+
multi_fn = transmute, d:in_t, {f32x2::new, c, c}
1599+
multi_fn = vfma-self-noext, b, d, a
1600+
a = 2.0, 3.0, 4.0, 5.0
1601+
b = 6.0, 4.0, 7.0, 8.0
1602+
c = 8.0
1603+
validate 50.0, 35.0, 60.0, 69.0
1604+
1605+
target = fp-armv8
1606+
arm = vfma
1607+
aarch64 = fmla
1608+
generate float32x2_t:float32x2_t:f32:float32x2_t
1609+
1610+
/// Floating-point fused Multiply-Add to accumulator (vector)
1611+
name = vfma
1612+
n-suffix
1613+
multi_fn = transmute, d:in_t, {f32x4::new, c, c, c, c}
1614+
multi_fn = vfma-self-noext, b, d, a
1615+
a = 2.0, 3.0, 4.0, 5.0
1616+
b = 6.0, 4.0, 7.0, 8.0
1617+
c = 8.0
1618+
validate 50.0, 35.0, 60.0, 69.0
1619+
1620+
target = fp-armv8
1621+
arm = vfma
1622+
aarch64 = fmla
1623+
generate float32x4_t:float32x4_t:f32:float32x4_t
1624+
15691625
/// Divide
15701626
name = vdiv
15711627
fn = simd_div

crates/stdarch-test/src/lib.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,9 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
120120
// Intrinsics using `cvtpi2ps` are typically "composites" and
121121
// in some cases exceed the limit.
122122
"cvtpi2ps" => 25,
123-
124123
// core_arch/src/arm_shared/simd32
125-
"usad8" => 27,
124+
// `vfma` limit raised as well: vfmaq_n_f32_vfma reported #instructions = 26 >= 22 (limit)
125+
"usad8" | "vfma" => 27,
126126
"qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29,
127127

128128
// Original limit was 20 instructions, but ARM DSP Intrinsics

0 commit comments

Comments
 (0)