add neon instruction vfma_n_* (#1122)

surechen · web-flow · commit 796bfdf9097a · 2021-04-17T17:45:54.000+01:00
diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -2810,6 +2810,24 @@ pub unsafe fn vfmaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float
     vfmaq_f64_(a, b, c)
 }
 
+/// Floating-point fused Multiply-Add to accumulator(vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmadd))]
+pub unsafe fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t {
+    let d: float64x1_t = transmute(f64x1::new(c));
+    vfma_f64(b, transmute(d), a)
+}
+
+/// Floating-point fused Multiply-Add to accumulator(vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(fmla))]
+pub unsafe fn vfmaq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t {
+    let d: float64x2_t = transmute(f64x2::new(c, c));
+    vfmaq_f64(b, d, a)
+}
+
 /// Divide
 #[inline]
 #[target_feature(enable = "neon")]
@@ -8232,6 +8250,26 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfma_n_f64() {
+        let a: f64 = 2.0;
+        let b: f64 = 6.0;
+        let c: f64 = 8.0;
+        let e: f64 = 50.0;
+        let r: f64 = transmute(vfma_n_f64(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmaq_n_f64() {
+        let a: f64x2 = f64x2::new(2.0, 3.0);
+        let b: f64x2 = f64x2::new(6.0, 4.0);
+        let c: f64 = 8.0;
+        let e: f64x2 = f64x2::new(50.0, 35.0);
+        let r: f64x2 = transmute(vfmaq_n_f64(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vdiv_f32() {
         let a: f32x2 = f32x2::new(2.0, 6.0);
diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -4738,6 +4738,28 @@ pub unsafe fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float
 vfmaq_f32_(a, b, c)
 }
 
+/// Floating-point fused Multiply-Add to accumulator(vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
+pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
+    let d: float32x2_t = transmute(f32x2::new(c, c));
+    vfma_f32(b, d, a)
+}
+
+/// Floating-point fused Multiply-Add to accumulator(vector)
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
+pub unsafe fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
+    let d: float32x4_t = transmute(f32x4::new(c, c, c, c));
+    vfmaq_f32(b, d, a)
+}
+
 /// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
@@ -13974,6 +13996,26 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfma_n_f32() {
+        let a: f32x2 = f32x2::new(2.0, 3.0);
+        let b: f32x2 = f32x2::new(6.0, 4.0);
+        let c: f32 = 8.0;
+        let e: f32x2 = f32x2::new(50.0, 35.0);
+        let r: f32x2 = transmute(vfma_n_f32(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vfmaq_n_f32() {
+        let a: f32x4 = f32x4::new(2.0, 3.0, 4.0, 5.0);
+        let b: f32x4 = f32x4::new(6.0, 4.0, 7.0, 8.0);
+        let c: f32 = 8.0;
+        let e: f32x4 = f32x4::new(50.0, 35.0, 60.0, 69.0);
+        let r: f32x4 = transmute(vfmaq_n_f32(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vsub_s8() {
         let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec
@@ -1566,6 +1566,62 @@ link-arm = llvm.fma._EXT_
 link-aarch64 = llvm.fma._EXT_
 generate float*_t
 
+/// Floating-point fused Multiply-Add to accumulator(vector)
+name = vfma
+n-suffix
+multi_fn = transmute, d:in_t, {f64x1::new, c}
+multi_fn = vfma-self-noext, b, transmute(d), a
+a = 2.0, 3.0, 4.0, 5.0
+b = 6.0, 4.0, 7.0, 8.0
+c = 8.0
+validate 50.0, 35.0, 60.0, 69.0
+
+aarch64 = fmadd
+generate float64x1_t:float64x1_t:f64:float64x1_t
+
+/// Floating-point fused Multiply-Add to accumulator(vector)
+name = vfma
+n-suffix
+multi_fn = transmute, d:in_t, {f64x2::new, c, c}
+multi_fn = vfma-self-noext, b, d, a
+a = 2.0, 3.0, 4.0, 5.0
+b = 6.0, 4.0, 7.0, 8.0
+c = 8.0
+validate 50.0, 35.0, 60.0, 69.0
+
+aarch64 = fmla
+generate float64x2_t:float64x2_t:f64:float64x2_t
+
+/// Floating-point fused Multiply-Add to accumulator(vector)
+name = vfma
+n-suffix
+multi_fn = transmute, d:in_t, {f32x2::new, c, c}
+multi_fn = vfma-self-noext, b, d, a
+a = 2.0, 3.0, 4.0, 5.0
+b = 6.0, 4.0, 7.0, 8.0
+c = 8.0
+validate 50.0, 35.0, 60.0, 69.0
+
+target = fp-armv8
+arm = vfma
+aarch64 = fmla
+generate float32x2_t:float32x2_t:f32:float32x2_t
+
+/// Floating-point fused Multiply-Add to accumulator(vector)
+name = vfma
+n-suffix
+multi_fn = transmute, d:in_t, {f32x4::new, c, c, c, c}
+multi_fn = vfma-self-noext, b, d, a
+a = 2.0, 3.0, 4.0, 5.0
+b = 6.0, 4.0, 7.0, 8.0
+c = 8.0
+validate 50.0, 35.0, 60.0, 69.0
+
+target = fp-armv8
+arm = vfma
+aarch64 = fmla
+generate float32x4_t:float32x4_t:f32:float32x4_t
+
 /// Divide
 name = vdiv
 fn = simd_div
diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs
@@ -120,9 +120,9 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
                 // Intrinsics using `cvtpi2ps` are typically "composites" and
                 // in some cases exceed the limit.
                 "cvtpi2ps" => 25,
-
                 // core_arch/src/arm_shared/simd32
-                "usad8" => 27,
+                // vfmaq_n_f32_vfma : #instructions = 26 >= 22 (limit)
+                "usad8" | "vfma" => 27,
                 "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29,
 
                 // Original limit was 20 instructions, but ARM DSP Intrinsics