ARM DSP: Halving parallel add/sub and multiply add/sub (rust-lang#535)

paoloteti · alexcrichton · commit 276040924d48 · 2018-07-23T16:32:05.000-05:00
* ARM DSP: Add signed halving parallel sub.

Add:

- `shsub8`: Signed halving parallel byte-wise subtraction.
- `shsub16`: Signed halving parallel halfword-wise subtraction.

* ARM DSP: Signed halving parallel additions.

- `shadd8`: Signed halving parallel byte-wise add.
- `shadd16`: Signed halving parallel halfword-wise add.

* ARM DSP: Signed Dual Multiply Add and Signed Dual Multiply Sub.

- `SMUAD`: Signed Dual Multiply Add.
- `SMUADX`: Signed Dual Multiply Add Reversed.
- `SMUSD`: Signed Dual Multiply Subtract.
- `SMUSDX`: Signed Dual Multiply Subtract Reversed.

* ARM DSP: Restrict to Cortex-A and Cortex-R

Restrict everything to Cortex-A/R till We found a better way manage
thumb* targets.

Add 'dox' to generate docs.

* ARM DSP: fix Markdown documentation

Quote '[' and ']' where are not part of the Markdown syntax.
diff --git a/coresimd/arm/dsp.rs b/coresimd/arm/dsp.rs
@@ -54,8 +54,32 @@ extern "C" {
     #[link_name = "llvm.arm.sasx"]
     fn arm_sasx(a: i32, b: i32) -> i32;
 
-    #[cfg_attr(not(target_feature = "mclass"), link_name = "llvm.arm.sel")]
+    #[link_name = "llvm.arm.sel"]
     fn arm_sel(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shadd8"]
+    fn arm_shadd8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shadd16"]
+    fn arm_shadd16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shsub8"]
+    fn arm_shsub8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shsub16"]
+    fn arm_shsub16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smuad"]
+    fn arm_smuad(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smuadx"]
+    fn arm_smuadx(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smusd"]
+    fn arm_smusd(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smusdx"]
+    fn arm_smusdx(a: i32, b: i32) -> i32;
 }
 
 /// Signed saturating addition
@@ -201,6 +225,109 @@ pub unsafe fn sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
     dsp_call!(arm_sel, a, b)
 }
 
+/// Signed halving parallel byte-wise addition.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] + b\[0\]) / 2
+/// res\[1\] = (a\[1\] + b\[1\]) / 2
+/// res\[2\] = (a\[2\] + b\[2\]) / 2
+/// res\[3\] = (a\[3\] + b\[3\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shadd8))]
+pub unsafe fn shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_shadd8, a, b)
+}
+
+/// Signed halving parallel halfword-wise addition.
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] + b\[0\]) / 2
+/// res\[1\] = (a\[1\] + b\[1\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shadd16))]
+pub unsafe fn shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_shadd16, a, b)
+}
+
+/// Signed halving parallel byte-wise subtraction.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] - b\[0\]) / 2
+/// res\[1\] = (a\[1\] - b\[1\]) / 2
+/// res\[2\] = (a\[2\] - b\[2\]) / 2
+/// res\[3\] = (a\[3\] - b\[3\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shsub8))]
+pub unsafe fn shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_shsub8, a, b)
+}
+
+/// Signed halving parallel halfword-wise subtraction.
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] - b\[0\]) / 2
+/// res\[1\] = (a\[1\] - b\[1\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shsub16))]
+pub unsafe fn shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_shsub16, a, b)
+}
+
+/// Signed Dual Multiply Add.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[cfg_attr(test, assert_instr(smuad))]
+pub unsafe fn smuad(a: int16x2_t, b: int16x2_t) -> i32 {
+    arm_smuad(::mem::transmute(a), ::mem::transmute(b))
+}
+
+/// Signed Dual Multiply Add Reversed.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[1\] + a\[1\] * b\[0\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smuadx))]
+pub unsafe fn smuadx(a: int16x2_t, b: int16x2_t) -> i32 {
+    arm_smuadx(::mem::transmute(a), ::mem::transmute(b))
+}
+
+/// Signed Dual Multiply Subtract.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smusd))]
+pub unsafe fn smusd(a: int16x2_t, b: int16x2_t) -> i32 {
+    arm_smusd(::mem::transmute(a), ::mem::transmute(b))
+}
+
+/// Signed Dual Multiply Subtract Reversed.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[1\] - a\[1\] * b\[0\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smusdx))]
+pub unsafe fn smusdx(a: int16x2_t, b: int16x2_t) -> i32 {
+    arm_smusdx(::mem::transmute(a), ::mem::transmute(b))
+}
+
 #[cfg(test)]
 mod tests {
     use coresimd::arm::*;
@@ -337,4 +464,88 @@ mod tests {
             assert_eq!(r, c);
         }
     }
+
+    #[test]
+    fn shadd8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, 4);
+            let b = i8x4::new(5, 4, 3, 2);
+            let c = i8x4::new(3, 3, 3, 3);
+            let r: i8x4 = dsp_call!(dsp::shadd8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn shadd16() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let c = i16x2::new(3, 3);
+            let r: i16x2 = dsp_call!(dsp::shadd16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn shsub8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, 4);
+            let b = i8x4::new(5, 4, 3, 2);
+            let c = i8x4::new(-2, -1, 0, 1);
+            let r: i8x4 = dsp_call!(dsp::shsub8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn shsub16() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let c = i16x2::new(-2, -1);
+            let r: i16x2 = dsp_call!(dsp::shsub16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn smuad() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let r = dsp::smuad(::mem::transmute(a), ::mem::transmute(b));
+            assert_eq!(r, 13);
+        }
+    }
+
+    #[test]
+    fn smuadx() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let r = dsp::smuadx(::mem::transmute(a), ::mem::transmute(b));
+            assert_eq!(r, 14);
+        }
+    }
+
+    #[test]
+    fn smusd() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let r = dsp::smusd(::mem::transmute(a), ::mem::transmute(b));
+            assert_eq!(r, -3);
+        }
+    }
+
+    #[test]
+    fn smusdx() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let r = dsp::smusdx(::mem::transmute(a), ::mem::transmute(b));
+            assert_eq!(r, -6);
+        }
+    }
 }
diff --git a/coresimd/arm/mod.rs b/coresimd/arm/mod.rs
@@ -20,9 +20,9 @@ mod v7;
 #[cfg(any(target_arch = "aarch64", target_feature = "v7"))]
 pub use self::v7::*;
 
-#[cfg(all(target_arch = "arm", target_feature = "v7"))]
+#[cfg(all(target_feature = "v7", not(target_feature = "mclass")))]
 mod dsp;
-#[cfg(all(target_arch = "arm", target_feature = "v7"))]
+#[cfg(all(target_feature = "v7", not(target_feature = "mclass")))]
 pub use self::dsp::*;
 
 // NEON is supported on AArch64, and on ARM when built with the v7 and neon