Skip to content

Commit 2760409

Browse files
paoloteti authored and alexcrichton committed
ARM DSP: Halving parallel add/sub and multiply add/sub (rust-lang#535)
* ARM DSP: Add signed halving parallel sub. Add: - `shsub8`: Signed halving parallel byte-wise subtraction. - `shsub16`: Signed halving parallel halfword-wise subtraction. * ARM DSP: Signed halving parallel additions. - `shadd8`: Signed halving parallel byte-wise add. - `shadd16`: Signed halving parallel halfword-wise add. * ARM DSP: Signed Dual Multiply Add and Signed Dual Multiply Sub. - `SMUAD`: Signed Dual Multiply Add. - `SMUADX`: Signed Dual Multiply Add Reversed. - `SMUSD`: Signed Dual Multiply Subtract. - `SMUSDX`: Signed Dual Multiply Subtract Reversed. * ARM DSP: Restrict to Cortex-A and Cortex-R. Restrict everything to Cortex-A/R until we find a better way to manage thumb* targets. Add 'dox' to generate docs. * ARM DSP: fix Markdown documentation. Quote '[' and ']' where they are not part of the Markdown syntax.
1 parent bd6254f commit 2760409

File tree

2 files changed

+214
-3
lines changed

2 files changed

+214
-3
lines changed

coresimd/arm/dsp.rs

+212-1
Original file line number | Diff line number | Diff line change
@@ -54,8 +54,32 @@ extern "C" {
5454
#[link_name = "llvm.arm.sasx"]
5555
fn arm_sasx(a: i32, b: i32) -> i32;
5656

57-
#[cfg_attr(not(target_feature = "mclass"), link_name = "llvm.arm.sel")]
57+
#[link_name = "llvm.arm.sel"]
5858
fn arm_sel(a: i32, b: i32) -> i32;
59+
60+
#[link_name = "llvm.arm.shadd8"]
61+
fn arm_shadd8(a: i32, b: i32) -> i32;
62+
63+
#[link_name = "llvm.arm.shadd16"]
64+
fn arm_shadd16(a: i32, b: i32) -> i32;
65+
66+
#[link_name = "llvm.arm.shsub8"]
67+
fn arm_shsub8(a: i32, b: i32) -> i32;
68+
69+
#[link_name = "llvm.arm.shsub16"]
70+
fn arm_shsub16(a: i32, b: i32) -> i32;
71+
72+
#[link_name = "llvm.arm.smuad"]
73+
fn arm_smuad(a: i32, b: i32) -> i32;
74+
75+
#[link_name = "llvm.arm.smuadx"]
76+
fn arm_smuadx(a: i32, b: i32) -> i32;
77+
78+
#[link_name = "llvm.arm.smusd"]
79+
fn arm_smusd(a: i32, b: i32) -> i32;
80+
81+
#[link_name = "llvm.arm.smusdx"]
82+
fn arm_smusdx(a: i32, b: i32) -> i32;
5983
}
6084

6185
/// Signed saturating addition
@@ -201,6 +225,109 @@ pub unsafe fn sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
201225
dsp_call!(arm_sel, a, b)
202226
}
203227

228+
/// Signed halving parallel byte-wise addition.
229+
///
230+
/// Returns the 8-bit signed equivalent of
231+
///
232+
/// res\[0\] = (a\[0\] + b\[0\]) / 2
233+
/// res\[1\] = (a\[1\] + b\[1\]) / 2
234+
/// res\[2\] = (a\[2\] + b\[2\]) / 2
235+
/// res\[3\] = (a\[3\] + b\[3\]) / 2
236+
#[inline]
237+
#[cfg_attr(test, assert_instr(shadd8))]
238+
pub unsafe fn shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
239+
dsp_call!(arm_shadd8, a, b)
240+
}
241+
242+
/// Signed halving parallel halfword-wise addition.
243+
///
244+
/// Returns the 16-bit signed equivalent of
245+
///
246+
/// res\[0\] = (a\[0\] + b\[0\]) / 2
247+
/// res\[1\] = (a\[1\] + b\[1\]) / 2
248+
#[inline]
249+
#[cfg_attr(test, assert_instr(shadd16))]
250+
pub unsafe fn shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
251+
dsp_call!(arm_shadd16, a, b)
252+
}
253+
254+
/// Signed halving parallel byte-wise subtraction.
255+
///
256+
/// Returns the 8-bit signed equivalent of
257+
///
258+
/// res\[0\] = (a\[0\] - b\[0\]) / 2
259+
/// res\[1\] = (a\[1\] - b\[1\]) / 2
260+
/// res\[2\] = (a\[2\] - b\[2\]) / 2
261+
/// res\[3\] = (a\[3\] - b\[3\]) / 2
262+
#[inline]
263+
#[cfg_attr(test, assert_instr(shsub8))]
264+
pub unsafe fn shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
265+
dsp_call!(arm_shsub8, a, b)
266+
}
267+
268+
/// Signed halving parallel halfword-wise subtraction.
269+
///
270+
/// Returns the 16-bit signed equivalent of
271+
///
272+
/// res\[0\] = (a\[0\] - b\[0\]) / 2
273+
/// res\[1\] = (a\[1\] - b\[1\]) / 2
274+
#[inline]
275+
#[cfg_attr(test, assert_instr(shsub16))]
276+
pub unsafe fn shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
277+
dsp_call!(arm_shsub16, a, b)
278+
}
279+
280+
/// Signed Dual Multiply Add.
281+
///
282+
/// Returns the equivalent of
283+
///
284+
/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\]
285+
///
286+
/// and sets the Q flag if overflow occurs on the addition.
287+
#[cfg_attr(test, assert_instr(smuad))]
288+
pub unsafe fn smuad(a: int16x2_t, b: int16x2_t) -> i32 {
289+
arm_smuad(::mem::transmute(a), ::mem::transmute(b))
290+
}
291+
292+
/// Signed Dual Multiply Add Reversed.
293+
///
294+
/// Returns the equivalent of
295+
///
296+
/// res = a\[0\] * b\[1\] + a\[1\] * b\[0\]
297+
///
298+
/// and sets the Q flag if overflow occurs on the addition.
299+
#[inline]
300+
#[cfg_attr(test, assert_instr(smuadx))]
301+
pub unsafe fn smuadx(a: int16x2_t, b: int16x2_t) -> i32 {
302+
arm_smuadx(::mem::transmute(a), ::mem::transmute(b))
303+
}
304+
305+
/// Signed Dual Multiply Subtract.
306+
///
307+
/// Returns the equivalent of
308+
///
309+
/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\]
310+
///
311+
/// and sets the Q flag if overflow occurs on the addition.
312+
#[inline]
313+
#[cfg_attr(test, assert_instr(smusd))]
314+
pub unsafe fn smusd(a: int16x2_t, b: int16x2_t) -> i32 {
315+
arm_smusd(::mem::transmute(a), ::mem::transmute(b))
316+
}
317+
318+
/// Signed Dual Multiply Subtract Reversed.
319+
///
320+
/// Returns the equivalent of
321+
///
322+
/// res = a\[0\] * b\[1\] - a\[1\] * b\[0\]
323+
///
324+
/// and sets the Q flag if overflow occurs on the addition.
325+
#[inline]
326+
#[cfg_attr(test, assert_instr(smusdx))]
327+
pub unsafe fn smusdx(a: int16x2_t, b: int16x2_t) -> i32 {
328+
arm_smusdx(::mem::transmute(a), ::mem::transmute(b))
329+
}
330+
204331
#[cfg(test)]
205332
mod tests {
206333
use coresimd::arm::*;
@@ -337,4 +464,88 @@ mod tests {
337464
assert_eq!(r, c);
338465
}
339466
}
467+
468+
#[test]
469+
fn shadd8() {
470+
unsafe {
471+
let a = i8x4::new(1, 2, 3, 4);
472+
let b = i8x4::new(5, 4, 3, 2);
473+
let c = i8x4::new(3, 3, 3, 3);
474+
let r: i8x4 = dsp_call!(dsp::shadd8, a, b);
475+
assert_eq!(r, c);
476+
}
477+
}
478+
479+
#[test]
480+
fn shadd16() {
481+
unsafe {
482+
let a = i16x2::new(1, 2);
483+
let b = i16x2::new(5, 4);
484+
let c = i16x2::new(3, 3);
485+
let r: i16x2 = dsp_call!(dsp::shadd16, a, b);
486+
assert_eq!(r, c);
487+
}
488+
}
489+
490+
#[test]
491+
fn shsub8() {
492+
unsafe {
493+
let a = i8x4::new(1, 2, 3, 4);
494+
let b = i8x4::new(5, 4, 3, 2);
495+
let c = i8x4::new(-2, -1, 0, 1);
496+
let r: i8x4 = dsp_call!(dsp::shsub8, a, b);
497+
assert_eq!(r, c);
498+
}
499+
}
500+
501+
#[test]
502+
fn shsub16() {
503+
unsafe {
504+
let a = i16x2::new(1, 2);
505+
let b = i16x2::new(5, 4);
506+
let c = i16x2::new(-2, -1);
507+
let r: i16x2 = dsp_call!(dsp::shsub16, a, b);
508+
assert_eq!(r, c);
509+
}
510+
}
511+
512+
#[test]
513+
fn smuad() {
514+
unsafe {
515+
let a = i16x2::new(1, 2);
516+
let b = i16x2::new(5, 4);
517+
let r = dsp::smuad(::mem::transmute(a), ::mem::transmute(b));
518+
assert_eq!(r, 13);
519+
}
520+
}
521+
522+
#[test]
523+
fn smuadx() {
524+
unsafe {
525+
let a = i16x2::new(1, 2);
526+
let b = i16x2::new(5, 4);
527+
let r = dsp::smuadx(::mem::transmute(a), ::mem::transmute(b));
528+
assert_eq!(r, 14);
529+
}
530+
}
531+
532+
#[test]
533+
fn smusd() {
534+
unsafe {
535+
let a = i16x2::new(1, 2);
536+
let b = i16x2::new(5, 4);
537+
let r = dsp::smusd(::mem::transmute(a), ::mem::transmute(b));
538+
assert_eq!(r, -3);
539+
}
540+
}
541+
542+
#[test]
543+
fn smusdx() {
544+
unsafe {
545+
let a = i16x2::new(1, 2);
546+
let b = i16x2::new(5, 4);
547+
let r = dsp::smusdx(::mem::transmute(a), ::mem::transmute(b));
548+
assert_eq!(r, -6);
549+
}
550+
}
340551
}

coresimd/arm/mod.rs

+2-2
Original file line number | Diff line number | Diff line change
@@ -20,9 +20,9 @@ mod v7;
2020
#[cfg(any(target_arch = "aarch64", target_feature = "v7"))]
2121
pub use self::v7::*;
2222

23-
#[cfg(all(target_arch = "arm", target_feature = "v7"))]
23+
#[cfg(all(target_feature = "v7", not(target_feature = "mclass")))]
2424
mod dsp;
25-
#[cfg(all(target_arch = "arm", target_feature = "v7"))]
25+
#[cfg(all(target_feature = "v7", not(target_feature = "mclass")))]
2626
pub use self::dsp::*;
2727

2828
// NEON is supported on AArch64, and on ARM when built with the v7 and neon

0 commit comments

Comments
 (0)