
Commit 61a9083

gwenn authored and alexcrichton committed
sse: __m64 related intrinsics (rust-lang#230)
* sse: add missing aliases _m_pextrw, _m_pinsrw, _m_pmovmskb, _m_pshufw
* sse: _mm_maskmove_si64, _m_maskmovq
* sse: _mm_mulhi_pu16, _m_pmulhuw
* sse: _mm_avg_pu8, _m_pavgb
* sse: _mm_avg_pu16, _m_pavgw
* sse: _mm_sad_pu8, _m_psadbw
* sse: _mm_cvtpi32_ps
* sse: _mm_cvtpi32x2_ps
1 parent 22210c2 · commit 61a9083

2 files changed, 260 insertions(+), 10 deletions(-)

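As background for the `_mm_mulhi_pu16`/`_m_pmulhuw` additions below: each pair of 16-bit lanes is multiplied at 32-bit width and only the high 16 bits of each product are kept. A scalar sketch of that per-lane arithmetic (illustrative only; `mulhi_scalar` is not part of the patch):

fn mulhi_scalar(a: u16, b: u16) -> u16 {
    // Widen to 32 bits, multiply, and keep the high half of the product.
    ((a as u32 * b as u32) >> 16) as u16
}

fn main() {
    // 1000 * 1001 = 1_001_000 = 0x000F_4628; its high 16 bits are 15,
    // the per-lane value the _mm_mulhi_pu16 test below asserts.
    assert_eq!(mulhi_scalar(1000, 1001), 15);
}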

coresimd/src/x86/i586/sse.rs (-6)
@@ -682,12 +682,6 @@ pub unsafe fn _mm_cvt_si2ss(a: f32x4, b: i32) -> f32x4 {
     _mm_cvtsi32_ss(a, b)
 }
 
-// Blocked by https://github.com/rust-lang-nursery/stdsimd/issues/74
-// pub unsafe fn _mm_cvtpi32_ps(a: f32x4, b: i32x2) -> f32x4
-// pub unsafe fn _mm_cvt_pi2ps(a: f32x4, b: i32x2) -> f32x4 {
-//     _mm_cvtpi32_ps(a, b)
-// }
-
 /// Construct a `f32x4` with the lowest element set to `a` and the rest set to
 /// zero.
 #[inline(always)]
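The commented-out stub deleted above is superseded by a real `_mm_cvtpi32_ps` in the i686 module below. For reference, its lane behavior can be modeled with plain arrays (a scalar sketch under that reading of the docs, not the actual implementation, which lowers to `cvtpi2ps`; `cvtpi32_ps_model` is a hypothetical name):

/// Scalar model: the two i32 lanes of `b` replace the low two f32 lanes,
/// while the high two lanes of `a` pass through unchanged.
fn cvtpi32_ps_model(a: [f32; 4], b: [i32; 2]) -> [f32; 4] {
    [b[0] as f32, b[1] as f32, a[2], a[3]]
}

fn main() {
    // Mirrors the _mm_cvtpi32_ps test in the diff below.
    assert_eq!(cvtpi32_ps_model([0., 0., 3., 4.], [1, 2]), [1., 2., 3., 4.]);
}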

coresimd/src/x86/i686/sse.rs (+260 -4)
@@ -1,7 +1,7 @@
 //! `i686` Streaming SIMD Extensions (SSE)
 
 use v128::f32x4;
-use v64::{i16x4, i32x2, i8x8, u8x8};
+use v64::{i16x4, i32x2, i8x8, u16x4, u8x8};
 use x86::__m64;
 use core::mem;
 use x86::i586;
@@ -14,6 +14,8 @@ use stdsimd_test::assert_instr;
 extern "C" {
     #[link_name = "llvm.x86.sse.cvtpi2ps"]
     fn cvtpi2ps(a: f32x4, b: __m64) -> f32x4;
+    #[link_name = "llvm.x86.mmx.maskmovq"]
+    fn maskmovq(a: __m64, mask: __m64, mem_addr: *mut i8);
     #[link_name = "llvm.x86.mmx.pextr.w"]
     fn pextrw(a: __m64, imm8: i32) -> i32;
     #[link_name = "llvm.x86.mmx.pinsr.w"]
@@ -30,6 +32,14 @@ extern "C" {
     fn pminsw(a: __m64, b: __m64) -> __m64;
     #[link_name = "llvm.x86.mmx.pminu.b"]
     fn pminub(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pmulhu.w"]
+    fn pmulhuw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pavg.b"]
+    fn pavgb(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pavg.w"]
+    fn pavgw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.psad.bw"]
+    fn psadbw(a: __m64, b: __m64) -> __m64;
     #[link_name = "llvm.x86.sse.cvtps2pi"]
     fn cvtps2pi(a: f32x4) -> __m64;
     #[link_name = "llvm.x86.sse.cvttps2pi"]
@@ -108,17 +118,150 @@ pub unsafe fn _m_pminub(a: u8x8, b: u8x8) -> u8x8 {
     _mm_min_pu8(a, b)
 }
 
+/// Multiplies packed 16-bit unsigned integer values and writes the
+/// high-order 16 bits of each 32-bit product to the corresponding bits in
+/// the destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pmulhuw))]
+pub unsafe fn _mm_mulhi_pu16(a: u16x4, b: u16x4) -> u16x4 {
+    mem::transmute(pmulhuw(mem::transmute(a), mem::transmute(b)))
+}
+
+/// Multiplies packed 16-bit unsigned integer values and writes the
+/// high-order 16 bits of each 32-bit product to the corresponding bits in
+/// the destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pmulhuw))]
+pub unsafe fn _m_pmulhuw(a: u16x4, b: u16x4) -> u16x4 {
+    _mm_mulhi_pu16(a, b)
+}
+
+/// Computes the rounded averages of the packed unsigned 8-bit integer
+/// values and writes the averages to the corresponding bits in the
+/// destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pavgb))]
+pub unsafe fn _mm_avg_pu8(a: u8x8, b: u8x8) -> u8x8 {
+    mem::transmute(pavgb(mem::transmute(a), mem::transmute(b)))
+}
+
+/// Computes the rounded averages of the packed unsigned 8-bit integer
+/// values and writes the averages to the corresponding bits in the
+/// destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pavgb))]
+pub unsafe fn _m_pavgb(a: u8x8, b: u8x8) -> u8x8 {
+    _mm_avg_pu8(a, b)
+}
+
+/// Computes the rounded averages of the packed unsigned 16-bit integer
+/// values and writes the averages to the corresponding bits in the
+/// destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pavgw))]
+pub unsafe fn _mm_avg_pu16(a: u16x4, b: u16x4) -> u16x4 {
+    mem::transmute(pavgw(mem::transmute(a), mem::transmute(b)))
+}
+
+/// Computes the rounded averages of the packed unsigned 16-bit integer
+/// values and writes the averages to the corresponding bits in the
+/// destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pavgw))]
+pub unsafe fn _m_pavgw(a: u16x4, b: u16x4) -> u16x4 {
+    _mm_avg_pu16(a, b)
+}
+
+/// Subtracts the corresponding 8-bit unsigned integer values of the two
+/// 64-bit vector operands and computes the absolute value for each of the
+/// differences. The sum of the 8 absolute differences is written to the
+/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(psadbw))]
+pub unsafe fn _mm_sad_pu8(a: u8x8, b: u8x8) -> u64 {
+    mem::transmute(psadbw(mem::transmute(a), mem::transmute(b)))
+}
+
+/// Subtracts the corresponding 8-bit unsigned integer values of the two
+/// 64-bit vector operands and computes the absolute value for each of the
+/// differences. The sum of the 8 absolute differences is written to the
+/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(psadbw))]
+pub unsafe fn _m_psadbw(a: u8x8, b: u8x8) -> u64 {
+    _mm_sad_pu8(a, b)
+}
+
 /// Converts two elements of a 64-bit vector of [2 x i32] into two
 /// floating point values and writes them to the lower 64-bits of the
 /// destination. The remaining higher order elements of the destination are
 /// copied from the corresponding elements in the first operand.
 #[inline(always)]
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(cvtpi2ps))]
-pub unsafe fn _mm_cvt_pi2ps(a: f32x4, b: i32x2) -> f32x4 {
+pub unsafe fn _mm_cvtpi32_ps(a: f32x4, b: i32x2) -> f32x4 {
     cvtpi2ps(a, mem::transmute(b))
 }
 
+/// Converts two elements of a 64-bit vector of [2 x i32] into two
+/// floating point values and writes them to the lower 64-bits of the
+/// destination. The remaining higher order elements of the destination are
+/// copied from the corresponding elements in the first operand.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtpi2ps))]
+pub unsafe fn _mm_cvt_pi2ps(a: f32x4, b: i32x2) -> f32x4 {
+    _mm_cvtpi32_ps(a, b)
+}
+
+/// Converts the two 32-bit signed integer values from each 64-bit vector
+/// operand of [2 x i32] into a 128-bit vector of [4 x float].
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtpi2ps))]
+pub unsafe fn _mm_cvtpi32x2_ps(a: i32x2, b: i32x2) -> f32x4 {
+    let c = i586::_mm_setzero_ps();
+    let c = _mm_cvtpi32_ps(c, b);
+    let c = i586::_mm_movelh_ps(c, c);
+    _mm_cvtpi32_ps(c, a)
+}
+
+/// Conditionally copies the values from each 8-bit element in the first
+/// 64-bit integer vector operand to the specified memory location, as
+/// specified by the most significant bit in the corresponding element in the
+/// second 64-bit integer vector operand.
+///
+/// To minimize caching, the data is flagged as non-temporal
+/// (unlikely to be used again soon).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(maskmovq))]
+pub unsafe fn _mm_maskmove_si64(a: i8x8, mask: i8x8, mem_addr: *mut i8) {
+    maskmovq(mem::transmute(a), mem::transmute(mask), mem_addr)
+}
+
+/// Conditionally copies the values from each 8-bit element in the first
+/// 64-bit integer vector operand to the specified memory location, as
+/// specified by the most significant bit in the corresponding element in the
+/// second 64-bit integer vector operand.
+///
+/// To minimize caching, the data is flagged as non-temporal
+/// (unlikely to be used again soon).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(maskmovq))]
+pub unsafe fn _m_maskmovq(a: i8x8, mask: i8x8, mem_addr: *mut i8) {
+    _mm_maskmove_si64(a, mask, mem_addr)
+}
+
 /// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
 /// returns it, as specified by the immediate integer operand.
 #[inline(always)]
@@ -131,6 +274,15 @@ pub unsafe fn _mm_extract_pi16(a: i16x4, imm2: i32) -> i16 {
     constify_imm2!(imm2, call)
 }
 
+/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
+/// returns it, as specified by the immediate integer operand.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pextrw, imm2 = 0))]
+pub unsafe fn _m_pextrw(a: i16x4, imm2: i32) -> i16 {
+    _mm_extract_pi16(a, imm2)
+}
+
 /// Copies data from the 64-bit vector of [4 x i16] to the destination,
 /// and inserts the lower 16-bits of an integer operand at the 16-bit offset
 /// specified by the immediate operand `n`.
@@ -144,6 +296,16 @@ pub unsafe fn _mm_insert_pi16(a: i16x4, d: i32, imm2: i32) -> i16x4 {
     constify_imm2!(imm2, call)
 }
 
+/// Copies data from the 64-bit vector of [4 x i16] to the destination,
+/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
+/// specified by the immediate operand `n`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pinsrw, imm2 = 0))]
+pub unsafe fn _m_pinsrw(a: i16x4, d: i32, imm2: i32) -> i16x4 {
+    _mm_insert_pi16(a, d, imm2)
+}
+
 /// Takes the most significant bit from each 8-bit element in a 64-bit
 /// integer vector to create a 16-bit mask value. Zero-extends the value to
 /// 32-bit integer and writes it to the destination.
@@ -154,6 +316,16 @@ pub unsafe fn _mm_movemask_pi8(a: i16x4) -> i32 {
     pmovmskb(mem::transmute(a))
 }
 
+/// Takes the most significant bit from each 8-bit element in a 64-bit
+/// integer vector to create a 16-bit mask value. Zero-extends the value to
+/// 32-bit integer and writes it to the destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pmovmskb))]
+pub unsafe fn _m_pmovmskb(a: i16x4) -> i32 {
+    _mm_movemask_pi8(a)
+}
+
 /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
 /// destination, as specified by the immediate value operand.
 #[inline(always)]
@@ -166,6 +338,15 @@ pub unsafe fn _mm_shuffle_pi16(a: i16x4, imm8: i8) -> i16x4 {
     constify_imm8!(imm8, call)
 }
 
+/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
+/// destination, as specified by the immediate value operand.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pshufw, imm8 = 0))]
+pub unsafe fn _m_pshufw(a: i16x4, imm8: i8) -> i16x4 {
+    _mm_shuffle_pi16(a, imm8)
+}
+
 /// Convert the two lower packed single-precision (32-bit) floating-point
 /// elements in `a` to packed 32-bit integers with truncation.
 #[inline(always)]
@@ -229,7 +410,7 @@ pub unsafe fn _mm_cvtps_pi8(a: f32x4) -> i8x8 {
 #[cfg(test)]
 mod tests {
     use v128::f32x4;
-    use v64::{i16x4, i32x2, i8x8, u8x8};
+    use v64::{i16x4, i32x2, i8x8, u16x4, u8x8};
     use x86::i686::sse;
     use stdsimd_test::simd_test;
 
@@ -274,21 +455,87 @@ mod tests {
     }
 
     #[simd_test = "sse"]
-    unsafe fn _mm_cvt_pi2ps() {
+    unsafe fn _mm_mulhi_pu16() {
+        let (a, b) = (u16x4::splat(1000), u16x4::splat(1001));
+        let r = sse::_mm_mulhi_pu16(a, b);
+        assert_eq!(r, u16x4::splat(15));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_avg_pu8() {
+        let (a, b) = (u8x8::splat(3), u8x8::splat(9));
+        let r = sse::_mm_avg_pu8(a, b);
+        assert_eq!(r, u8x8::splat(6));
+
+        let r = sse::_m_pavgb(a, b);
+        assert_eq!(r, u8x8::splat(6));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_avg_pu16() {
+        let (a, b) = (u16x4::splat(3), u16x4::splat(9));
+        let r = sse::_mm_avg_pu16(a, b);
+        assert_eq!(r, u16x4::splat(6));
+
+        let r = sse::_m_pavgw(a, b);
+        assert_eq!(r, u16x4::splat(6));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_sad_pu8() {
+        let a = u8x8::new(255, 254, 253, 252, 1, 2, 3, 4);
+        let b = u8x8::new(0, 0, 0, 0, 2, 1, 2, 1);
+        let r = sse::_mm_sad_pu8(a, b);
+        assert_eq!(r, 1020);
+
+        let r = sse::_m_psadbw(a, b);
+        assert_eq!(r, 1020);
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtpi32_ps() {
         let a = f32x4::new(0., 0., 3., 4.);
         let b = i32x2::new(1, 2);
         let expected = f32x4::new(1., 2., 3., 4.);
+        let r = sse::_mm_cvtpi32_ps(a, b);
+        assert_eq!(r, expected);
+
         let r = sse::_mm_cvt_pi2ps(a, b);
         assert_eq!(r, expected);
     }
 
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtpi32x2_ps() {
+        let a = i32x2::new(1, 2);
+        let b = i32x2::new(3, 4);
+        let expected = f32x4::new(1., 2., 3., 4.);
+        let r = sse::_mm_cvtpi32x2_ps(a, b);
+        assert_eq!(r, expected);
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_maskmove_si64() {
+        let a = i8x8::splat(9);
+        let mask = i8x8::splat(0).replace(2, 0x80u8 as i8);
+        let mut r = i8x8::splat(0);
+        sse::_mm_maskmove_si64(a, mask, &mut r as *mut _ as *mut i8);
+        assert_eq!(r, i8x8::splat(0).replace(2, 9));
+
+        let mut r = i8x8::splat(0);
+        sse::_m_maskmovq(a, mask, &mut r as *mut _ as *mut i8);
+        assert_eq!(r, i8x8::splat(0).replace(2, 9));
+    }
+
     #[simd_test = "sse"]
     unsafe fn _mm_extract_pi16() {
         let a = i16x4::new(1, 2, 3, 4);
         let r = sse::_mm_extract_pi16(a, 0);
         assert_eq!(r, 1);
         let r = sse::_mm_extract_pi16(a, 1);
         assert_eq!(r, 2);
+
+        let r = sse::_m_pextrw(a, 1);
+        assert_eq!(r, 2);
     }
 
     #[simd_test = "sse"]
@@ -300,13 +547,19 @@ mod tests {
         let r = sse::_mm_insert_pi16(a, 0, 0b10);
         let expected = i16x4::new(1, 2, 0, 4);
         assert_eq!(r, expected);
+
+        let r = sse::_m_pinsrw(a, 0, 0b10);
+        assert_eq!(r, expected);
     }
 
     #[simd_test = "sse"]
     unsafe fn _mm_movemask_pi8() {
        let a = i16x4::new(0b1000_0000, 0b0100_0000, 0b1000_0000, 0b0100_0000);
         let r = sse::_mm_movemask_pi8(a);
         assert_eq!(r, 0b10001);
+
+        let r = sse::_m_pmovmskb(a);
+        assert_eq!(r, 0b10001);
     }
 
     #[simd_test = "sse"]
@@ -315,6 +568,9 @@ mod tests {
         let r = sse::_mm_shuffle_pi16(a, 0b00_01_01_11);
         let expected = i16x4::new(4, 2, 2, 1);
         assert_eq!(r, expected);
+
+        let r = sse::_m_pshufw(a, 0b00_01_01_11);
+        assert_eq!(r, expected);
     }
 
     #[simd_test = "sse"]
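One non-obvious piece of the patch is how `_mm_cvtpi32x2_ps` builds a four-lane result from two `i32x2` operands using only `_mm_cvtpi32_ps` and `_mm_movelh_ps`. A lane-by-lane scalar trace of its three steps (a sketch of the data movement only, not the intrinsic itself):

fn main() {
    let (a, b) = ([1i32, 2], [3i32, 4]);
    // Step 1: _mm_cvtpi32_ps(zero, b) puts b in the low lanes: (3., 4., 0., 0.)
    let c = [b[0] as f32, b[1] as f32, 0.0, 0.0];
    // Step 2: _mm_movelh_ps(c, c) copies the low half into the high half:
    // (3., 4., 3., 4.)
    let c = [c[0], c[1], c[0], c[1]];
    // Step 3: _mm_cvtpi32_ps(c, a) overwrites the low lanes with a, leaving
    // b's converted values on top: (1., 2., 3., 4.)
    let r = [a[0] as f32, a[1] as f32, c[2], c[3]];
    assert_eq!(r, [1.0, 2.0, 3.0, 4.0]); // matches the test's `expected`
}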