//! `i686` Streaming SIMD Extensions (SSE)

use v128::f32x4;
-use v64::{i16x4, i32x2, i8x8, u8x8};
+use v64::{i16x4, i32x2, i8x8, u16x4, u8x8};
use x86::__m64;
use core::mem;
use x86::i586;
@@ -14,6 +14,8 @@ use stdsimd_test::assert_instr;
extern "C" {
    #[link_name = "llvm.x86.sse.cvtpi2ps"]
    fn cvtpi2ps(a: f32x4, b: __m64) -> f32x4;
+    #[link_name = "llvm.x86.mmx.maskmovq"]
+    fn maskmovq(a: __m64, mask: __m64, mem_addr: *mut i8);
    #[link_name = "llvm.x86.mmx.pextr.w"]
    fn pextrw(a: __m64, imm8: i32) -> i32;
    #[link_name = "llvm.x86.mmx.pinsr.w"]
@@ -30,6 +32,14 @@ extern "C" {
    fn pminsw(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.mmx.pminu.b"]
    fn pminub(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pmulhu.w"]
+    fn pmulhuw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pavg.b"]
+    fn pavgb(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.pavg.w"]
+    fn pavgw(a: __m64, b: __m64) -> __m64;
+    #[link_name = "llvm.x86.mmx.psad.bw"]
+    fn psadbw(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.sse.cvtps2pi"]
    fn cvtps2pi(a: f32x4) -> __m64;
    #[link_name = "llvm.x86.sse.cvttps2pi"]
@@ -108,17 +118,150 @@ pub unsafe fn _m_pminub(a: u8x8, b: u8x8) -> u8x8 {
    _mm_min_pu8(a, b)
}

+/// Multiplies packed 16-bit unsigned integer values and writes the
+/// high-order 16 bits of each 32-bit product to the corresponding bits in
+/// the destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pmulhuw))]
+pub unsafe fn _mm_mulhi_pu16(a: u16x4, b: u16x4) -> u16x4 {
+    mem::transmute(pmulhuw(mem::transmute(a), mem::transmute(b)))
+}
+
+/// Multiplies packed 16-bit unsigned integer values and writes the
+/// high-order 16 bits of each 32-bit product to the corresponding bits in
+/// the destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pmulhuw))]
+pub unsafe fn _m_pmulhuw(a: u16x4, b: u16x4) -> u16x4 {
+    _mm_mulhi_pu16(a, b)
+}
+
+/// Computes the rounded averages of the packed unsigned 8-bit integer
+/// values and writes the averages to the corresponding bits in the
+/// destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pavgb))]
+pub unsafe fn _mm_avg_pu8(a: u8x8, b: u8x8) -> u8x8 {
+    mem::transmute(pavgb(mem::transmute(a), mem::transmute(b)))
+}
+
+/// Computes the rounded averages of the packed unsigned 8-bit integer
+/// values and writes the averages to the corresponding bits in the
+/// destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pavgb))]
+pub unsafe fn _m_pavgb(a: u8x8, b: u8x8) -> u8x8 {
+    _mm_avg_pu8(a, b)
+}
+
+/// Computes the rounded averages of the packed unsigned 16-bit integer
+/// values and writes the averages to the corresponding bits in the
+/// destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pavgw))]
+pub unsafe fn _mm_avg_pu16(a: u16x4, b: u16x4) -> u16x4 {
+    mem::transmute(pavgw(mem::transmute(a), mem::transmute(b)))
+}
+
+/// Computes the rounded averages of the packed unsigned 16-bit integer
+/// values and writes the averages to the corresponding bits in the
+/// destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pavgw))]
+pub unsafe fn _m_pavgw(a: u16x4, b: u16x4) -> u16x4 {
+    _mm_avg_pu16(a, b)
+}
+
+/// Subtracts the corresponding 8-bit unsigned integer values of the two
+/// 64-bit vector operands and computes the absolute value of each
+/// difference. The sum of the 8 absolute differences is then written to
+/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(psadbw))]
+pub unsafe fn _mm_sad_pu8(a: u8x8, b: u8x8) -> u64 {
+    mem::transmute(psadbw(mem::transmute(a), mem::transmute(b)))
+}
+
+/// Subtracts the corresponding 8-bit unsigned integer values of the two
+/// 64-bit vector operands and computes the absolute value of each
+/// difference. The sum of the 8 absolute differences is then written to
+/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(psadbw))]
+pub unsafe fn _m_psadbw(a: u8x8, b: u8x8) -> u64 {
+    _mm_sad_pu8(a, b)
+}
+
/// Converts two elements of a 64-bit vector of [2 x i32] into two
/// floating point values and writes them to the lower 64 bits of the
/// destination. The remaining higher order elements of the destination are
/// copied from the corresponding elements in the first operand.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(cvtpi2ps))]
-pub unsafe fn _mm_cvt_pi2ps(a: f32x4, b: i32x2) -> f32x4 {
+pub unsafe fn _mm_cvtpi32_ps(a: f32x4, b: i32x2) -> f32x4 {
    cvtpi2ps(a, mem::transmute(b))
}

+/// Converts two elements of a 64-bit vector of [2 x i32] into two
+/// floating point values and writes them to the lower 64 bits of the
+/// destination. The remaining higher order elements of the destination are
+/// copied from the corresponding elements in the first operand.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtpi2ps))]
+pub unsafe fn _mm_cvt_pi2ps(a: f32x4, b: i32x2) -> f32x4 {
+    _mm_cvtpi32_ps(a, b)
+}
+
+/// Converts the two 32-bit signed integer values from each 64-bit vector
+/// operand of [2 x i32] into a 128-bit vector of [4 x float].
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtpi2ps))]
+pub unsafe fn _mm_cvtpi32x2_ps(a: i32x2, b: i32x2) -> f32x4 {
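+    // Convert `b` into the low two lanes of a zeroed vector, copy those
+    // lanes into the high half with `movelh`, then convert `a` into the
+    // low lanes, yielding [a.0, a.1, b.0, b.1].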
+    let c = i586::_mm_setzero_ps();
+    let c = _mm_cvtpi32_ps(c, b);
+    let c = i586::_mm_movelh_ps(c, c);
+    _mm_cvtpi32_ps(c, a)
+}
+
+/// Conditionally copies the values from each 8-bit element in the first
+/// 64-bit integer vector operand to the specified memory location, as
+/// specified by the most significant bit in the corresponding element in
+/// the second 64-bit integer vector operand.
+///
+/// To minimize caching, the data is flagged as non-temporal
+/// (unlikely to be used again soon).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(maskmovq))]
+pub unsafe fn _mm_maskmove_si64(a: i8x8, mask: i8x8, mem_addr: *mut i8) {
+    maskmovq(mem::transmute(a), mem::transmute(mask), mem_addr)
+}
+
+/// Conditionally copies the values from each 8-bit element in the first
+/// 64-bit integer vector operand to the specified memory location, as
+/// specified by the most significant bit in the corresponding element in
+/// the second 64-bit integer vector operand.
+///
+/// To minimize caching, the data is flagged as non-temporal
+/// (unlikely to be used again soon).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(maskmovq))]
+pub unsafe fn _m_maskmovq(a: i8x8, mask: i8x8, mem_addr: *mut i8) {
+    _mm_maskmove_si64(a, mask, mem_addr)
+}
+
/// Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
/// returns it, as specified by the immediate integer operand.
#[inline(always)]
@@ -131,6 +274,15 @@ pub unsafe fn _mm_extract_pi16(a: i16x4, imm2: i32) -> i16 {
    constify_imm2!(imm2, call)
}

+/// Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
+/// returns it, as specified by the immediate integer operand.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pextrw, imm2 = 0))]
+pub unsafe fn _m_pextrw(a: i16x4, imm2: i32) -> i16 {
+    _mm_extract_pi16(a, imm2)
+}
+
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16 bits of an integer operand at the 16-bit offset
/// specified by the immediate operand `n`.
@@ -144,6 +296,16 @@ pub unsafe fn _mm_insert_pi16(a: i16x4, d: i32, imm2: i32) -> i16x4 {
    constify_imm2!(imm2, call)
}

+/// Copies data from the 64-bit vector of [4 x i16] to the destination,
+/// and inserts the lower 16 bits of an integer operand at the 16-bit offset
+/// specified by the immediate operand `n`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pinsrw, imm2 = 0))]
+pub unsafe fn _m_pinsrw(a: i16x4, d: i32, imm2: i32) -> i16x4 {
+    _mm_insert_pi16(a, d, imm2)
+}
+
/// Takes the most significant bit from each 8-bit element in a 64-bit
/// integer vector to create an 8-bit mask value. Zero-extends the value to
/// a 32-bit integer and writes it to the destination.
@@ -154,6 +316,16 @@ pub unsafe fn _mm_movemask_pi8(a: i16x4) -> i32 {
    pmovmskb(mem::transmute(a))
}

+/// Takes the most significant bit from each 8-bit element in a 64-bit
+/// integer vector to create an 8-bit mask value. Zero-extends the value to
+/// a 32-bit integer and writes it to the destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pmovmskb))]
+pub unsafe fn _m_pmovmskb(a: i16x4) -> i32 {
+    _mm_movemask_pi8(a)
+}
+
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
/// destination, as specified by the immediate value operand.
#[inline(always)]
@@ -166,6 +338,15 @@ pub unsafe fn _mm_shuffle_pi16(a: i16x4, imm8: i8) -> i16x4 {
    constify_imm8!(imm8, call)
}

+/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
+/// destination, as specified by the immediate value operand.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pshufw, imm8 = 0))]
+pub unsafe fn _m_pshufw(a: i16x4, imm8: i8) -> i16x4 {
+    _mm_shuffle_pi16(a, imm8)
+}
+
/// Convert the two lower packed single-precision (32-bit) floating-point
/// elements in `a` to packed 32-bit integers with truncation.
#[inline(always)]
@@ -229,7 +410,7 @@ pub unsafe fn _mm_cvtps_pi8(a: f32x4) -> i8x8 {
#[cfg(test)]
mod tests {
    use v128::f32x4;
-    use v64::{i16x4, i32x2, i8x8, u8x8};
+    use v64::{i16x4, i32x2, i8x8, u16x4, u8x8};
    use x86::i686::sse;
    use stdsimd_test::simd_test;
@@ -274,21 +455,87 @@ mod tests {
    }

    #[simd_test = "sse"]
-    unsafe fn _mm_cvt_pi2ps() {
+    unsafe fn _mm_mulhi_pu16() {
+        let (a, b) = (u16x4::splat(1000), u16x4::splat(1001));
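+        // 1000 * 1001 = 1_001_000; the high 16 bits of each 32-bit product
+        // are 1_001_000 >> 16 = 15.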
+        let r = sse::_mm_mulhi_pu16(a, b);
+        assert_eq!(r, u16x4::splat(15));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_avg_pu8() {
+        let (a, b) = (u8x8::splat(3), u8x8::splat(9));
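+        // `pavgb` rounds upward: (3 + 9 + 1) >> 1 = 6.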
+        let r = sse::_mm_avg_pu8(a, b);
+        assert_eq!(r, u8x8::splat(6));
+
+        let r = sse::_m_pavgb(a, b);
+        assert_eq!(r, u8x8::splat(6));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_avg_pu16() {
+        let (a, b) = (u16x4::splat(3), u16x4::splat(9));
+        let r = sse::_mm_avg_pu16(a, b);
+        assert_eq!(r, u16x4::splat(6));
+
+        let r = sse::_m_pavgw(a, b);
+        assert_eq!(r, u16x4::splat(6));
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_sad_pu8() {
+        let a = u8x8::new(255, 254, 253, 252, 1, 2, 3, 4);
+        let b = u8x8::new(0, 0, 0, 0, 2, 1, 2, 1);
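+        // |255-0| + |254-0| + |253-0| + |252-0| = 1014, and
+        // |1-2| + |2-1| + |3-2| + |4-1| = 6, so the sum is 1020.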
+        let r = sse::_mm_sad_pu8(a, b);
+        assert_eq!(r, 1020);
+
+        let r = sse::_m_psadbw(a, b);
+        assert_eq!(r, 1020);
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtpi32_ps() {
        let a = f32x4::new(0., 0., 3., 4.);
        let b = i32x2::new(1, 2);
        let expected = f32x4::new(1., 2., 3., 4.);
+        let r = sse::_mm_cvtpi32_ps(a, b);
+        assert_eq!(r, expected);
+
        let r = sse::_mm_cvt_pi2ps(a, b);
        assert_eq!(r, expected);
    }

+    #[simd_test = "sse"]
+    unsafe fn _mm_cvtpi32x2_ps() {
+        let a = i32x2::new(1, 2);
+        let b = i32x2::new(3, 4);
+        let expected = f32x4::new(1., 2., 3., 4.);
+        let r = sse::_mm_cvtpi32x2_ps(a, b);
+        assert_eq!(r, expected);
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_maskmove_si64() {
+        let a = i8x8::splat(9);
+        let mask = i8x8::splat(0).replace(2, 0x80u8 as i8);
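+        // Only lane 2 of the mask has its most significant bit set, so only
+        // byte 2 of `a` is written to memory.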
+        let mut r = i8x8::splat(0);
+        sse::_mm_maskmove_si64(a, mask, &mut r as *mut _ as *mut i8);
+        assert_eq!(r, i8x8::splat(0).replace(2, 9));
+
+        let mut r = i8x8::splat(0);
+        sse::_m_maskmovq(a, mask, &mut r as *mut _ as *mut i8);
+        assert_eq!(r, i8x8::splat(0).replace(2, 9));
+    }
+
    #[simd_test = "sse"]
    unsafe fn _mm_extract_pi16() {
        let a = i16x4::new(1, 2, 3, 4);
        let r = sse::_mm_extract_pi16(a, 0);
        assert_eq!(r, 1);
        let r = sse::_mm_extract_pi16(a, 1);
        assert_eq!(r, 2);
+
+        let r = sse::_m_pextrw(a, 1);
+        assert_eq!(r, 2);
    }

    #[simd_test = "sse"]
@@ -300,13 +547,19 @@ mod tests {
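+        // imm2 == 0b10 selects lane 2 as the insertion slot.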
        let r = sse::_mm_insert_pi16(a, 0, 0b10);
        let expected = i16x4::new(1, 2, 0, 4);
        assert_eq!(r, expected);
+
+        let r = sse::_m_pinsrw(a, 0, 0b10);
+        assert_eq!(r, expected);
    }

    #[simd_test = "sse"]
    unsafe fn _mm_movemask_pi8() {
        let a = i16x4::new(0b1000_0000, 0b0100_0000, 0b1000_0000, 0b0100_0000);
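+        // Viewed as bytes, only bytes 0 and 4 (the low bytes of lanes 0 and
+        // 2) have their most significant bit set, giving mask 0b1_0001.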
        let r = sse::_mm_movemask_pi8(a);
        assert_eq!(r, 0b10001);
+
+        let r = sse::_m_pmovmskb(a);
+        assert_eq!(r, 0b10001);
    }

    #[simd_test = "sse"]
@@ -315,6 +568,9 @@ mod tests {
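+        // Each 2-bit field of the immediate selects a source lane, starting
+        // from the low bits: 0b00_01_01_11 picks lanes 3, 1, 1, 0.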
        let r = sse::_mm_shuffle_pi16(a, 0b00_01_01_11);
        let expected = i16x4::new(4, 2, 2, 1);
        assert_eq!(r, expected);
+
+        let r = sse::_m_pshufw(a, 0b00_01_01_11);
+        assert_eq!(r, expected);
    }

    #[simd_test = "sse"]