From 0490bcc63eb8c71255a6796e0c481c51f13a9eab Mon Sep 17 00:00:00 2001 From: Ivan Tham Date: Sun, 21 Jun 2020 17:35:33 +0800 Subject: [PATCH 1/8] Add _mm_loadu_si64 Fix #40 --- crates/core_arch/src/x86/sse.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index 3de09ca964..ed25280576 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -1251,6 +1251,25 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { simd_shuffle4(a, a, [3, 2, 1, 0]) } +/// Loads unaligned 64-bits of integer data from memory into new vector. +/// +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64) +#[inline] +#[target_feature(enable = "sse")] +#[cfg_attr(test, assert_instr(movups))] +#[stable(feature = "simd_x86", since = "1.46.0")] +pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i { + let mut dst = _mm_setzero_si128(); + ptr::copy_nonoverlapping( + mem_addr, + &mut dst as *mut __m128i as *mut u8, + 8, // == 64 bits == mem::size_of::<__m128i>() / 2 + ); + dst +} + /// Stores the upper half of `a` (64 bits) into memory. /// /// This intrinsic corresponds to the `MOVHPS` instruction. 
The compiler may @@ -3658,6 +3677,13 @@ mod tests { assert_eq_m128(r, e); } + #[simd_test(enable = "sse2")] + unsafe fn test_mm_loadu_si64() { + let a = _mm_set_epi64x(5, 0); + let r = _mm_loadu_si64(&a as *const _ as *const _); + assert_eq_m128i(a, r); + } + #[simd_test(enable = "sse")] unsafe fn test_mm_storeh_pi() { let mut vals = [0.0f32; 8]; From e262ee25df654b9e8eae9d4054b01929f1eb53ce Mon Sep 17 00:00:00 2001 From: Ivan Tham Date: Sun, 21 Jun 2020 23:25:37 +0800 Subject: [PATCH 2/8] Simplify _mm_loadu_si64 As suggested by @Amanieu --- crates/core_arch/src/x86/sse.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index ed25280576..9b040b498d 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -1259,15 +1259,9 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movups))] -#[stable(feature = "simd_x86", since = "1.46.0")] +#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")] pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i { - let mut dst = _mm_setzero_si128(); - ptr::copy_nonoverlapping( - mem_addr, - &mut dst as *mut __m128i as *mut u8, - 8, // == 64 bits == mem::size_of::<__m128i>() / 2 - ); - dst + _mm_set_epi64x(ptr::read_unaligned(mem_addr as *const i64), 0) } /// Stores the upper half of `a` (64 bits) into memory. 
From ff9299c929f8e6ca46f51fec1bdc40b332659777 Mon Sep 17 00:00:00 2001 From: Ivan Tham Date: Sun, 12 Jul 2020 01:04:45 +0800 Subject: [PATCH 3/8] Assert _mm_loadu_si64 uses movq As suggested by @Amanieu --- crates/core_arch/src/x86/sse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index 9b040b498d..a5019e60a2 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -1258,7 +1258,7 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movups))] +#[cfg_attr(test, assert_instr(movq))] #[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")] pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i { _mm_set_epi64x(ptr::read_unaligned(mem_addr as *const i64), 0) From 0d252c52f7b6d57f6970f358055d557d65e0767e Mon Sep 17 00:00:00 2001 From: Ivan Tham Date: Sun, 12 Jul 2020 01:16:41 +0800 Subject: [PATCH 4/8] Fix _mm_loadu_si64 test Expect the loaded value in the low 64 bits with the upper half zeroed, as Intel defines the intrinsic. --- crates/core_arch/src/x86/sse.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index a5019e60a2..c872a61695 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -3673,9 +3673,9 @@ mod tests { #[simd_test(enable = "sse2")] unsafe fn test_mm_loadu_si64() { - let a = _mm_set_epi64x(5, 0); + let a = _mm_setr_epi64x(5, 6); let r = _mm_loadu_si64(&a as *const _ as *const _); - assert_eq_m128i(a, r); + assert_eq_m128i(r, _mm_setr_epi64x(5, 0)); } #[simd_test(enable = "sse")] From e58d0dda0dd7fdc10d91e2b2769fbd148063a5c5 Mon Sep 17 00:00:00 2001 From: Ivan Tham Date: Mon, 13 Jul 2020 01:31:18 +0800 Subject: [PATCH 5/8] Make _mm_loadu_si64 not depends on sse2 Build the result from i64x2 with the loaded value in lane 0 (low 64 bits) and lane 1 zeroed; this also corrects the lane order. --- crates/core_arch/src/x86/sse.rs | 2 +- 1 file changed, 1
insertion(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index c872a61695..dad3701423 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -1261,7 +1261,7 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { #[cfg_attr(test, assert_instr(movq))] #[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")] pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i { - _mm_set_epi64x(ptr::read_unaligned(mem_addr as *const i64), 0) + transmute(i64x2(ptr::read_unaligned(mem_addr as *const i64), 0)) } /// Stores the upper half of `a` (64 bits) into memory. From d84542f0034166e9a50990d168bf0bf13b7d3afb Mon Sep 17 00:00:00 2001 From: Ivan Tham Date: Mon, 13 Jul 2020 22:23:34 +0800 Subject: [PATCH 6/8] Disable assertion for 32-bit x86 for _mm_loadu_si64 --- crates/core_arch/src/x86/sse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index dad3701423..0840e90e74 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -1258,7 +1258,7 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(test, assert_instr(movq))] +#[cfg_attr(all(test, not(target = "x86")), assert_instr(movq))] #[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")] pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i { transmute(i64x2(ptr::read_unaligned(mem_addr as *const i64), 0)) From 08a41767c0cebfbc5a6695e16c54cbac3e69ed38 Mon Sep 17 00:00:00 2001 From: Ivan Tham Date: Tue, 14 Jul 2020 22:49:36 +0800 Subject: [PATCH 7/8] Remove const_transmute feature --- crates/stdarch-test/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs index
fa73a7bba6..38025b1701 100644 --- a/crates/stdarch-test/src/lib.rs +++ b/crates/stdarch-test/src/lib.rs @@ -3,7 +3,6 @@ //! This basically just disassembles the current executable and then parses the //! output once globally and then provides the `assert` function which makes //! assertions about the disassembly of a function. -#![feature(const_transmute)] #![feature(vec_leak)] #![allow(clippy::missing_docs_in_private_items, clippy::print_stdout)] From 2f24f8913faaa960a689b81dad8552ba22bf2ed3 Mon Sep 17 00:00:00 2001 From: Ivan Tham Date: Thu, 16 Jul 2020 21:43:11 +0800 Subject: [PATCH 8/8] Update crates/core_arch/src/x86/sse.rs Co-authored-by: Amanieu d'Antras --- crates/core_arch/src/x86/sse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index 0840e90e74..551da7652b 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -1258,7 +1258,7 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64) #[inline] #[target_feature(enable = "sse")] -#[cfg_attr(all(test, not(target = "x86")), assert_instr(movq))] +#[cfg_attr(all(test, not(target_arch = "x86")), assert_instr(movq))] #[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")] pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i { transmute(i64x2(ptr::read_unaligned(mem_addr as *const i64), 0))