Commit 6c285d6

jan-wassenberg authored and copybara-github committed
Add StoreInterleaved2. Refs #641
PiperOrigin-RevId: 444281375
1 parent 9e436d9 · commit 6c285d6

12 files changed: +288 −5 lines

g3doc/quick_reference.md (+7 −3)

@@ -790,11 +790,15 @@ F(src[tbl[i]])` because `Scatter` is more expensive than `Gather`.
     not fault, unlike `BlendedStore`. No alignment requirement. Potentially
     non-atomic, like `BlendedStore`.
 
+*   `D`: `u8` \
+    <code>void **StoreInterleaved2**(Vec&lt;D&gt; v0, Vec&lt;D&gt; v1, D, T*
+    p)</code>: equivalent to shuffling `v0, v1` followed by two `StoreU()`, such
+    that `p[0] == v0[0], p[1] == v1[0]`.
+
 *   `D`: `u8` \
     <code>void **StoreInterleaved3**(Vec&lt;D&gt; v0, Vec&lt;D&gt; v1,
-    Vec&lt;D&gt; v2, D, T* p)</code>: equivalent to shuffling `v0, v1, v2`
-    followed by three `StoreU()`, such that `p[0] == v0[0], p[1] == v1[0],
-    p[2] == v1[0]`. Useful for RGB samples.
+    Vec&lt;D&gt; v2, D, T* p)</code>: as above, but for three vectors (e.g. RGB
+    samples).
 
 *   `D`: `u8` \
     <code>void **StoreInterleaved4**(Vec&lt;D&gt; v0, Vec&lt;D&gt; v1,
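
For context, a hypothetical usage sketch of the new op (not part of this commit; the function name is invented, and the usual HWY_NAMESPACE/target-dispatch boilerplate plus the tail loop for counts not divisible by N are omitted): interleaving two planar u8 channels, e.g. gray and alpha, into one packed stream.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void InterleavePlanes(const uint8_t* HWY_RESTRICT plane0,
                      const uint8_t* HWY_RESTRICT plane1,
                      uint8_t* HWY_RESTRICT packed, size_t count) {
  const hn::ScalableTag<uint8_t> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i + N <= count; i += N) {
    const auto v0 = hn::LoadU(d, plane0 + i);
    const auto v1 = hn::LoadU(d, plane1 + i);
    // Writes 2*N bytes such that packed[2*i] == plane0[i] and
    // packed[2*i + 1] == plane1[i], per the documented contract.
    hn::StoreInterleaved2(v0, v1, d, packed + 2 * i);
  }
}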

hwy/ops/arm_neon-inl.h (+31)

@@ -5334,6 +5334,37 @@ HWY_API size_t CompressBitsStore(Vec128<T, N> v,
   return PopCount(mask_bits);
 }
 
+// ------------------------------ StoreInterleaved2
+
+// 128 bits
+HWY_API void StoreInterleaved2(const Vec128<uint8_t> v0,
+                               const Vec128<uint8_t> v1,
+                               Full128<uint8_t> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  const uint8x16x2_t pair = {{v0.raw, v1.raw}};
+  vst2q_u8(unaligned, pair);
+}
+
+// 64 bits
+HWY_API void StoreInterleaved2(const Vec64<uint8_t> v0, const Vec64<uint8_t> v1,
+                               Full64<uint8_t> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  const uint8x8x2_t pair = {{v0.raw, v1.raw}};
+  vst2_u8(unaligned, pair);
+}
+
+// <= 32 bits: avoid writing more than N bytes by copying to buffer
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
+HWY_API void StoreInterleaved2(const Vec128<uint8_t, N> v0,
+                               const Vec128<uint8_t, N> v1,
+                               Simd<uint8_t, N, 0> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  alignas(16) uint8_t buf[16];
+  const uint8x8x2_t pair = {{v0.raw, v1.raw}};
+  vst2_u8(buf, pair);
+  CopyBytes<N * 2>(buf, unaligned);
+}
+
 // ------------------------------ StoreInterleaved3
 
 // 128 bits
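
The NEON implementation maps directly onto the hardware's interleaving store: vst2q_u8 writes two registers to memory in a0 b0 a1 b1 ... order, so no explicit shuffle is needed; the <= 32-bit overload stages through a stack buffer only because vst2_u8 always writes 16 bytes. A minimal standalone sketch of the intrinsic (function name invented; not from this commit):

#include <arm_neon.h>
#include <stdint.h>

void Vst2Demo(const uint8_t a[16], const uint8_t b[16], uint8_t out[32]) {
  const uint8x16x2_t pair = {{vld1q_u8(a), vld1q_u8(b)}};
  vst2q_u8(out, pair);  // out = a[0] b[0] a[1] b[1] .. a[15] b[15]
}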

hwy/ops/arm_sve-inl.h (+14)

@@ -1009,6 +1009,20 @@ HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_INDEX, GatherIndex, ld1_gather)
 #undef HWY_SVE_GATHER_OFFSET
 #undef HWY_SVE_GATHER_INDEX
 
+// ------------------------------ StoreInterleaved2
+
+#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP)                 \
+  template <size_t N, int kPow2>                                         \
+  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1,  \
+                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                   \
+                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {    \
+    const sv##BASE##BITS##x2_t tuple = svcreate2##_##CHAR##BITS(v0, v1); \
+    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, tuple);        \
+  }
+HWY_SVE_FOREACH_U08(HWY_SVE_STORE2, StoreInterleaved2, st2)
+
+#undef HWY_SVE_STORE2
+
 // ------------------------------ StoreInterleaved3
 
 #define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP) \
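
Unlike NEON, SVE's st2 takes a governing predicate, so partial vectors need no buffer-and-copy path: detail::MakeMask(d) restricts the store to the lanes in use. A standalone sketch of the underlying ACLE calls (function name invented; assumes SVE compiler support; not from this commit):

#include <arm_sve.h>
#include <stdint.h>

void Svst2Demo(svuint8_t a, svuint8_t b, uint8_t* out, uint64_t n) {
  const svbool_t active = svwhilelt_b8(uint64_t{0}, n);  // first n lanes on
  const svuint8x2_t pair = svcreate2_u8(a, b);
  svst2_u8(active, out, pair);  // out = a[0] b[0] a[1] b[1] ...
}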

hwy/ops/emu128-inl.h (+12 −1)

@@ -1201,7 +1201,18 @@ HWY_API void BlendedStore(const Vec128<T, N> v, Mask128<T, N> m,
   }
 }
 
-// ------------------------------ StoreInterleaved3
+// ------------------------------ StoreInterleaved2/3/4
+
+template <size_t N>
+HWY_API void StoreInterleaved2(const Vec128<uint8_t, N> v0,
+                               const Vec128<uint8_t, N> v1,
+                               Simd<uint8_t, N, 0> /* tag */,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  for (size_t i = 0; i < N; ++i) {
+    *unaligned++ = v0.raw[i];
+    *unaligned++ = v1.raw[i];
+  }
+}
 
 template <size_t N>
 HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> v0,

hwy/ops/rvv-inl.h (+24)

@@ -1386,6 +1386,30 @@ HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
   return GatherOffset(d, base, ShiftLeft<3>(index));
 }
 
+// ------------------------------ StoreInterleaved2
+
+#define HWY_RVV_STORE2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
+                       MLEN, NAME, OP)                                         \
+  template <size_t N>                                                          \
+  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v0,                             \
+                    HWY_RVV_V(BASE, SEW, LMUL) v1,                             \
+                    HWY_RVV_D(BASE, SEW, N, SHIFT) d,                          \
+                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) {           \
+    return v##OP##e8_v_##CHAR##SEW##LMUL(unaligned, v0, v1, Lanes(d));         \
+  }
+// Segments are limited to 8 registers, so we can only go up to LMUL=2.
+HWY_RVV_STORE2(uint, u, 8, _, _, mf8, _, _, /*kShift=*/-3, 64,
+               StoreInterleaved2, sseg2)
+HWY_RVV_STORE2(uint, u, 8, _, _, mf4, _, _, /*kShift=*/-2, 32,
+               StoreInterleaved2, sseg2)
+HWY_RVV_STORE2(uint, u, 8, _, _, mf2, _, _, /*kShift=*/-1, 16,
+               StoreInterleaved2, sseg2)
+HWY_RVV_STORE2(uint, u, 8, _, _, m1, _, _, /*kShift=*/0, 8, StoreInterleaved2,
+               sseg2)
+HWY_RVV_STORE2(uint, u, 8, _, _, m2, _, _, /*kShift=*/1, 4, StoreInterleaved2,
+               sseg2)
+#undef HWY_RVV_STORE2
+
 // ------------------------------ StoreInterleaved3
 
 #define HWY_RVV_STORE3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
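
Because the macro layers are dense, here is roughly what the m1 instantiation above expands to (a sketch using the v0.10 RVV intrinsic naming in use at the time of this commit):

template <size_t N>
HWY_API void StoreInterleaved2(vuint8m1_t v0, vuint8m1_t v1,
                               Simd<uint8_t, N, 0> d,
                               uint8_t* HWY_RESTRICT unaligned) {
  // vsseg2e8: unit-stride segment store of two fields, 8-bit elements.
  return vsseg2e8_v_u8m1(unaligned, v0, v1, Lanes(d));
}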

hwy/ops/scalar-inl.h (+8 −1)

@@ -954,7 +954,14 @@ HWY_API void BlendedStore(const Vec1<T> v, Mask1<T> m, Sisd<T> d,
   StoreU(v, d, p);
 }
 
-// ------------------------------ StoreInterleaved3
+// ------------------------------ StoreInterleaved2/3/4
+
+HWY_API void StoreInterleaved2(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
+                               Sisd<uint8_t> d,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  StoreU(v0, d, unaligned + 0);
+  StoreU(v1, d, unaligned + 1);
+}
 
 HWY_API void StoreInterleaved3(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
                                const Vec1<uint8_t> v2, Sisd<uint8_t> d,

hwy/ops/wasm_128-inl.h (+47)

@@ -3767,6 +3767,53 @@ HWY_API size_t CompressBitsStore(Vec128<T, N> v,
   return PopCount(mask_bits);
 }
 
+// ------------------------------ StoreInterleaved2
+
+// 128 bits
+HWY_API void StoreInterleaved2(const Vec128<uint8_t> v0,
+                               const Vec128<uint8_t> v1, Full128<uint8_t> d8,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  const RepartitionToWide<decltype(d8)> d16;
+  // let a,b denote v0,v1.
+  const auto ba0 = ZipLower(d16, v0, v1);  // b7 a7 .. b0 a0
+  const auto ba8 = ZipUpper(d16, v0, v1);
+  StoreU(BitCast(d8, ba0), d8, unaligned + 0 * 16);
+  StoreU(BitCast(d8, ba8), d8, unaligned + 1 * 16);
+}
+
+// 64 bits
+HWY_API void StoreInterleaved2(const Vec64<uint8_t> in0,
+                               const Vec64<uint8_t> in1,
+                               Full64<uint8_t> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // Use full vectors to reduce the number of stores.
+  const Full128<uint8_t> d_full8;
+  const RepartitionToWide<decltype(d_full8)> d16;
+  const Vec128<uint8_t> v0{in0.raw};
+  const Vec128<uint8_t> v1{in1.raw};
+  // let a,b denote v0,v1.
+  const auto ba0 = ZipLower(d16, v0, v1);  // b7 a7 .. b0 a0
+  StoreU(BitCast(d_full8, ba0), d_full8, unaligned + 0 * 16);
+}
+
+// <= 32 bits
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
+HWY_API void StoreInterleaved2(const Vec128<uint8_t, N> in0,
+                               const Vec128<uint8_t, N> in1,
+                               Simd<uint8_t, N, 0> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // Use full vectors to reduce the number of stores.
+  const Full128<uint8_t> d_full8;
+  const RepartitionToWide<decltype(d_full8)> d16;
+  const Vec128<uint8_t> v0{in0.raw};
+  const Vec128<uint8_t> v1{in1.raw};
+  // let a,b denote v0,v1.
+  const auto ba0 = ZipLower(d16, v0, v1);  // b3 a3 .. b0 a0
+  alignas(16) uint8_t buf[16];
+  StoreU(BitCast(d_full8, ba0), d_full8, buf);
+  CopyBytes<2 * N>(buf, unaligned);
+}
+
 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
 // TableLookupBytes)

hwy/ops/wasm_256-inl.h (+8)

@@ -2808,6 +2808,14 @@ HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
   return PopCount(mask_bits);
 }
 
+// ------------------------------ StoreInterleaved2
+
+HWY_API void StoreInterleaved2(const Vec256<uint8_t> a, const Vec256<uint8_t> b,
+                               Full256<uint8_t> d,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  HWY_ASSERT(0);
+}
+
 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
 // TableLookupBytes)

hwy/ops/x86_128-inl.h (+47)

@@ -6351,6 +6351,53 @@ HWY_API size_t CompressBitsStore(Vec128<T, N> v,
 
 #endif  // HWY_TARGET <= HWY_AVX3
 
+// ------------------------------ StoreInterleaved2
+
+// 128 bits
+HWY_API void StoreInterleaved2(const Vec128<uint8_t> v0,
+                               const Vec128<uint8_t> v1, Full128<uint8_t> d8,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  const RepartitionToWide<decltype(d8)> d16;
+  // let a,b denote v0,v1.
+  const auto ba0 = ZipLower(d16, v0, v1);  // b7 a7 .. b0 a0
+  const auto ba8 = ZipUpper(d16, v0, v1);
+  StoreU(BitCast(d8, ba0), d8, unaligned + 0 * 16);
+  StoreU(BitCast(d8, ba8), d8, unaligned + 1 * 16);
+}
+
+// 64 bits
+HWY_API void StoreInterleaved2(const Vec64<uint8_t> in0,
+                               const Vec64<uint8_t> in1,
+                               Full64<uint8_t> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // Use full vectors to reduce the number of stores.
+  const Full128<uint8_t> d_full8;
+  const RepartitionToWide<decltype(d_full8)> d16;
+  const Vec128<uint8_t> v0{in0.raw};
+  const Vec128<uint8_t> v1{in1.raw};
+  // let a,b denote v0,v1.
+  const auto ba0 = ZipLower(d16, v0, v1);  // b7 a7 .. b0 a0
+  StoreU(BitCast(d_full8, ba0), d_full8, unaligned + 0 * 16);
+}
+
+// <= 32 bits
+template <size_t N, HWY_IF_LE32(uint8_t, N)>
+HWY_API void StoreInterleaved2(const Vec128<uint8_t, N> in0,
+                               const Vec128<uint8_t, N> in1,
+                               Simd<uint8_t, N, 0> /*tag*/,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  // Use full vectors to reduce the number of stores.
+  const Full128<uint8_t> d_full8;
+  const RepartitionToWide<decltype(d_full8)> d16;
+  const Vec128<uint8_t> v0{in0.raw};
+  const Vec128<uint8_t> v1{in1.raw};
+  // let a,b denote v0,v1.
+  const auto ba0 = ZipLower(d16, v0, v1);  // b3 a3 .. b0 a0
+  alignas(16) uint8_t buf[16];
+  StoreU(BitCast(d_full8, ba0), d_full8, buf);
+  CopyBytes<2 * N>(buf, unaligned);
+}
+
 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
 // TableLookupBytes)

hwy/ops/x86_256-inl.h (+17)

@@ -4786,6 +4786,23 @@ HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
 
 #endif  // HWY_TARGET <= HWY_AVX3
 
+// ------------------------------ StoreInterleaved2
+
+HWY_API void StoreInterleaved2(const Vec256<uint8_t> v0,
+                               const Vec256<uint8_t> v1, Full256<uint8_t> d8,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  const RepartitionToWide<decltype(d8)> d16;
+  // let a,b denote v0,v1.
+  const auto ba0 = ZipLower(d16, v0, v1);  // b7 a7 .. b0 a0
+  const auto ba8 = ZipUpper(d16, v0, v1);
+  // Write lower halves, then upper. vperm2i128 is slow on Zen1 but we can
+  // efficiently combine two lower halves into 256 bits:
+  const auto out0 = BitCast(d8, ConcatLowerLower(d16, ba8, ba0));
+  const auto out1 = BitCast(d8, ConcatUpperUpper(d16, ba8, ba0));
+  StoreU(out0, d8, unaligned + 0 * 32);
+  StoreU(out1, d8, unaligned + 1 * 32);
+}
+
 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
 // TableLookupBytes, ConcatUpperLower)
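
Why the Concat fix-up is needed: AVX2's unpack instructions operate within 128-bit blocks, so storing ba0 then ba8 would emit blocks in the order ba0.lo, ba0.hi, ba8.lo, ba8.hi, whereas memory order requires ba0.lo, ba8.lo, ba0.hi, ba8.hi. A block-level model (a sketch; each Block stands for 128 bits, names invented):

#include <stdint.h>

struct Block { uint8_t bytes[16]; };

// ba0/ba8 are the ZipLower/ZipUpper results, each two 128-bit blocks.
void Avx2BlockOrder(const Block ba0[2], const Block ba8[2], Block out[4]) {
  out[0] = ba0[0];  // ConcatLowerLower(d16, ba8, ba0): both lower blocks
  out[1] = ba8[0];
  out[2] = ba0[1];  // ConcatUpperUpper(d16, ba8, ba0): both upper blocks
  out[3] = ba8[1];
}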

hwy/ops/x86_512-inl.h (+20)

@@ -3678,6 +3678,26 @@ HWY_API size_t CompressBitsStore(Vec512<T> v, const uint8_t* HWY_RESTRICT bits,
   return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
 }
 
+// ------------------------------ StoreInterleaved2
+
+HWY_API void StoreInterleaved2(const Vec512<uint8_t> v0,
+                               const Vec512<uint8_t> v1, Full512<uint8_t> d8,
+                               uint8_t* HWY_RESTRICT unaligned) {
+  const RepartitionToWide<decltype(d8)> d16;
+  // let a,b denote v0,v1.
+  const auto i = ZipLower(d16, v0, v1);  // b7 a7 .. b0 a0 in lower 128 bits
+  const auto j = ZipUpper(d16, v0, v1);
+  // 2x4 transpose: interleave 128-bit blocks.
+  const __m512i j1_j0_i1_i0 = _mm512_shuffle_i64x2(i.raw, j.raw, _MM_PERM_BABA);
+  const __m512i j3_j2_i3_i2 = _mm512_shuffle_i64x2(i.raw, j.raw, _MM_PERM_DCDC);
+  const __m512i j1_i1_j0_i0 =
+      _mm512_shuffle_i64x2(j1_j0_i1_i0, j1_j0_i1_i0, _MM_PERM_DBCA);
+  const __m512i j3_i3_j2_i2 =
+      _mm512_shuffle_i64x2(j3_j2_i3_i2, j3_j2_i3_i2, _MM_PERM_DBCA);
+  StoreU(Vec512<uint8_t>{j1_i1_j0_i0}, d8, unaligned + 0 * 64);
+  StoreU(Vec512<uint8_t>{j3_i3_j2_i2}, d8, unaligned + 1 * 64);
+}
+
 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
 // TableLookupBytes)
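
The AVX-512 version is the same idea extended to four blocks per input: i holds the low-half interleavings of 128-bit blocks 0..3, j the high halves, and the two rounds of _mm512_shuffle_i64x2 arrange them into memory order. Extending the block-level model from the AVX2 sketch above (reusing Block = 128 bits; names invented):

// First round groups (i0 i1 j0 j1) and (i2 i3 j2 j3); the _MM_PERM_DBCA
// round then interleaves within each group.
void Avx512BlockOrder(const Block i[4], const Block j[4], Block out[8]) {
  out[0] = i[0]; out[1] = j[0]; out[2] = i[1]; out[3] = j[1];  // first StoreU
  out[4] = i[2]; out[5] = j[2]; out[6] = i[3]; out[7] = j[3];  // second StoreU
}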

hwy/tests/memory_test.cc (+53)

@@ -131,6 +131,58 @@ HWY_NOINLINE void TestAllSafeCopyN() {
   ForAllTypes(ForPartialVectors<TestSafeCopyN>());
 }
 
+struct TestStoreInterleaved2 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+
+    RandomState rng;
+
+    // Data to be interleaved
+    auto bytes = AllocateAligned<uint8_t>(2 * N);
+    for (size_t i = 0; i < 2 * N; ++i) {
+      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+    }
+    const auto in0 = Load(d, &bytes[0 * N]);
+    const auto in1 = Load(d, &bytes[1 * N]);
+
+    // Interleave here, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(3 * N);
+    auto actual_aligned = AllocateAligned<T>(3 * N + 1);
+    T* actual = actual_aligned.get() + 1;
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        expected[2 * i + 0] = bytes[0 * N + i];
+        expected[2 * i + 1] = bytes[1 * N + i];
+        // Ensure we do not write more than 2*N bytes
+        expected[2 * N + i] = actual[2 * N + i] = 0;
+      }
+      StoreInterleaved2(in0, in1, d, actual);
+      size_t pos = 0;
+      if (!BytesEqual(expected.get(), actual, 3 * N, &pos)) {
+        Print(d, "in0", in0, pos / 2);
+        Print(d, "in1", in1, pos / 2);
+        const size_t i = pos;
+        fprintf(stderr, "interleaved %d %d %d %d %d %d %d %d\n", actual[i],
+                actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4],
+                actual[i + 5], actual[i + 6], actual[i + 7]);
+        HWY_ASSERT(false);
+      }
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllStoreInterleaved2() {
+#if HWY_TARGET == HWY_RVV
+  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+  const ForExtendableVectors<TestStoreInterleaved2, 2> test;
+#else
+  const ForPartialVectors<TestStoreInterleaved2> test;
+#endif
+  test(uint8_t());
+}
+
 struct TestStoreInterleaved3 {
   template <class T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -443,6 +495,7 @@ namespace hwy {
 HWY_BEFORE_TEST(HwyMemoryTest);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllSafeCopyN);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved2);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved3);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved4);
 HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);