@@ -88,6 +88,26 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
88
88
static __device__ __forceinline__ int __dp4a (const int a, const int b, int c) {
89
89
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
90
90
c = __builtin_amdgcn_sdot4 (a, b, c, false );
91
+ #elif defined(__gfx1010__)// || defined(__gfx900__)
92
+ int ashift;
93
+ int bshift;
94
+ int aext;
95
+ int bext;
96
+ asm (" \n \
97
+ v_pk_ashrrev_i16 %1, 0x80008, %5 \n \
98
+ v_pk_ashrrev_i16 %2, 0x80008, %6 \n \
99
+ v_mov_b32_sdwa %3, sext(%5) dst_sel:WORD_1 src0_sel:BYTE_2 \n \
100
+ v_mov_b32_sdwa %3, sext(%5) dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 \n \
101
+ v_mov_b32_sdwa %4, sext(%6) dst_sel:WORD_1 src0_sel:BYTE_2 \n \
102
+ v_mov_b32_sdwa %4, sext(%6) dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 \n \
103
+ v_mad_i32_i16 %0, %1, %2, %0 op_sel:[0, 0, 0, 0] \n \
104
+ v_mad_i32_i16 %0, %1, %2, %0 op_sel:[1, 1, 0, 0] \n \
105
+ v_mad_i32_i16 %0, %3, %4, %0 op_sel:[0, 0, 0, 0] \n \
106
+ v_mad_i32_i16 %0, %3, %4, %0 op_sel:[1, 1, 0, 0] \n \
107
+ "
108
+ : " +v" (c), " =&v" (ashift), " =&v" (bshift), " =&v" (aext), " =&v" (bext)
109
+ : " v" (a), " v" (b)
110
+ );
91
111
#else
92
112
const int8x4_t va = reinterpret_cast <const int8x4_t &>(a);
93
113
const int8x4_t vb = reinterpret_cast <const int8x4_t &>(b);
0 commit comments