Skip to content

Commit ac2f14d

Browse files
Engininja2YellowRoseCx
authored andcommitted
AMD assembly optimized __dp4a
Doesn't seem to work for gfx900, so commented out.
1 parent 9dba0c9 commit ac2f14d

File tree

1 file changed

+20
-0
lines changed

1 file changed

+20
-0
lines changed

ggml-cuda.cu

+20
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,26 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
8888
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
8989
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
9090
c = __builtin_amdgcn_sdot4(a, b, c, false);
91+
#elif defined(__gfx1010__)// || defined(__gfx900__)
92+
int ashift;
93+
int bshift;
94+
int aext;
95+
int bext;
96+
asm("\n \
97+
v_pk_ashrrev_i16 %1, 0x80008, %5 \n \
98+
v_pk_ashrrev_i16 %2, 0x80008, %6 \n \
99+
v_mov_b32_sdwa %3, sext(%5) dst_sel:WORD_1 src0_sel:BYTE_2 \n \
100+
v_mov_b32_sdwa %3, sext(%5) dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 \n \
101+
v_mov_b32_sdwa %4, sext(%6) dst_sel:WORD_1 src0_sel:BYTE_2 \n \
102+
v_mov_b32_sdwa %4, sext(%6) dst_sel:WORD_0 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 \n \
103+
v_mad_i32_i16 %0, %1, %2, %0 op_sel:[0, 0, 0, 0] \n \
104+
v_mad_i32_i16 %0, %1, %2, %0 op_sel:[1, 1, 0, 0] \n \
105+
v_mad_i32_i16 %0, %3, %4, %0 op_sel:[0, 0, 0, 0] \n \
106+
v_mad_i32_i16 %0, %3, %4, %0 op_sel:[1, 1, 0, 0] \n \
107+
"
108+
: "+v"(c), "=&v"(ashift), "=&v"(bshift), "=&v"(aext), "=&v"(bext)
109+
: "v"(a), "v"(b)
110+
);
91111
#else
92112
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
93113
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);

0 commit comments

Comments
 (0)