We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
2 parents 93dbb26 + 8e37f2d commit b12acc6 (Copy full SHA for b12acc6)
exllama_ext/cuda_func/q4_matmul.cu
@@ -8,10 +8,10 @@
8
#include "../hip_compat.cuh"
9
#endif
10
11
-const int THREADS_X = 128; // Block size and thread count along columns in w and out
+const int THREADS_X = 32; // Block size and thread count along columns in w and out
12
const int THREADS_Y = 1; // Block size and thread count along rows in x and out
13
14
-const int GROUP_STEP = 128; // Assumed group size when block_size_z % groupsize != 0
+const int GROUP_STEP = 32; // Assumed group size when block_size_z % groupsize != 0
15
16
typedef void (*fp_q4_matmul_kernel)
17
(
0 commit comments