Skip to content

Commit 20b2d6c

Browse files
committed
Improve cpu prompt eval speed
This change upstreams llamafile's CPU matrix multiplication kernels, which improve image and prompt evaluation speed. For starters, Q4_0 and Q8_0 weights should go ~40% faster on CPU. The biggest benefits are with data types like f16 / f32, which process prompts 2x faster, thus making them faster than quantized data types for prompt evals. This change also introduces bona fide AVX512 support, since tinyBLAS is able to exploit the larger register file. For example, on my CPU, llama.cpp llava-cli processes an image prompt at 305 tokens/second using the Q4_K and Q4_0 types, which has always been faster than using f16 LLaVA weights, which at HEAD go 188 tokens/second. With this change, f16 LLaVA performance leapfrogs to 464 tokens/second. On Intel Core i9-14900K this change improves F16 prompt perf by 5x. For example, using llama.cpp at HEAD with Mistral 7b f16 to process a 215 token prompt will go 13 tok/sec. This change has fixes that make it go 52 tok/sec. That's mostly thanks to my vectorized outer product kernels, but also because I added support for correctly counting the number of cores on Alder Lake, so the default thread count discounts Intel's new efficiency cores. Right now, core counting is only supported on Linux. This work was sponsored by Mozilla, which has given permission to change the license of this code from Apache 2.0 to MIT. To read more about what's improved, and how it works, see: https://justine.lol/matmul/
1 parent 37e7854 commit 20b2d6c

File tree

10 files changed

+1319
-9
lines changed

10 files changed

+1319
-9
lines changed

CMakeLists.txt

+2
Original file line numberDiff line numberDiff line change
@@ -1151,6 +1151,8 @@ add_library(ggml OBJECT
11511151
ggml-backend.h
11521152
ggml-quants.c
11531153
ggml-quants.h
1154+
sgemm.cpp
1155+
sgemm.h
11541156
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
11551157
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
11561158
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}

Makefile

+4-1
Original file line numberDiff line numberDiff line change
@@ -676,13 +676,16 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
676676
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
677677
$(CC) $(CFLAGS) -c $< -o $@
678678

679+
sgemm.o: sgemm.cpp sgemm.h ggml.h
680+
$(CXX) $(CXXFLAGS) -c $< -o $@
681+
679682
unicode.o: unicode.cpp unicode.h
680683
$(CXX) $(CXXFLAGS) -c $< -o $@
681684

682685
unicode-data.o: unicode-data.cpp unicode-data.h
683686
$(CXX) $(CXXFLAGS) -c $< -o $@
684687

685-
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
688+
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o sgemm.o
686689

687690
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
688691
$(CXX) $(CXXFLAGS) -c $< -o $@

Package.swift

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ let package = Package(
3030
],
3131
sources: [
3232
"ggml.c",
33+
"sgemm.cpp",
3334
"llama.cpp",
3435
"unicode.cpp",
3536
"unicode-data.cpp",

build.zig

+8-7
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ pub fn build(b: *std.build.Builder) !void {
112112
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
113113

114114
const ggml = make.obj("ggml", "ggml.c");
115+
const sgemm = make.obj("sgemm", "sgemm.cpp");
115116
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
116117
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
117118
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
@@ -128,14 +129,14 @@ pub fn build(b: *std.build.Builder) !void {
128129
const clip = make.obj("clip", "examples/llava/clip.cpp");
129130
const llava = make.obj("llava", "examples/llava/llava.cpp");
130131

131-
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
132-
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
133-
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
134-
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
135-
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
136-
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
132+
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
133+
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
134+
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
135+
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
136+
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
137+
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
137138

138-
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
139+
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
139140
if (server.target.isWindows()) {
140141
server.linkSystemLibrary("ws2_32");
141142
}

common/common.cpp

+69
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,75 @@ int32_t get_num_physical_cores() {
104104
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
105105
}
106106

107+
#if defined(__x86_64__) && defined(__linux__)
108+
#include <pthread.h>
109+
110+
// Executes the x86-64 CPUID instruction for the given leaf/subleaf and stores
// the four result registers through the output pointers.
//
// RBX is saved in RSI before the instruction and swapped back afterwards, so
// the EBX result is delivered via RSI and RBX itself is left untouched.
//
// The asm is marked volatile on purpose: CPUID results can depend on which
// core executes the instruction (callers pin the thread to different CPUs
// between calls), so the compiler must not merge, cache or hoist two calls
// that happen to have identical leaf/subleaf inputs.
static void cpuid(unsigned leaf, unsigned subleaf,
                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
    __asm__ __volatile__("movq\t%%rbx,%%rsi\n\t"
                         "cpuid\n\t"
                         "xchgq\t%%rbx,%%rsi"
                         : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
                         : "0"(leaf), "2"(subleaf));
}
118+
119+
static int pin_cpu(int cpu) {
120+
cpu_set_t mask;
121+
CPU_ZERO(&mask);
122+
CPU_SET(cpu, &mask);
123+
return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
124+
}
125+
126+
static bool is_hybrid_cpu(void) {
127+
unsigned eax, ebx, ecx, edx;
128+
cpuid(7, 0, &eax, &ebx, &ecx, &edx);
129+
return !!(edx & (1u << 15));
130+
}
131+
132+
static bool is_running_on_efficiency_core(void) {
133+
unsigned eax, ebx, ecx, edx;
134+
cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
135+
int intel_atom = 0x20;
136+
int core_type = (eax & 0xff000000u) >> 24;
137+
return core_type == intel_atom;
138+
}
139+
140+
static int count_math_cpus(int cpu_count) {
141+
int result = 0;
142+
for (int cpu = 0; cpu < cpu_count; ++cpu) {
143+
if (pin_cpu(cpu))
144+
return -1;
145+
if (is_running_on_efficiency_core())
146+
continue; // efficiency cores harm lockstep threading
147+
++cpu; // hyperthreading isn't useful for linear algebra
148+
++result;
149+
}
150+
return result;
151+
}
152+
153+
#endif // __x86_64__ && __linux__
154+
155+
/**
156+
* Returns number of CPUs on system that are useful for math.
157+
*/
158+
int get_math_cpu_count() {
159+
#if defined(__x86_64__) && defined(__linux__)
160+
int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
161+
if (cpu_count < 1)
162+
return get_num_physical_cores();
163+
if (is_hybrid_cpu()) {
164+
cpu_set_t affinity;
165+
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
166+
int result = count_math_cpus(cpu_count);
167+
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
168+
if (result > 0)
169+
return result;
170+
}
171+
}
172+
#endif
173+
return get_num_physical_cores();
174+
}
175+
107176
void process_escapes(std::string & input) {
108177
std::size_t input_len = input.length();
109178
std::size_t output_idx = 0;

common/common.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ extern char const *LLAMA_BUILD_TARGET;
3939

4040
struct llama_control_vector_load_info;
4141

42+
int get_math_cpu_count();
4243
int32_t get_num_physical_cores();
4344

4445
//
@@ -48,7 +49,7 @@ int32_t get_num_physical_cores();
4849
struct gpt_params {
4950
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
5051

51-
int32_t n_threads = get_num_physical_cores();
52+
int32_t n_threads = get_math_cpu_count();
5253
int32_t n_threads_draft = -1;
5354
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
5455
int32_t n_threads_batch_draft = -1;

ggml.c

+44
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "ggml-impl.h"
55
#include "ggml-quants.h"
66
#include "ggml.h"
7+
#include "sgemm.h"
78

89
#if defined(_MSC_VER) || defined(__MINGW32__)
910
#include <malloc.h> // using malloc.h with MSC/MINGW
@@ -10817,6 +10818,27 @@ static void ggml_compute_forward_mul_mat(
1081710818
}
1081810819
#endif
1081910820

10821+
if (src1_cont) {
10822+
for (int64_t j = 0; j < ne13; j++)
10823+
for (int64_t i = 0; i < ne12; i++)
10824+
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
10825+
(const char *)src0->data + i/r2*nb02 + j/r3*nb03,
10826+
nb01/ggml_type_size(src0->type),
10827+
(const char *)src1->data + i*nb12 + j*nb13,
10828+
nb11/ggml_type_size(src1->type),
10829+
(char *)dst->data + i*nb2 + j*nb3,
10830+
nb1/ggml_type_size(dst->type),
10831+
ith, nth,
10832+
params->type,
10833+
src0->type,
10834+
src1->type,
10835+
dst->type))
10836+
goto UseGgmlGemm1;
10837+
return;
10838+
}
10839+
UseGgmlGemm1:
10840+
(void)0;
10841+
1082010842
if (params->type == GGML_TASK_TYPE_INIT) {
1082110843
if (ith != 0) {
1082210844
return;
@@ -10848,6 +10870,28 @@ static void ggml_compute_forward_mul_mat(
1084810870
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
1084910871
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
1085010872

10873+
if (src1_cont) {
10874+
for (int64_t j = 0; j < ne13; j++)
10875+
for (int64_t i = 0; i < ne12; i++)
10876+
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
10877+
(const char *)src0->data + i/r2*nb02 + j/r3*nb03,
10878+
nb01/ggml_type_size(src0->type),
10879+
(const char *)wdata + (nb12/ggml_type_size(src1->type)*ggml_type_size(vec_dot_type)*i +
10880+
nb13/ggml_type_size(src1->type)*ggml_type_size(vec_dot_type)*j),
10881+
row_size/ggml_type_size(vec_dot_type),
10882+
(char *)dst->data + i*nb2 + j*nb3,
10883+
nb1/ggml_type_size(dst->type),
10884+
ith, nth,
10885+
params->type,
10886+
src0->type,
10887+
vec_dot_type,
10888+
dst->type))
10889+
goto UseGgmlGemm2;
10890+
return;
10891+
}
10892+
UseGgmlGemm2:
10893+
(void)0;
10894+
1085110895
const int64_t nr0 = ne01; // src0 rows
1085210896
const int64_t nr1 = ne1*ne12*ne13; // src1 rows
1085310897

scripts/sync-ggml-am.sh

+4
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
113113
# src/ggml-sycl.h -> ggml-sycl.h
114114
# src/ggml-vulkan.cpp -> ggml-vulkan.cpp
115115
# src/ggml-vulkan.h -> ggml-vulkan.h
116+
# src/sgemm.cpp -> sgemm.cpp
117+
# src/sgemm.h -> sgemm.h
116118
# include/ggml/ggml.h -> ggml.h
117119
# include/ggml/ggml-alloc.h -> ggml-alloc.h
118120
# include/ggml/ggml-backend.h -> ggml-backend.h
@@ -147,6 +149,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
147149
-e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
148150
-e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
149151
-e 's/src\/ggml-vulkan\.h/ggml-vulkan.h/g' \
152+
-e 's/src\/sgemm\.cpp/sgemm.cpp/g' \
153+
-e 's/src\/sgemm\.h/sgemm.h/g' \
150154
-e 's/include\/ggml\/ggml\.h/ggml.h/g' \
151155
-e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
152156
-e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \

0 commit comments

Comments
 (0)