Commit 857e735

slaren authored and ggerganov committed
llama : initial ggml-backend integration (ggml-org#4520)
* llama : initial ggml-backend integration
* add ggml-metal
* cuda backend can be used through ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST; access all tensor data with ggml_backend_tensor_get/set
* add ggml_backend_buffer_clear; zero-init KV cache buffer
* add ggml_backend_buffer_is_host, used to avoid copies if possible when accessing tensor data
* disable gpu backends with ngl 0
* more accurate mlock
* unmap offloaded part of the model
* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap
* update quantize and lora
* update session copy/set to use ggml-backend
* use posix_fadvise instead of posix_fadvise64
* ggml_backend_alloc_ctx_tensors_from_buft : remove old print
* llama_mmap::align_offset : use pointers instead of references for out parameters
* restore progress_callback behavior
* move final progress_callback call to load_all_data
* cuda : fix fprintf format string (minor)
* do not offload scales
* llama_mmap : avoid unmapping the same fragments again in the destructor
* remove unnecessary unmap
* metal : add default log function that prints to stderr, cleanup code

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 1b122a2 commit 857e735

11 files changed: +925 −751 lines
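The commit message's note about LLAMA_GGML_BACKEND_CUDA_TEST means all tensor data is accessed through the ggml-backend copy API rather than through raw device pointers. A minimal sketch of that access pattern, assuming `t` is a tensor already allocated in some backend buffer:

    float src[16] = {0};
    float dst[16];

    // host -> backend buffer (same call for CPU, CUDA, or Metal buffers)
    ggml_backend_tensor_set(t, src, 0, sizeof(src));
    // backend buffer -> host
    ggml_backend_tensor_get(t, dst, 0, sizeof(dst));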

Makefile

+1-1
@@ -65,7 +65,7 @@ test: $(TEST_TARGETS)
 			./$$test_target; \
 		fi; \
 		if [ $$? -ne 0 ]; then \
-			printf 'Test $$test_target FAILED!\n\n' $$test_target; \
+			printf 'Test %s FAILED!\n\n' $$test_target; \
 			failures=$$(( failures + 1 )); \
 		else \
 			printf 'Test %s passed.\n\n' $$test_target; \
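In the old line, `$$test_target` sits inside single quotes, so neither make nor the shell expands it: the literal text was printed and the trailing argument silently ignored. The fix routes the value through a `%s` conversion instead. The same bug in C terms, with `name` standing in for the test name (hypothetical, for illustration only):

    const char * name = "test-foo";
    printf("Test $test_target FAILED!\n\n", name); // no conversion: argument ignored, literal printed
    printf("Test %s FAILED!\n\n", name);           // %s consumes the argument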

ggml-alloc.c

+12-4
@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
     if (update_backend) {
         view->backend = view->view_src->backend;
     }
-    view->buffer  = view->view_src->buffer;
+    // views are initialized in the alloc buffer rather than the view_src buffer
+    view->buffer  = alloc->buffer;
     view->data    = (char *)view->view_src->data + view->view_offs;

-    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
-    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
     assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

     if (!alloc->measure) {
@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
 }

 void ggml_allocr_free(ggml_allocr_t alloc) {
+    if (alloc == NULL) {
+        return;
+    }
+
     ggml_gallocr_free(alloc->galloc);
     ggml_tallocr_free(alloc->talloc);
     free(alloc);
@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
     }

     if (nbytes == 0) {
-        fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+        // all the tensors in the context are already allocated
         return NULL;
     }

@@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
             } else {
                 ggml_backend_view_init(buffer, t);
             }
+        } else {
+            if (t->view_src != NULL) {
+                // view of a pre-allocated tensor
+                ggml_backend_view_init(buffer, t);
+            }
         }
     }

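Two behavioral notes fall out of this diff: ggml_allocr_free is now NULL-safe (like free), and a NULL return from ggml_backend_alloc_ctx_tensors_from_buft can now simply mean there was nothing left to allocate rather than an error worth logging. A minimal usage sketch, assuming a metadata-only (no_alloc) ggml context:

    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead() * 2,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true, // tensor data comes from the backend buffer
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor  * t   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);

    ggml_backend_buffer_t buf =
        ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
    // NULL here can now mean "everything was already allocated", not failure

    ggml_allocr_free(NULL); // no-op after this commit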

ggml-backend-impl.h

+12-8
@@ -20,6 +20,9 @@ extern "C" {
         size_t          (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
         size_t          (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         bool            (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // check if tensor data is in host memory
+        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+        bool            (*is_host)         (ggml_backend_buffer_type_t buft);
     };

     struct ggml_backend_buffer_type {
@@ -31,15 +34,16 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;

     struct ggml_backend_buffer_i {
-        void   (*free_buffer)(ggml_backend_buffer_t buffer);
+        void   (*free_buffer)    (ggml_backend_buffer_t buffer);
         //void (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-        void * (*get_base)   (ggml_backend_buffer_t buffer);
-        void   (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void   (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void   (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        void * (*get_base)       (ggml_backend_buffer_t buffer);
+        void   (*init_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void   (*set_tensor)     (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void   (*get_tensor)     (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
         // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
-        void   (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void   (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void   (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void   (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void   (*clear)          (ggml_backend_buffer_t buffer, uint8_t value);
     };

     struct ggml_backend_buffer {
@@ -78,7 +82,7 @@ extern "C" {
         void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
         void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);

-        void (*synchronize) (ggml_backend_t backend);
+        void (*synchronize)(ggml_backend_t backend);

         // compute graph with a plan
         ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
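For a third-party buffer type, the two new interface fields would be implemented along these lines; everything prefixed my_ below is invented for illustration, and buffer->context is assumed to hold the raw base pointer the way the CPU backend uses it (with <string.h> included for memset):

    static bool my_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
        return true; // data is directly addressable by the CPU

        GGML_UNUSED(buft);
    }

    static void my_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
        memset(buffer->context, value, buffer->size);
    }

Leaving is_host NULL is also valid: callers treat a missing callback as "not host" (see ggml_backend_buft_is_host in the next file).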

ggml-backend.c

+75-5
@@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return buft->iface.supports_backend(buft, backend);
 }

+bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+    if (buft->iface.is_host) {
+        return buft->iface.is_host(buft);
+    }
+    return false;
+}
+
 // backend buffer

 ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
 }

+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    buffer->iface.clear(buffer, value);
+}
+
+bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+}
+
 ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
     return buffer->buft;
 }
@@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {

 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
-    GGML_UNUSED(buffer);
 }

 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
     GGML_UNUSED(buffer);
 }

+static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
 static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
@@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_cpu_buffer_clear,
 };

 // for buffers from ptr, free is not called
@@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
     /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
     /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
     /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_cpu_buffer_clear,
 };

 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
@@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     GGML_UNUSED(buft);
 }

+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
 ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
         /* .iface = */ {
             /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
     };

-    return &ggml_backend_buffer_type_cpu;
+    return &ggml_backend_cpu_buffer_type;
 }

+#ifdef GGML_USE_CPU_HBM
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    //void * ptr = hbw_malloc(size);
+    void * ptr;
+    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface = */ {
+            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
+
 struct ggml_backend_cpu_context {
     int n_threads;
     void * work_data;
@@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

     cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
-    cpu_plan->cgraph = *cgraph;
+    cpu_plan->cgraph = *cgraph; // FIXME: deep copy

     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
@@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
 // utils
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
-    GGML_ASSERT(tensor->data == NULL);
+    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
     GGML_ASSERT(tensor->view_src != NULL);
     GGML_ASSERT(tensor->view_src->buffer != NULL);
     GGML_ASSERT(tensor->view_src->data != NULL);
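Taken together, ggml_backend_buffer_is_host enables the copy avoidance the commit message mentions. A sketch of the pattern, where read_tensor is a hypothetical helper and size is assumed not to exceed ggml_nbytes(t):

    static void read_tensor(const struct ggml_tensor * t, void * dst, size_t size) {
        if (ggml_backend_buffer_is_host(t->buffer)) {
            memcpy(dst, t->data, size);               // host buffer: direct read, zero copies
        } else {
            ggml_backend_tensor_get(t, dst, 0, size); // device buffer: explicit transfer
        }
    }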

ggml-backend.h

+7
@@ -21,6 +21,7 @@ extern "C" {
     GGML_API size_t ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
     GGML_API size_t ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
     GGML_API bool   ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+    GGML_API bool   ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);

     // buffer
     GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
@@ -29,6 +30,8 @@ extern "C" {
     GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void   ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool   ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
     GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);

     //
@@ -76,6 +79,10 @@ extern "C" {

     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

+#ifdef GGML_USE_CPU_HBM
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif
+
     //
     // Backend registry
     //
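The new buffer-level calls map directly onto the commit message's bullet points: clearing zero-initializes the KV cache buffer in one call regardless of where it lives. A one-line sketch, with kv_buf a hypothetical ggml_backend_buffer_t holding the cache tensors:

    ggml_backend_buffer_clear(kv_buf, 0); // zero-init, host or device alike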
