Commit 71e3718

llama : refactor graph build code (#3837)
* llama : factor out ggml-alloc from graph build functions (ggml-ci)
* metal : disable kernel load log
* llama : factor out tensor offloading outside the build call (wip, ggml-ci)
* llama : offload rest of the models (ggml-ci)
* llama : update offload log messages to print node index
* llama : comments
* llama : support offloading result_norm + comments
* llama : factor graph input into a function
* llama : do tensor offload only with CUDA
* llama : fix res_norm offloading
* llama : try to optimize offloading code
* llama : fix non-CUDA build
* llama : try to fix build
* llama : move refact in correct place + optimize graph input
* llama : refactor tensor offloading as callback (see the sketch below)
* llama : add layer index to all tensor names
* llama : add functional header
* llama : comment (ggml-ci)
* llama : remove obsolete map for layer counting
* llama : add llm_build helper functions (#3848)
* llama : add llm_build_norm helper function (ggml-ci)
* llama : add llm_build_ffn helper function (#3849, ggml-ci)
* llama : add llm_build_k_shift helper (ggml-ci)
* llama : fix offloading after recent changes
* llama : add llm_build_kv_store helper (ggml-ci)
* llama : remove obsolete offload names
* llama : fix llm_build_k_shift to use n_head_kv instead of n_head
* llama : simplify falcon Q, K, V computation
* llama : remove obsolete comments in build graphs
* llama : add llm_build_kqv helper (ggml-ci)
* llama : minor
* llama : add LLAMA_OFFLOAD_DEBUG + fix starcoder offloading
* llama : fix input allocation logic
* llama : update offload functions for KQ tensors
* llama : normalize tensor names (ggml-ci)
* llama : enable warning about not offloaded tensors
* llama : remove extra ; + deduplicate gate_b logic
* llama : add llm_build_inp_embd helper
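The common thread in these changes is that per-architecture graph build functions no longer hard-code tensor names or offload decisions; instead they call shared llm_build_* helpers and report every created node through a caller-supplied callback. The sketch below illustrates that pattern only: the names llm_build_cb, llm_norm_type and llm_build_norm come from the commit message above, but the parameter lists, enum values and bodies are assumptions for illustration, not the actual llama.cpp source.

// Illustrative sketch only -- not copied from llama.cpp. Names taken from the
// commit message; parameter lists and enum values are assumptions.
#include <functional>

#include "ggml.h"

// Callback invoked for every tensor created during graph build. The caller
// uses it to name nodes (with the layer index appended) and to decide whether
// a given node should be offloaded to the GPU backend.
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int il)>;

enum llm_norm_type {
    LLM_NORM,      // classic LayerNorm
    LLM_NORM_RMS,  // RMSNorm
};

// Shared normalization builder: each architecture calls a helper like this
// instead of repeating the norm -> scale -> (optional) bias sequence in its
// own graph code.
static struct ggml_tensor * llm_build_norm(
         struct ggml_context * ctx,
          struct ggml_tensor * cur,
          struct ggml_tensor * norm_w,   // scale weights
          struct ggml_tensor * norm_b,   // optional bias, may be NULL
               llm_norm_type   type,
                       float   eps,
          const llm_build_cb & cb,
                         int   il) {     // layer index, used in tensor names
    switch (type) {
        case LLM_NORM:     cur = ggml_norm    (ctx, cur, eps); break;
        case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, eps); break;
    }
    cb(cur, "norm", il);

    cur = ggml_mul(ctx, cur, norm_w);
    cb(cur, "norm_w", il);

    if (norm_b != NULL) {
        cur = ggml_add(ctx, cur, norm_b);
        cb(cur, "norm_wb", il);
    }

    return cur;
}

Because naming and backend placement are centralized in the callback, the build functions themselves stay independent of the offloading logic, which is what makes the "refactor tensor offloading as callback" and "enable warning about not offloaded tensors" items possible.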
1 parent 238657d commit 71e3718

File tree

3 files changed: +1520, -2234 lines


Diff for: ggml-metal.m

+7-4
@@ -238,14 +238,17 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
     // load kernels
     {
         NSError * error = nil;
-#define GGML_METAL_ADD_KERNEL(name) \
-        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+
+        /*
         GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
                 (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
                 (int) ctx->pipeline_##name.threadExecutionWidth); \
+        */
+#define GGML_METAL_ADD_KERNEL(name) \
+        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
+        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
         if (error) { \
-            GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+                GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
             return NULL; \
         }

Diff for: ggml.h

+1-1
@@ -709,7 +709,7 @@ extern "C" {
     // Context tensor enumeration and lookup
     GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
     GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+    GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);

     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
