@@ -2991,7 +2991,7 @@ static void llm_load_tensors(
                 } break;
             case LLM_ARCH_STABLELM:
                 {
-                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
                     // output
                     {
@@ -3002,12 +3002,12 @@ static void llm_load_tensors(
                             // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                             // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = llama_backend_offload;
+                            backend_norm = GGML_BACKEND_GPU;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : GGML_BACKEND_GPU;
 #endif // _WIN32
 
-                            backend_output = llama_backend_offload_split;
+                            backend_output = GGML_BACKEND_GPU_SPLIT;
                         } else {
                             backend_norm   = GGML_BACKEND_CPU;
                             backend_output = GGML_BACKEND_CPU;
@@ -3035,8 +3035,8 @@ static void llm_load_tensors(
                     /*
                     llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
                     */
-                    const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+                    const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : GGML_BACKEND_GPU; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : GGML_BACKEND_GPU_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -3051,15 +3051,15 @@ static void llm_load_tensors(
                     layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
                     layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, backend);
 
-                    layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, backend_split);
-                    layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
-                    layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+                    layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, backend_split);
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
 
                     if (backend == GGML_BACKEND_GPU) {
                         vram_weights +=
                             ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                             ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-                            ggml_nbytes(layer.ffn_gate)  + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                            ggml_nbytes(layer.w1)        + ggml_nbytes(layer.w2)       + ggml_nbytes(layer.w3);
                     }
                 }
             } break;
@@ -5943,7 +5943,7 @@ struct ggml_cgraph * build_stablelm() {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embeddings, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
@@ -6076,9 +6076,9 @@ struct ggml_cgraph * build_stablelm() {
             cb(cur, "ffn_norm", il);
 
             cur = llm_build_ffn(ctx0, cur,
-                    model.layers[il].ffn_up,   NULL,
-                    model.layers[il].ffn_gate, NULL,
-                    model.layers[il].ffn_down, NULL,
+                    model.layers[il].w3, NULL,
+                    model.layers[il].w1, NULL,
+                    model.layers[il].w2, NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
            cb(cur, "ffn_out", il);
        }
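
For reference, the member names this diff touches (tok_embeddings, w1/w2/w3, and the attention tensors) live in llama.cpp's internal model structs. Below is a minimal sketch of the members implied by the '+' side of the diff; the names suffixed with _sketch are hypothetical stand-ins, and the real llama_model and llama_layer definitions contain many more fields.

// Sketch only: member layout inferred from the names used in the diff above,
// not the full llama.cpp definitions.
struct ggml_tensor;                               // opaque tensor type from ggml.h

struct llama_layer_sketch {
    struct ggml_tensor * attn_norm;               // per-layer attention norm
    struct ggml_tensor * wq, * wk, * wv, * wo;    // attention projections

    struct ggml_tensor * ffn_norm;                // FFN norm weight
    struct ggml_tensor * ffn_norm_b;              // FFN norm bias

    struct ggml_tensor * w1;                      // FFN gate (LLM_TENSOR_FFN_GATE)
    struct ggml_tensor * w2;                      // FFN down (LLM_TENSOR_FFN_DOWN)
    struct ggml_tensor * w3;                      // FFN up   (LLM_TENSOR_FFN_UP)
};

struct llama_model_sketch {
    struct ggml_tensor * tok_embeddings;          // token embeddings, {n_embd, n_vocab}
};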