@@ -1141,7 +1141,6 @@ struct llama_layer {
struct ggml_tensor * wk;
struct ggml_tensor * wv;
struct ggml_tensor * wo;
- struct ggml_tensor * wo_b; //
struct ggml_tensor * wqkv;

// attention bias
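The deleted wo_b member is superseded by a bo field under the existing "// attention bias" group, matching the b* naming used for the other bias tensors. A minimal sketch of the assumed layout after this change; only the members visible in the hunks of this diff are shown, and their exact placement in the real struct is an assumption:

// Sketch only: reflects the fields visible in this diff, not the full struct llama_layer.
struct ggml_tensor;                    // opaque ggml tensor handle

struct llama_layer_sketch {
    // attention weights
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;
    struct ggml_tensor * wqkv;

    // attention bias
    struct ggml_tensor * bqkv;
    struct ggml_tensor * bo;           // previously wo_b
};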
@@ -3056,7 +3055,7 @@ static void llm_load_tensors(
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
- layer.wo_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
@@ -3068,7 +3067,7 @@ static void llm_load_tensors(
vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
- ggml_nbytes(layer.wo) + ggml_nbytes(layer.wo_b) +
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) +
ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3);
@@ -3205,7 +3204,7 @@ static void llm_load_tensors(
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
- layer.wo_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+ layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
// layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
// layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
@@ -3218,7 +3217,7 @@ static void llm_load_tensors(
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
ggml_nbytes(layer.attn_norm_2) + ggml_nbytes(layer.attn_norm_2_b) +
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
- ggml_nbytes(layer.wo) + ggml_nbytes(layer.wo_b) +
+ ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) +
ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3);
}
@@ -6348,7 +6347,7 @@ static struct ggml_cgraph * llm_build_gptneox(
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
offload_func(cur);

- cur = ggml_add(ctx0, cur, model.layers[il].wo_b);
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
offload_func(cur);

ggml_set_name(cur, "result_wo");
@@ -6995,7 +6994,7 @@ static struct ggml_cgraph * llm_build_gpt2(
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
offload_func(cur);

- cur = ggml_add(ctx0, cur, model.layers[il].wo_b);
+ cur = ggml_add(ctx0, cur, model.layers[il].bo);
offload_func(cur);

ggml_set_name(cur, "result_wo");
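For context, both graph-builder hunks (GPT-NeoX and GPT-2) touch the same two-step pattern: the attention output projection through wo, followed by adding the bias that is now named bo. A minimal self-contained sketch of that pattern follows, using only the ggml calls already visible in the diff; the helper name and signature are illustrative and not part of llama.cpp:

#include "ggml.h"

// Illustrative helper, not actual llama.cpp code: applies the attention output
// projection and its bias in the same way as the hunks above.
static struct ggml_tensor * attn_out_proj(
        struct ggml_context * ctx0,
        struct ggml_tensor  * cur,   // attention result, [n_embd, n_tokens]
        struct ggml_tensor  * wo,    // output projection weight, [n_embd, n_embd]
        struct ggml_tensor  * bo) {  // output projection bias, [n_embd] (was wo_b)
    cur = ggml_mul_mat(ctx0, wo, cur);   // project back to n_embd
    cur = ggml_add(ctx0, cur, bo);       // bias is broadcast across tokens
    ggml_set_name(cur, "result_wo");
    return cur;
}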