Commit e0213e0

Some random ctx/eps stuff I pulled out from my fork
1 parent 2a20f48 commit e0213e0

6 files changed, +60 -34 lines changed

Makefile (+3 -3)

@@ -30,9 +30,9 @@ endif
 # Compile flags
 #

-CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
-LDFLAGS =
+CFLAGS = -I. -O2 -DNDEBUG -std=c11 -flto -fPIC
+CXXFLAGS = -I. -I./examples -O2 -DNDEBUG -std=c++11 -flto -fPIC
+LDFLAGS = -flto -fPIC

 # OS specific
 # TODO: support Windows

convert-pth-to-ggml.py (+1 -5)

@@ -127,10 +127,6 @@ def get_n_parts(dim):
         name = k
         shape = v.shape

-        # skip layers.X.attention.inner_attention.rope.freqs
-        if name[-5:] == "freqs":
-            continue
-
         print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)

         #data = tf.train.load_variable(dir_model, name).squeeze()
@@ -169,7 +165,7 @@ def get_n_parts(dim):
             data.tofile(fout)

     # I hope this deallocates the memory ..
-    model = None
+    del model

     fout.close()
ggml.c (+5 -21)

@@ -2145,6 +2145,7 @@ struct ggml_context {
     bool mem_buffer_owned;

     int n_objects;
+    float_t eps;

     struct ggml_object * objects_begin;
     struct ggml_object * objects_end;
@@ -2159,26 +2160,6 @@ struct ggml_context_container {
     struct ggml_context context;
 };

-//
-// compute types
-//
-
-enum ggml_task_type {
-    GGML_TASK_INIT = 0,
-    GGML_TASK_COMPUTE,
-    GGML_TASK_FINALIZE,
-};
-
-struct ggml_compute_params {
-    enum ggml_task_type type;
-
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-};
-
 //
 // ggml state
 //
@@ -2422,6 +2403,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
         /*.n_objects =*/ 0,
+        /*.eps =*/ params.eps,
         /*.objects_begin =*/ NULL,
         /*.objects_end =*/ NULL,
         /*.scratch =*/ { 0, 0, NULL, },
@@ -5335,7 +5317,8 @@ static void ggml_compute_forward_norm_f32(
     const size_t nb2 = dst->nb[2];
     const size_t nb3 = dst->nb[3];

-    const ggml_float eps = 1e-5f; // TODO: make this a parameter
+    // if params->eps is zero, use default of 1e-6 otherwise use params->eps
+    const float eps = params->eps == 0.0f ? 1e-6f : params->eps;

     // TODO: optimize
     for (int i03 = 0; i03 < ne03; i03++) {
@@ -9378,6 +9361,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 /*.type =*/ GGML_TASK_INIT,
                 /*.ith =*/ 0,
                 /*.nth =*/ node->n_tasks,
+                /*.eps =*/ ctx->eps,
                 /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
                 /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
             };
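For reference, the eps threaded through here feeds the usual layer-norm recipe: each row is centered, then scaled by 1/sqrt(variance + eps). The helper below is an illustrative sketch of that computation, not the actual ggml_compute_forward_norm_f32 body (the function name and looping are simplified assumptions):

#include <math.h>

// Illustrative per-row normalization: y[i] = (x[i] - mean) / sqrt(var + eps).
// An eps of 0.0f passed through ggml_init_params falls back to 1e-6 in the kernel above.
static void norm_row_sketch(const float * x, float * y, int n, float eps) {
    double mean = 0.0;
    for (int i = 0; i < n; i++) {
        mean += x[i];
    }
    mean /= n;

    double sum2 = 0.0;
    for (int i = 0; i < n; i++) {
        const double d = x[i] - mean;
        y[i] = (float) d;   // store the centered value
        sum2 += d*d;
    }

    const float scale = 1.0f/sqrtf((float)(sum2/n) + eps);
    for (int i = 0; i < n; i++) {
        y[i] *= scale;
    }
}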

ggml.h (+33 -1)

@@ -314,9 +314,36 @@ struct ggml_scratch {
 struct ggml_init_params {
     // memory pool
     size_t mem_size; // bytes
+
+    // eps
+    float eps;
+
+    // work buffer
     void * mem_buffer; // if NULL, memory will be allocated internally
 };

+//
+// compute types
+//
+
+enum ggml_task_type {
+    GGML_TASK_INIT = 0,
+    GGML_TASK_COMPUTE,
+    GGML_TASK_FINALIZE,
+};
+
+struct ggml_compute_params {
+    enum ggml_task_type type;
+
+    int ith, nth;
+
+    float eps;
+
+    // work buffer for all threads
+    size_t wsize;
+    void * wdata;
+};
+
 void ggml_time_init(void); // call this once at the beginning of the program
 int64_t ggml_time_ms(void);
 int64_t ggml_time_us(void);
@@ -477,7 +504,6 @@ struct ggml_tensor * ggml_silu(
         struct ggml_tensor * a);

 // normalize along rows
-// TODO: eps is hardcoded to 1e-5 for now
 struct ggml_tensor * ggml_norm(
         struct ggml_context * ctx,
         struct ggml_tensor * a);
@@ -585,6 +611,12 @@ struct ggml_tensor * ggml_rope(
         int n_past,
         int n_dims,
         int mode);
+
+static void ggml_compute_forward_rope(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst);

 // padding = 1
 // TODO: we don't support extra parameters for now
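Because eps now sits between mem_size and mem_buffer, any caller that fills ggml_init_params positionally (as quantize.cpp does further down) has to supply it explicitly; designated initializers make the intent clearer. A minimal caller-side sketch, assuming this field order (the make_ctx helper is hypothetical):

#include "ggml.h"

struct ggml_context * make_ctx(size_t mem_size) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ mem_size,
        /*.eps        =*/ 1e-6f,  // 0.0f falls back to the 1e-6 default in the norm kernel
        /*.mem_buffer =*/ NULL,   // NULL lets ggml allocate the pool internally
    };
    return ggml_init(params);
}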

main.cpp (+17 -3)

@@ -62,6 +62,7 @@ struct llama_layer {
     struct ggml_tensor * w1;
     struct ggml_tensor * w2;
     struct ggml_tensor * w3;
+
 };

 struct llama_model {
@@ -72,6 +73,9 @@ struct llama_model {
     struct ggml_tensor * norm;
     struct ggml_tensor * output;

+    // rope frequencies
+    struct ggml_tensor * rope_freqs;
+
     std::vector<llama_layer> layers;

     // key + value memory
@@ -215,7 +219,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
         ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

-        ctx_size += (5 + 10*n_layer)*256; // object overhead
+        ctx_size += (5 + 10*n_layer)*hparams.n_ctx; // object overhead

         printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
@@ -224,6 +228,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
     {
         struct ggml_init_params params = {
             /*.mem_size =*/ ctx_size,
+            /*.eps =*/ 1e-6, // change to 1e-5 for 7/13B models
             /*.mem_buffer =*/ NULL,
         };

@@ -286,6 +291,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
             model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
         }
+
+        model.rope_freqs = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
+        model.tensors["rope.freqs"] = model.rope_freqs;
+
     }

     // key + value memory
@@ -543,7 +552,8 @@

     const int d_key = n_embd/n_head;

-    static size_t buf_size = 512u*1024*1024;
+    // allocate memory
+    static size_t buf_size = n_ctx*1024*1024;
     static void * buf = malloc(buf_size);

     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
@@ -561,6 +571,7 @@

     struct ggml_init_params params = {
         /*.mem_size =*/ buf_size,
+        /*.eps =*/ 1e-6, // change to 1e-5 for 7/13B models
         /*.mem_buffer =*/ buf,
     };

@@ -603,6 +614,7 @@
         ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
     }

+    // Apply rotary embeddings
     // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
     struct ggml_tensor * Q =
         ggml_permute(ctx0,
@@ -633,6 +645,7 @@
                 ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
                 );

+    // Scoring
     // KQ_masked = mask_past(KQ_scaled)
     struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

@@ -658,6 +671,7 @@
                 KQV_merged,
                 ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

+    // attention.wo(output)
     // projection (no bias)
     cur = ggml_mul_mat(ctx0,
             model.layers[il].wo,
@@ -795,7 +809,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();

-        if (!llama_model_load(params.model, model, vocab, 512)) { // TODO: set context from user input ??
+        if (!llama_model_load(params.model, model, vocab, 1024)) { // TODO: set context from user input ??
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
             return 1;
         }
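The hard-coded length 64 in model.rope_freqs matches the 7B checkpoint: head_dim = n_embd/n_head = 4096/32 = 128, and RoPE keeps one inverse frequency per pair of channels. Below is a sketch of the values that tensor would hold, assuming the standard LLaMA parameterization (theta = 10000); the helper name is illustrative, not code from this commit:

#include <math.h>

// Fills freqs[0 .. head_dim/2 - 1] with RoPE inverse frequencies:
// freqs[i] = 1 / theta^(2i / head_dim). For head_dim = 128 that is 64 values.
static void fill_rope_freqs_sketch(float * freqs, int head_dim, float theta) {
    for (int i = 0; i < head_dim/2; i++) {
        freqs[i] = 1.0f / powf(theta, (2.0f*i) / head_dim);
    }
}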

quantize.cpp (+1 -1)

@@ -299,7 +299,7 @@ int main(int argc, char ** argv) {

     // needed to initialize f16 tables
     {
-        struct ggml_init_params params = { 0, NULL };
+        struct ggml_init_params params = { 0, 1e-6, NULL };
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
