@@ -117,6 +117,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
117
117
// until thread scheduling is improved, these numbers are around the optimal (for huge batch processing increase -t manually)
118
118
if (params.n_threads > 8 ) params.n_threads = 4 ;
119
119
if (params.n_threads > 4 ) params.n_threads = 2 ;
120
+ params.seed = (int ) time (NULL ); // initiate a seed - we need one if multiple context used with similar input
121
+
120
122
121
123
122
124
for (int i = 1 ; i < argc; i++) {
@@ -338,6 +340,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
338
340
params.interactive_first = true ;
339
341
} else if (arg == " -ins" || arg == " --instruct" ) {
340
342
params.instruct = true ;
343
+ params.interactive = true ;
344
+ params.enclose_finetune = true ;
341
345
} else if (arg == " --multiline-input" ) {
342
346
params.multiline_input = true ;
343
347
} else if (arg == " --color" ) {
@@ -384,7 +388,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
384
388
}
385
389
#ifdef GGML_USE_CUBLAS
386
390
params.mb_reserve_gpu_main = std::stoi (argv[i]);
387
- ggml_cuda_set_vram_reserved (params.mb_reserve_gpu_main * 1024 *1024 );
391
+ ggml_cuda_set_vram_reserved ((( int64_t ) params.mb_reserve_gpu_main )* 1024 *1024 );
388
392
#else
389
393
fprintf (stderr, " warning: falcon.cpp was compiled without cuBLAS. VRAM not available.\n " );
390
394
#endif
@@ -537,19 +541,22 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
537
541
fprintf (stderr, " \n " );
538
542
fprintf (stderr, " options:\n " );
539
543
fprintf (stderr, " -h, --help show this help message and exit\n " );
540
- fprintf (stderr, " -i, --interactive run in interactive mode\n " );
541
- fprintf (stderr, " --interactive-first run in interactive mode and wait for input right away\n " );
542
- fprintf (stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n " );
544
+ fprintf (stderr, " -i, --interactive, -ins \n " );
545
+ fprintf (stderr, " run in interactive chat mode\n " );
546
+ fprintf (stderr, " --interactive-first wait for user input after prompt ingestion\n " );
547
+ // fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
543
548
fprintf (stderr, " -a,--alias,--finetune Set model name alias and optionally force fine-tune type (or disable it)\n " );
544
549
fprintf (stderr, " Finetune options: wizard, falcon-ins, open-assistant, alpaca, none\n " );
545
550
fprintf (stderr, " Use if finetune autodetection does not or wrongly recognizes your model or filename\n " );
546
- fprintf (stderr, " -sys, --system prefix the entire prompt with the system prompt text\n " );
551
+ fprintf (stderr, " -sys, --system <> prefix the entire prompt with the system prompt text\n " );
552
+ fprintf (stderr, " -sysraw, --system-raw treat the system prompt raw (do not add syntax)\n " );
553
+ // fprintf(stderr, " --sys_prompt_simple trust the model to follow the system prompt instead of using evaluated sampling adaption\n");
547
554
fprintf (stderr, " -enc, --enclose enclose the prompt in fine-tune optimal syntax\n " );
548
555
fprintf (stderr, " This automatically chooses the correct syntax to write around your prompt.\n " );
549
556
fprintf (stderr, " --multiline-input allows you to write or paste multiple lines without ending each in '\\ '\n " );
550
- fprintf (stderr, " -r PROMPT, --reverse-prompt PROMPT\n " );
551
- fprintf (stderr, " halt generation at PROMPT, return control in interactive mode\n " );
552
- fprintf (stderr, " (can be specified more than once for multiple prompts).\n " );
557
+ // fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
558
+ // fprintf(stderr, " halt generation at PROMPT, return control in interactive mode\n");
559
+ // fprintf(stderr, " (can be specified more than once for multiple prompts).\n");
553
560
fprintf (stderr, " --color colorise output to distinguish prompt and user input from generations\n " );
554
561
fprintf (stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n " );
555
562
fprintf (stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n " , params.n_threads );
@@ -567,7 +574,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
567
574
fprintf (stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n " );
568
575
fprintf (stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n " );
569
576
fprintf (stderr, " -f FNAME, --file FNAME\n " );
570
- fprintf (stderr, " prompt file to start generation. \n " );
577
+ fprintf (stderr, " read prompt from a file, optionally -p prompt is prefixed \n " );
571
578
fprintf (stderr, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n " , params.n_predict );
572
579
fprintf (stderr, " --top-k N top-k sampling (default: %d, 0 = disabled)\n " , params.top_k );
573
580
fprintf (stderr, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n " , (double )params.top_p );
@@ -653,8 +660,8 @@ std::vector<falcon_token> falcon_tokenize(struct falcon_context * ctx, const std
653
660
654
661
return res;
655
662
}
656
-
657
- struct falcon_context * falcon_init_from_gpt_params ( const gpt_params & params) {
663
+ struct falcon_context_params falcon_context_params_create ( const gpt_params &params)
664
+ {
658
665
auto lparams = falcon_context_default_params ();
659
666
660
667
lparams.n_ctx = params.n_ctx ;
@@ -669,6 +676,12 @@ struct falcon_context * falcon_init_from_gpt_params(const gpt_params & params) {
669
676
lparams.logits_all = params.perplexity ;
670
677
lparams.embedding = params.embedding ;
671
678
679
+ return lparams;
680
+ }
681
+
682
+ struct falcon_context * falcon_init_from_gpt_params (const gpt_params & params) {
683
+
684
+ struct falcon_context_params lparams = falcon_context_params_create (params);
672
685
falcon_context * lctx = falcon_init_from_file (params.model .c_str (), lparams);
673
686
674
687
if (lctx == NULL ) {
0 commit comments