
Commit 1d6e234

cmp-nct and John authored
Split loader 1 (ggml-org#53)
* Split of model loader and context, mostly similar to how llama.cpp already did it:
  falcon_context now contains the falcon_model and a pointer to the vocab; the model owns the vocab now.
  This allows conveniently loading more than one context, as well as loading multiple models at once.
  A ton of changes - if anything acts funky, please report.

* System prompt improvements
  Structural improvements on libfalcon

* updates

* Further adapted system prompts for all fine tunes

---------

Co-authored-by: John <[email protected]>
1 parent c12b2d6 commit 1d6e234
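The ownership layout the commit message describes (the model owns the vocab, a context holds the model plus a convenience pointer to that vocab) can be pictured with the minimal sketch below. The member types and names here are illustrative assumptions, not the actual libfalcon definitions; the point is only that several contexts can share one model, and several models can coexist in one process.

// Illustrative sketch only - not the libfalcon source.
#include <memory>
#include <string>
#include <vector>

struct falcon_vocab {
    std::vector<std::string> id_to_token;   // placeholder token table
};

struct falcon_model {
    falcon_vocab vocab;                     // the vocab now lives in the model
    // ... weights / tensors would live here ...
};

struct falcon_context {
    std::shared_ptr<falcon_model> model;    // several contexts can share one model
    const falcon_vocab *vocab;              // convenience pointer into model->vocab
    // ... per-context state: KV cache, logits, sampling/RNG state ...
};

int main() {
    auto model = std::make_shared<falcon_model>();
    falcon_context ctx_a{model, &model->vocab};  // two contexts, one loaded model
    falcon_context ctx_b{model, &model->vocab};
    (void)ctx_a; (void)ctx_b;
    return 0;
}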

File tree

9 files changed: +557 −296 lines changed


.github/workflows/release.yml

+6-6
@@ -243,12 +243,12 @@ jobs:
       & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
       .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
-    - name: Test
-      id: cmake_test
-      if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}  # Test AVX-512 only when possible
-      run: |
-        cd build
-        ctest -C Release --verbose
+    # - name: Test
+    #   id: cmake_test
+    #   if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}  # Test AVX-512 only when possible
+    #   run: |
+    #     cd build
+    #     ctest -C Release --verbose

     - name: Get commit hash
       id: commit

examples/falcon/falcon_main.cpp

+245-86
Large diffs are not rendered by default.

examples/falcon_common.cpp

+24-11
@@ -117,6 +117,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     // until thread scheduling is improved, these numbers are around the optimal (for huge batch processing increase -t manually)
     if (params.n_threads > 8) params.n_threads = 4;
     if (params.n_threads > 4) params.n_threads = 2;
+    params.seed = (int) time(NULL); // initiate a seed - we need one if multiple context used with similar input
+


     for (int i = 1; i < argc; i++) {
@@ -338,6 +340,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.interactive_first = true;
         } else if (arg == "-ins" || arg == "--instruct") {
             params.instruct = true;
+            params.interactive = true;
+            params.enclose_finetune = true;
         } else if (arg == "--multiline-input") {
             params.multiline_input = true;
         } else if (arg == "--color") {
@@ -384,7 +388,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         }
 #ifdef GGML_USE_CUBLAS
         params.mb_reserve_gpu_main = std::stoi(argv[i]);
-        ggml_cuda_set_vram_reserved(params.mb_reserve_gpu_main * 1024*1024);
+        ggml_cuda_set_vram_reserved(((int64_t)params.mb_reserve_gpu_main)*1024*1024);
 #else
         fprintf(stderr, "warning: falcon.cpp was compiled without cuBLAS. VRAM not available.\n");
 #endif
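The cast in this hunk matters because the megabyte count is parsed into a 32-bit int: with 32-bit arithmetic, any reservation above 2047 MB overflows before the byte count ever reaches ggml_cuda_set_vram_reserved(). The small standalone program below illustrates the effect; it is not part of the commit, and the variable names are made up.

// Illustrative only - why widening to int64_t before multiplying is needed.
#include <cstdint>
#include <cstdio>

int main() {
    int mb_reserve = 4096;                                   // e.g. reserve 4 GiB
    // int overflow_bytes = mb_reserve * 1024 * 1024;        // int math would overflow (undefined behaviour)
    int64_t bytes = ((int64_t)mb_reserve) * 1024 * 1024;     // widen first, then multiply
    std::printf("%lld bytes reserved\n", (long long)bytes);  // prints 4294967296
    return 0;
}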
@@ -537,19 +541,22 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, " -h, --help show this help message and exit\n");
-    fprintf(stderr, " -i, --interactive run in interactive mode\n");
-    fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n");
-    fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
+    fprintf(stderr, " -i, --interactive, -ins \n");
+    fprintf(stderr, " run in interactive chat mode\n");
+    fprintf(stderr, " --interactive-first wait for user input after prompt ingestion\n");
+    // fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
     fprintf(stderr, " -a,--alias,--finetune Set model name alias and optionally force fine-tune type (or disable it)\n");
     fprintf(stderr, " Finetune options: wizard, falcon-ins, open-assistant, alpaca, none\n");
     fprintf(stderr, " Use if finetune autodetection does not or wrongly recognizes your model or filename\n");
-    fprintf(stderr, " -sys, --system prefix the entire prompt with the system prompt text\n");
+    fprintf(stderr, " -sys, --system <> prefix the entire prompt with the system prompt text\n");
+    fprintf(stderr, " -sysraw, --system-raw treat the system prompt raw (do not add syntax)\n");
+    // fprintf(stderr, " --sys_prompt_simple trust the model to follow the system prompt instead of using evaluated sampling adaption\n");
     fprintf(stderr, " -enc, --enclose enclose the prompt in fine-tune optimal syntax\n");
     fprintf(stderr, " This automatically chooses the correct syntax to write around your prompt.\n");
     fprintf(stderr, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
-    fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stderr, " halt generation at PROMPT, return control in interactive mode\n");
-    fprintf(stderr, " (can be specified more than once for multiple prompts).\n");
+    // fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
+    // fprintf(stderr, " halt generation at PROMPT, return control in interactive mode\n");
+    // fprintf(stderr, " (can be specified more than once for multiple prompts).\n");
     fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
     fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
     fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
@@ -567,7 +574,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
     fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
     fprintf(stderr, " -f FNAME, --file FNAME\n");
-    fprintf(stderr, " prompt file to start generation.\n");
+    fprintf(stderr, " read prompt from a file, optionally -p prompt is prefixed\n");
     fprintf(stderr, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
     fprintf(stderr, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
     fprintf(stderr, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
@@ -653,8 +660,8 @@ std::vector<falcon_token> falcon_tokenize(struct falcon_context * ctx, const std

     return res;
 }
-
-struct falcon_context * falcon_init_from_gpt_params(const gpt_params & params) {
+struct falcon_context_params falcon_context_params_create(const gpt_params &params)
+{
     auto lparams = falcon_context_default_params();

     lparams.n_ctx = params.n_ctx;
@@ -669,6 +676,12 @@ struct falcon_context * falcon_init_from_gpt_params(const gpt_params & params) {
     lparams.logits_all = params.perplexity;
     lparams.embedding = params.embedding;

+    return lparams;
+}
+
+struct falcon_context * falcon_init_from_gpt_params(const gpt_params & params) {
+
+    struct falcon_context_params lparams = falcon_context_params_create(params);
     falcon_context * lctx = falcon_init_from_file(params.model.c_str(), lparams);

     if (lctx == NULL) {
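One consequence of factoring falcon_context_params_create() out of falcon_init_from_gpt_params() is that callers can build the context parameters once and open several contexts against the same model file. The sketch below shows that pattern; it is not part of the commit, and the include path, helper name, and error handling are assumptions.

// Hedged usage sketch - not from the commit.
#include <cstdio>
#include "falcon_common.h"

static bool open_two_contexts(const gpt_params &params,
                              falcon_context *&ctx_a, falcon_context *&ctx_b) {
    struct falcon_context_params lparams = falcon_context_params_create(params);

    ctx_a = falcon_init_from_file(params.model.c_str(), lparams);
    ctx_b = falcon_init_from_file(params.model.c_str(), lparams);
    if (ctx_a == NULL || ctx_b == NULL) {
        fprintf(stderr, "error: failed to load model '%s'\n", params.model.c_str());
        return false;
    }
    return true; // each context keeps its own KV cache and sampling state
}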

examples/falcon_common.h

+8-4
@@ -39,9 +39,9 @@ struct gpt_params {
     // sampling parameters
     std::unordered_map<falcon_token, float> logit_bias; // logit bias for specific tokens
     int32_t top_k = 40; // <= 0 to use vocab size
-    float top_p = 0.95f; // 1.0 = disabled
-    float tfs_z = 1.00f; // 1.0 = disabled
-    float typical_p = 1.00f; // 1.0 = disabled
+    float top_p = 0.95f; // 1.0 = disabled
+    float tfs_z = 1.00f; // 1.0 = disabled (temperature, frequency, and presence scaling)
+    float typical_p = 1.00f; // 1.0 = disabled
     float temp = 0.80f; // 1.0 = disabled
     float repeat_penalty = 1.10f; // 1.0 = disabled
     int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
@@ -50,12 +50,14 @@ struct gpt_params {
     int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate
+    float system_prompt_intensity = 0.50f; // -1.0 to +1.0 the intensity of the system prompt (not with simple mode)

     std::string model = "models/7B/ggml-model.bin"; // model path
     std::string model_alias = "unknown"; // model alias
     t_finetune_type finetune_type = FINETUNE_UNSPECIFIED; // finetune type
     std::string prompt = "";
     std::string system_prompt = ""; // optional system prompt for complex finetunes
+    std::string system_baseline_prompt = ""; // not in use
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with
     std::string input_suffix = ""; // string to suffix user inputs with
@@ -67,6 +69,8 @@ struct gpt_params {
     std::string stopwords = ""; // comma separated list of stopwords (<|endoftext|> is handled by --ignore-eos)
     bool enclose_finetune = false; // enclose prompt with correct tokens for finetuned model
     bool sys_prompt_is_raw = false; // The given system prompt will be used without adaptation
+    bool sys_prompt_simple = true; // System prompt is a simple prompt prefix kept in top context instead of the deep eval method (not ready yet)
+
     bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
@@ -104,7 +108,7 @@ std::vector<falcon_token> falcon_tokenize(struct falcon_context * ctx, const std
 //
 // Model utils
 //
-
+struct falcon_context_params falcon_context_params_create(const gpt_params &params);
 struct falcon_context * falcon_init_from_gpt_params(const gpt_params & params);

 //
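For orientation, the new system-prompt related fields added to gpt_params above might be set along these lines. The values and the helper function are illustrative assumptions, not code from the commit; only the field names come from the hunk above.

// Hedged usage sketch of the new gpt_params fields - not from the commit.
#include "falcon_common.h"

static void configure_system_prompt(gpt_params &params) {
    params.system_prompt           = "You are a concise assistant.";
    params.sys_prompt_is_raw       = false; // let the finetune syntax be added around it
    params.sys_prompt_simple       = true;  // keep the system prompt as a simple prefix in top context
    params.system_prompt_intensity = 0.5f;  // -1.0 to +1.0; not used in simple mode
    params.enclose_finetune        = true;  // same flag that -enc / -ins set on the command line
}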

ggml-cuda.cu

+1-1
@@ -3167,7 +3167,7 @@ void ggml_cuda_set_main_device(int main_device) {
     // we accept setting it before initialization
     g_system_gpu_status.main_device_id = main_device;
 }
-void ggml_cuda_set_vram_reserved(int vram_reserved_bytes) {
+void ggml_cuda_set_vram_reserved(int64_t vram_reserved_bytes) {
     for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i)
     {
         g_system_gpu_status.device_vram_reserved[i] = vram_reserved_bytes;

ggml-cuda.h

+2-2
@@ -21,7 +21,7 @@ typedef struct {
     size_t total_free_vram;
     size_t device_vram_free[GGML_CUDA_MAX_DEVICES];
     size_t device_vram_total[GGML_CUDA_MAX_DEVICES];
-    int device_vram_reserved[GGML_CUDA_MAX_DEVICES]; // overrides reserved vram - may be negative to force vram swapping
+    int64_t device_vram_reserved[GGML_CUDA_MAX_DEVICES]; // overrides reserved vram - may be negative to force vram swapping
     struct cudaDeviceProp device_props[GGML_CUDA_MAX_DEVICES];

 } GPUStatus;
@@ -34,7 +34,7 @@ bool ggml_init_cublas(bool check_only);
 void ggml_cuda_update_gpu_status(int device_id);
 void ggml_cuda_print_gpu_status(const GPUStatus *status, bool print_summary);
 void ggml_cuda_set_max_gpus(int max_gpus);
-void ggml_cuda_set_vram_reserved(int vram_reserved);
+void ggml_cuda_set_vram_reserved(int64_t vram_reserved);
 void ggml_cuda_set_tensor_split_prepare(const float * tensor_split, int num_devices);
 void ggml_cuda_set_tensor_split(const float * tensor_split);
