
Commit 1d6e234

cmp-nct and John authored
Split loader 1 (ggml-org#53)
* Split of model loader and context, mostly similar to how llama.cpp already did it:
  falcon_context now contains the falcon_model and a pointer to the vocab; the model owns the vocab now.
  This allows conveniently loading more than one context, as well as loading multiple models at once.
  A ton of changes - if anything acts funky, please report.

* System prompt improvements
  Structural improvements on libfalcon

* updates

* Further adapted system prompts for all fine tunes

---------

Co-authored-by: John <[email protected]>
1 parent c12b2d6 commit 1d6e234
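The ownership layout the commit message describes (the model owns the vocab, a context holds the model plus a convenience pointer to that vocab) can be pictured with the minimal sketch below. The member types and names here are illustrative assumptions, not the actual libfalcon definitions; the point is only that several contexts can share one model, and several models can coexist in one process.

// Illustrative sketch only - not the libfalcon source.
#include <memory>
#include <string>
#include <vector>

struct falcon_vocab {
    std::vector<std::string> id_to_token;   // placeholder token table
};

struct falcon_model {
    falcon_vocab vocab;                     // the vocab now lives in the model
    // ... weights / tensors would live here ...
};

struct falcon_context {
    std::shared_ptr<falcon_model> model;    // several contexts can share one model
    const falcon_vocab *vocab;              // convenience pointer into model->vocab
    // ... per-context state: KV cache, logits, sampling/RNG state ...
};

int main() {
    auto model = std::make_shared<falcon_model>();
    falcon_context ctx_a{model, &model->vocab};  // two contexts, one loaded model
    falcon_context ctx_b{model, &model->vocab};
    (void)ctx_a; (void)ctx_b;
    return 0;
}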

File tree

9 files changed: +557 −296 lines changed


.github/workflows/release.yml

+6-6
@@ -243,12 +243,12 @@ jobs:
       & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
       .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
-    - name: Test
-      id: cmake_test
-      if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}  # Test AVX-512 only when possible
-      run: |
-        cd build
-        ctest -C Release --verbose
+    # - name: Test
+    #   id: cmake_test
+    #   if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}  # Test AVX-512 only when possible
+    #   run: |
+    #     cd build
+    #     ctest -C Release --verbose

     - name: Get commit hash
       id: commit

examples/falcon/falcon_main.cpp

+245-86
Large diffs are not rendered by default.

examples/falcon_common.cpp

+24-11
@@ -117,6 +117,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     // until thread scheduling is improved, these numbers are around the optimal (for huge batch processing increase -t manually)
     if (params.n_threads > 8) params.n_threads = 4;
     if (params.n_threads > 4) params.n_threads = 2;
+    params.seed = (int) time(NULL); // initiate a seed - we need one if multiple context used with similar input
+


     for (int i = 1; i < argc; i++) {
@@ -338,6 +340,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.interactive_first = true;
         } else if (arg == "-ins" || arg == "--instruct") {
             params.instruct = true;
+            params.interactive = true;
+            params.enclose_finetune = true;
         } else if (arg == "--multiline-input") {
             params.multiline_input = true;
         } else if (arg == "--color") {
@@ -384,7 +388,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         }
 #ifdef GGML_USE_CUBLAS
         params.mb_reserve_gpu_main = std::stoi(argv[i]);
-        ggml_cuda_set_vram_reserved(params.mb_reserve_gpu_main * 1024*1024);
+        ggml_cuda_set_vram_reserved(((int64_t)params.mb_reserve_gpu_main)*1024*1024);
 #else
         fprintf(stderr, "warning: falcon.cpp was compiled without cuBLAS. VRAM not available.\n");
 #endif
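The cast in this hunk matters because the megabyte count is parsed into a 32-bit int: with 32-bit arithmetic, any reservation above 2047 MB overflows before the byte count ever reaches ggml_cuda_set_vram_reserved(). The small standalone program below illustrates the effect; it is not part of the commit, and the variable names are made up.

// Illustrative only - why widening to int64_t before multiplying is needed.
#include <cstdint>
#include <cstdio>

int main() {
    int mb_reserve = 4096;                                   // e.g. reserve 4 GiB
    // int overflow_bytes = mb_reserve * 1024 * 1024;        // int math would overflow (undefined behaviour)
    int64_t bytes = ((int64_t)mb_reserve) * 1024 * 1024;     // widen first, then multiply
    std::printf("%lld bytes reserved\n", (long long)bytes);  // prints 4294967296
    return 0;
}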
@@ -537,19 +541,22 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, " -h, --help show this help message and exit\n");
-    fprintf(stderr, " -i, --interactive run in interactive mode\n");
-    fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n");
-    fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
+    fprintf(stderr, " -i, --interactive, -ins \n");
+    fprintf(stderr, " run in interactive chat mode\n");
+    fprintf(stderr, " --interactive-first wait for user input after prompt ingestion\n");
+    // fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
     fprintf(stderr, " -a,--alias,--finetune Set model name alias and optionally force fine-tune type (or disable it)\n");
     fprintf(stderr, " Finetune options: wizard, falcon-ins, open-assistant, alpaca, none\n");
     fprintf(stderr, " Use if finetune autodetection does not or wrongly recognizes your model or filename\n");
-    fprintf(stderr, " -sys, --system prefix the entire prompt with the system prompt text\n");
+    fprintf(stderr, " -sys, --system <> prefix the entire prompt with the system prompt text\n");
+    fprintf(stderr, " -sysraw, --system-raw treat the system prompt raw (do not add syntax)\n");
+    // fprintf(stderr, " --sys_prompt_simple trust the model to follow the system prompt instead of using evaluated sampling adaption\n");
     fprintf(stderr, " -enc, --enclose enclose the prompt in fine-tune optimal syntax\n");
     fprintf(stderr, " This automatically chooses the correct syntax to write around your prompt.\n");
     fprintf(stderr, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
-    fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stderr, " halt generation at PROMPT, return control in interactive mode\n");
-    fprintf(stderr, " (can be specified more than once for multiple prompts).\n");
+    // fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
+    // fprintf(stderr, " halt generation at PROMPT, return control in interactive mode\n");
+    // fprintf(stderr, " (can be specified more than once for multiple prompts).\n");
     fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
     fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
     fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
@@ -567,7 +574,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
     fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
     fprintf(stderr, " -f FNAME, --file FNAME\n");
-    fprintf(stderr, " prompt file to start generation.\n");
+    fprintf(stderr, " read prompt from a file, optionally -p prompt is prefixed\n");
     fprintf(stderr, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
     fprintf(stderr, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
     fprintf(stderr, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
@@ -653,8 +660,8 @@ std::vector<falcon_token> falcon_tokenize(struct falcon_context * ctx, const std

     return res;
 }
-
-struct falcon_context * falcon_init_from_gpt_params(const gpt_params & params) {
+struct falcon_context_params falcon_context_params_create(const gpt_params &params)
+{
     auto lparams = falcon_context_default_params();

     lparams.n_ctx = params.n_ctx;
@@ -669,6 +676,12 @@ struct falcon_context * falcon_init_from_gpt_params(const gpt_params & params) {
     lparams.logits_all = params.perplexity;
     lparams.embedding = params.embedding;

+    return lparams;
+}
+
+struct falcon_context * falcon_init_from_gpt_params(const gpt_params & params) {
+
+    struct falcon_context_params lparams = falcon_context_params_create(params);
     falcon_context * lctx = falcon_init_from_file(params.model.c_str(), lparams);

     if (lctx == NULL) {
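One consequence of factoring falcon_context_params_create() out of falcon_init_from_gpt_params() is that callers can build the context parameters once and open several contexts against the same model file. The sketch below shows that pattern; it is not part of the commit, and the include path, helper name, and error handling are assumptions.

// Hedged usage sketch - not from the commit.
#include <cstdio>
#include "falcon_common.h"

static bool open_two_contexts(const gpt_params &params,
                              falcon_context *&ctx_a, falcon_context *&ctx_b) {
    struct falcon_context_params lparams = falcon_context_params_create(params);

    ctx_a = falcon_init_from_file(params.model.c_str(), lparams);
    ctx_b = falcon_init_from_file(params.model.c_str(), lparams);
    if (ctx_a == NULL || ctx_b == NULL) {
        fprintf(stderr, "error: failed to load model '%s'\n", params.model.c_str());
        return false;
    }
    return true; // each context keeps its own KV cache and sampling state
}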

examples/falcon_common.h

+8-4
@@ -39,9 +39,9 @@ struct gpt_params {
     // sampling parameters
     std::unordered_map<falcon_token, float> logit_bias; // logit bias for specific tokens
     int32_t top_k = 40; // <= 0 to use vocab size
-    float top_p = 0.95f; // 1.0 = disabled
-    float tfs_z = 1.00f; // 1.0 = disabled
-    float typical_p = 1.00f; // 1.0 = disabled
+    float top_p = 0.95f; // 1.0 = disabled
+    float tfs_z = 1.00f; // 1.0 = disabled (temperature, frequency, and presence scaling)
+    float typical_p = 1.00f; // 1.0 = disabled
     float temp = 0.80f; // 1.0 = disabled
     float repeat_penalty = 1.10f; // 1.0 = disabled
     int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
@@ -50,12 +50,14 @@ struct gpt_params {
     int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate
+    float system_prompt_intensity = 0.50f; // -1.0 to +1.0 the intensity of the system prompt (not with simple mode)

     std::string model = "models/7B/ggml-model.bin"; // model path
     std::string model_alias = "unknown"; // model alias
     t_finetune_type finetune_type = FINETUNE_UNSPECIFIED; // finetune type
     std::string prompt = "";
     std::string system_prompt = ""; // optional system prompt for complex finetunes
+    std::string system_baseline_prompt = ""; // not in use
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with
     std::string input_suffix = ""; // string to suffix user inputs with
@@ -67,6 +69,8 @@ struct gpt_params {
     std::string stopwords = ""; // comma separated list of stopwords (<|endoftext|> is handled by --ignore-eos)
     bool enclose_finetune = false; // enclose prompt with correct tokens for finetuned model
     bool sys_prompt_is_raw = false; // The given system prompt will be used without adaptation
+    bool sys_prompt_simple = true; // System prompt is a simple prompt prefix kept in top context instead of the deep eval method (not ready yet)
+
     bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
@@ -104,7 +108,7 @@ std::vector<falcon_token> falcon_tokenize(struct falcon_context * ctx, const std
 //
 // Model utils
 //
-
+struct falcon_context_params falcon_context_params_create(const gpt_params &params);
 struct falcon_context * falcon_init_from_gpt_params(const gpt_params & params);

 //
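For orientation, the new system-prompt related fields added to gpt_params above might be set along these lines. The values and the helper function are illustrative assumptions, not code from the commit; only the field names come from the hunk above.

// Hedged usage sketch of the new gpt_params fields - not from the commit.
#include "falcon_common.h"

static void configure_system_prompt(gpt_params &params) {
    params.system_prompt           = "You are a concise assistant.";
    params.sys_prompt_is_raw       = false; // let the finetune syntax be added around it
    params.sys_prompt_simple       = true;  // keep the system prompt as a simple prefix in top context
    params.system_prompt_intensity = 0.5f;  // -1.0 to +1.0; not used in simple mode
    params.enclose_finetune        = true;  // same flag that -enc / -ins set on the command line
}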

ggml-cuda.cu

+1-1
@@ -3167,7 +3167,7 @@ void ggml_cuda_set_main_device(int main_device) {
     // we accept setting it before initialization
     g_system_gpu_status.main_device_id = main_device;
 }
-void ggml_cuda_set_vram_reserved(int vram_reserved_bytes) {
+void ggml_cuda_set_vram_reserved(int64_t vram_reserved_bytes) {
     for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i)
     {
         g_system_gpu_status.device_vram_reserved[i] = vram_reserved_bytes;

ggml-cuda.h

+2-2
@@ -21,7 +21,7 @@ typedef struct {
     size_t total_free_vram;
     size_t device_vram_free[GGML_CUDA_MAX_DEVICES];
     size_t device_vram_total[GGML_CUDA_MAX_DEVICES];
-    int device_vram_reserved[GGML_CUDA_MAX_DEVICES]; // overrides reserved vram - may be negative to force vram swapping
+    int64_t device_vram_reserved[GGML_CUDA_MAX_DEVICES]; // overrides reserved vram - may be negative to force vram swapping
     struct cudaDeviceProp device_props[GGML_CUDA_MAX_DEVICES];

 } GPUStatus;
@@ -34,7 +34,7 @@ bool ggml_init_cublas(bool check_only);
 void ggml_cuda_update_gpu_status(int device_id);
 void ggml_cuda_print_gpu_status(const GPUStatus *status, bool print_summary);
 void ggml_cuda_set_max_gpus(int max_gpus);
-void ggml_cuda_set_vram_reserved(int vram_reserved);
+void ggml_cuda_set_vram_reserved(int64_t vram_reserved);
 void ggml_cuda_set_tensor_split_prepare(const float * tensor_split, int num_devices);
 void ggml_cuda_set_tensor_split(const float * tensor_split);
