@@ -117,6 +117,15 @@ int main(int argc, char ** argv) {
                 params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
     }
 
+
+    // load the token grammar from params.token_grammar_path
+    std::string token_grammar_path = params.token_grammar_path;
+    void * grammar = nullptr;
+    if (!token_grammar_path.empty()) {
+        fprintf(stderr, "%s: attempting to parse token grammar from '%s'\n", __func__, token_grammar_path.c_str());
+        grammar = llama_load_token_grammar_from_path(token_grammar_path.c_str());
+    }
+
     // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
     // uncomment the "used_mem" line in llama.cpp to see the results
     if (params.mem_test) {
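
`llama_load_token_grammar_from_path` is introduced by this patch rather than by the existing llama.cpp API, and its implementation is not part of this hunk. As a rough, self-contained sketch of what the load step could look like — the `toy_grammar` type, the one-token-id-per-line file format, and the `load_token_grammar_from_path` name are all invented here for illustration:

```cpp
#include <cstdio>
#include <fstream>
#include <unordered_set>

// Invented stand-in for the patch's grammar object; the real type behind the
// void * handle is not shown in this diff.
struct toy_grammar {
    std::unordered_set<int> allowed; // token ids the grammar permits
};

// Hypothetical load step: here a "grammar" file is just one token id per line.
void * load_token_grammar_from_path(const char * path) {
    std::ifstream in(path);
    if (!in) {
        fprintf(stderr, "failed to open token grammar '%s'\n", path);
        return nullptr; // failure leaves the caller's grammar == nullptr
    }
    toy_grammar * g = new toy_grammar;
    int id;
    while (in >> id) {
        g->allowed.insert(id);
    }
    return g; // the caller stores it as an opaque void *, as the patch does
}

int main() {
    void * grammar = load_token_grammar_from_path("grammar.txt");
    fprintf(stderr, "grammar %s\n", grammar ? "loaded" : "not loaded");
    delete static_cast<toy_grammar *>(grammar);
}
```

Returning `nullptr` on failure lines up with the hunk above, where an empty `params.token_grammar_path` also leaves `grammar` as `nullptr`, so every later call site has to tolerate a missing grammar.
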
@@ -420,6 +429,7 @@ int main(int argc, char ** argv) {
                     llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
                     // Apply penalties
+                    llama_grammar_penalty(ctx, &candidates_p, grammar);
                     float nl_logit = logits[llama_token_nl()];
                     auto last_n_repeat = std::min(std::min((int) last_n_tokens.size(), repeat_last_n), n_ctx);
                     llama_sample_repetition_penalty(ctx, &candidates_p,
@@ -459,6 +469,7 @@ int main(int argc, char ** argv) {
 
                     last_n_tokens.erase(last_n_tokens.begin());
                     last_n_tokens.push_back(id);
+                    llama_grammar_accept_token(ctx, id, grammar);
                 }
 
                 // replace end of text token with newline token when in interactive mode
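
Likewise, `llama_grammar_penalty` and `llama_grammar_accept_token` are this patch's additions, and their bodies are not shown here. A plausible reading of the two call sites — penalize the candidates before the repetition penalties run, accept only after a token has been committed — is that the first masks candidates the grammar currently forbids and the second advances the grammar state. A minimal sketch under that assumption, with invented `token_data`/`toy_grammar` types and a stateless whitelist standing in for a real grammar:

```cpp
#include <cstdio>
#include <unordered_set>
#include <vector>

struct token_data { int id; float logit; };              // stand-in for llama_token_data

struct toy_grammar { std::unordered_set<int> allowed; }; // stateless whitelist "grammar"

// analogous to llama_grammar_penalty: force forbidden candidates to -inf
void grammar_penalty(std::vector<token_data> & candidates, const toy_grammar * g) {
    if (!g) return;                                      // no grammar loaded: leave logits alone
    for (auto & c : candidates) {
        if (!g->allowed.count(c.id)) {
            c.logit = -1e9f;
        }
    }
}

// analogous to llama_grammar_accept_token: advance state after sampling;
// a real grammar would step its parse state and recompute the allowed set
void grammar_accept_token(toy_grammar * /*g*/, int /*id*/) {
    // stateless toy grammar: nothing to update
}

int main() {
    toy_grammar g{{2, 3}};
    std::vector<token_data> candidates = {{1, 0.9f}, {2, 0.1f}, {3, -0.4f}};
    grammar_penalty(candidates, &g);
    // greedy pick as a stand-in for the real sampler chain
    token_data best = candidates[0];
    for (const auto & c : candidates) {
        if (c.logit > best.logit) best = c;
    }
    grammar_accept_token(&g, best.id);
    printf("sampled token %d\n", best.id);               // prints 2: token 1 was masked
}
```

Running the mask before the other penalties means a forbidden token stays at effectively minus infinity however the later samplers rescale the surviving logits, which would explain why the patch places the call at the top of the penalty block.
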