
Commit d808fd4

Update llama.cpp
1 parent 53861c9 commit d808fd4

File tree

2 files changed: +63 -36 lines changed

llama_cpp/llama_cpp.py
vendor/llama.cpp

llama_cpp/llama_cpp.py

+62-35
@@ -243,6 +243,7 @@ class llama_token_data_array(Structure):
 # llama_token * token;
 # float * embd;
 # llama_pos * pos;
+# int32_t * n_seq_id;
 # llama_seq_id ** seq_id;
 # int8_t * logits;

@@ -262,6 +263,7 @@ class llama_batch(Structure):
         ("token", POINTER(llama_token)),
         ("embd", c_float_p),
         ("pos", POINTER(llama_pos)),
+        ("n_seq_id", POINTER(c_int32)),
         ("seq_id", POINTER(POINTER(llama_seq_id))),
         ("logits", POINTER(c_int8)),
         ("all_pos_0", llama_pos),
@@ -312,7 +314,7 @@ class llama_model_params(Structure):


 # // Keep the booleans together to avoid misalignment during copy-by-value.
-# bool mul_mat_q; // if true, use experimental mul_mat_q kernels
+# bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
 # bool f16_kv; // use fp16 for KV cache, fp32 otherwise
 # bool logits_all; // the llama_eval() call computes all logits, not just the last one
 # bool embedding; // embedding mode only
@@ -349,6 +351,7 @@ class llama_context_params(Structure):
 # bool allow_requantize; // allow quantizing non-f32/f16 tensors
 # bool quantize_output_tensor; // quantize output.weight
 # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+# bool pure; // disable k-quant mixtures and quantize all tensors to the same type
 # } llama_model_quantize_params;
 class llama_model_quantize_params(Structure):
     _fields_ = [
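
Illustrative only: how the new `pure` flag could be used from the bindings, assuming it is also exposed in the ctypes `_fields_` of `llama_model_quantize_params` and that `llama_model_quantize_default_params()` and `llama_model_quantize()` are bound as in upstream llama.cpp. The file paths and the ftype constant name are hypothetical:

```python
import llama_cpp

params = llama_cpp.llama_model_quantize_default_params()
params.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M  # target quantization type (assumed constant name)
params.pure = True  # quantize every tensor to ftype, skipping k-quant mixtures

ret = llama_cpp.llama_model_quantize(
    b"models/7B/ggml-model-f16.gguf",     # hypothetical input path
    b"models/7B/ggml-model-q4_k_m.gguf",  # hypothetical output path
    params,
)
assert ret == 0, "quantization failed"
```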
@@ -777,26 +780,21 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
 _lib.llama_get_kv_cache_token_count.restype = c_int


-# // Remove all tokens data of cells in [c0, c1)
-# // c0 < 0 : [0, c1]
-# // c1 < 0 : [c0, inf)
-# LLAMA_API void llama_kv_cache_tokens_rm(
-# struct llama_context * ctx,
-# int32_t c0,
-# int32_t c1);
-def llama_kv_cache_tokens_rm(
-    ctx: llama_context_p, c0: Union[c_int32, int], c1: Union[c_int32, int]
-):
-    return _lib.llama_kv_cache_tokens_rm(ctx, c0, c1)
+# // Clear the KV cache
+# LLAMA_API void llama_kv_cache_clear(
+# struct llama_context * ctx);
+def llama_kv_cache_clear(ctx: llama_context_p):
+    return _lib.llama_kv_cache_clear(ctx)


-_lib.llama_kv_cache_tokens_rm.argtypes = [llama_context_p, c_int32, c_int32]
-_lib.llama_kv_cache_tokens_rm.restype = None
+_lib.llama_kv_cache_clear.argtypes = [llama_context_p]
+_lib.llama_kv_cache_clear.restype = None


 # // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-# // p0 < 0 : [0, p1]
-# // p1 < 0 : [p0, inf)
+# // seq_id < 0 : match any sequence
+# // p0 < 0 : [0, p1]
+# // p1 < 0 : [p0, inf)
 # LLAMA_API void llama_kv_cache_seq_rm(
 # struct llama_context * ctx,
 # llama_seq_id seq_id,
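
A hedged usage sketch (not part of the diff) of the replacement API: `llama_kv_cache_clear` drops everything, while the existing `llama_kv_cache_seq_rm` covers the per-range cases that `llama_kv_cache_tokens_rm` used to handle. The model path is hypothetical; the function names are the bindings' own:

```python
import llama_cpp

# Hypothetical GGUF model path.
model_params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"models/7B/ggml-model-q4_0.gguf", model_params)
ctx_params = llama_cpp.llama_context_default_params()
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

# ... decode some batches ...

# Drop every cached token, regardless of sequence (replaces the removed llama_kv_cache_tokens_rm).
llama_cpp.llama_kv_cache_clear(ctx)

# Or remove only positions [32, inf) of sequence 0, e.g. to roll back part of one sequence.
llama_cpp.llama_kv_cache_seq_rm(ctx, 0, 32, -1)

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
```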
@@ -1502,7 +1500,7 @@ def llama_sample_classifier_free_guidance(
 _lib.llama_sample_classifier_free_guidance.restype = None


-# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+# /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 # LLAMA_API void llama_sample_softmax(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates);
@@ -1519,7 +1517,7 @@ def llama_sample_softmax(
 _lib.llama_sample_softmax.restype = None


-# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+# /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
 # LLAMA_API void llama_sample_top_k(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,
@@ -1543,7 +1541,7 @@ def llama_sample_top_k(
 _lib.llama_sample_top_k.restype = None


-# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+# /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
 # LLAMA_API void llama_sample_top_p(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,
@@ -1567,7 +1565,31 @@ def llama_sample_top_p(
 _lib.llama_sample_top_p.restype = None


-# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+# /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+# LLAMA_API void llama_sample_min_p(
+# struct llama_context * ctx,
+# llama_token_data_array * candidates,
+# float p,
+# size_t min_keep);
+def llama_sample_min_p(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    p: Union[c_float, float],
+    min_keep: Union[c_size_t, int],
+):
+    return _lib.llama_sample_min_p(ctx, candidates, p, min_keep)
+
+
+_lib.llama_sample_min_p.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    c_float,
+    c_size_t,
+]
+_lib.llama_sample_min_p.restype = None
+
+
+# /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
 # LLAMA_API void llama_sample_tail_free(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,
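
The min-p rule behind the new binding is easy to state: keep only tokens whose probability is at least `p` times the most probable token's probability, but never fewer than `min_keep` tokens. A pure-Python reference sketch (not part of the bindings) of that filter:

```python
import math

def min_p_filter(logits, p=0.05, min_keep=1):
    """Return the indices of tokens that survive min-p filtering."""
    m = max(logits)
    exps = [math.exp(x - m) for x in logits]  # numerically stable softmax
    total = sum(exps)
    probs = [e / total for e in exps]
    cutoff = p * max(probs)
    kept = [i for i, q in enumerate(probs) if q >= cutoff]
    if len(kept) < min_keep:
        # fall back to the min_keep most probable tokens
        kept = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)[:min_keep]
    return kept

print(min_p_filter([2.0, 1.9, 0.1, -3.0], p=0.3))  # -> [0, 1]
```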
@@ -1591,7 +1613,7 @@ def llama_sample_tail_free(
 _lib.llama_sample_tail_free.restype = None


-# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+# /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
 # LLAMA_API void llama_sample_typical(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,
@@ -1656,7 +1678,11 @@ def llama_sample_temperature(
 _lib.llama_sample_temperature.restype = None


-# LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+# /// @details Apply constraints from grammar
+# LLAMA_API void llama_sample_grammar(
+# struct llama_context * ctx,
+# llama_token_data_array * candidates,
+# const struct llama_grammar * grammar);
 def llama_sample_grammar(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
@@ -1673,12 +1699,12 @@ def llama_sample_grammar(
 _lib.llama_sample_grammar.restype = None


-# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
-# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+# /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+# /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+# /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+# /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+# /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
 # LLAMA_API llama_token llama_sample_token_mirostat(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,
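
Worth noting for callers of this binding: `mu` is in/out state that the sampler updates on every step, so it has to live in a ctypes `c_float` that persists across calls. A hedged sketch, assuming `ctx` and a prepared `candidates_p` pointer to a `llama_token_data_array` (see the sampling-chain sketch after the last hunk):

```python
from ctypes import byref, c_float

tau, eta, m = 5.0, 0.1, 100
mu = c_float(2.0 * tau)  # per the docs above, mu starts at 2 * tau and is updated in place

token = llama_cpp.llama_sample_token_mirostat(
    ctx, candidates_p, tau, eta, m, byref(mu)
)
```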
@@ -1708,11 +1734,11 @@ def llama_sample_token_mirostat(
 _lib.llama_sample_token_mirostat.restype = llama_token


-# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+# /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+# /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+# /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+# /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
 # LLAMA_API llama_token llama_sample_token_mirostat_v2(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,
@@ -1739,7 +1765,8 @@ def llama_sample_token_mirostat_v2(
 _lib.llama_sample_token_mirostat_v2.restype = llama_token


-# @details Selects the token with the highest probability.
+# /// @details Selects the token with the highest probability.
+# /// Does not compute the token probabilities. Use llama_sample_softmax() instead.
 # LLAMA_API llama_token llama_sample_token_greedy(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates);
@@ -1757,7 +1784,7 @@ def llama_sample_token_greedy(
 _lib.llama_sample_token_greedy.restype = llama_token


-# @details Randomly selects a token from the candidates based on their probabilities.
+# /// @details Randomly selects a token from the candidates based on their probabilities.
 # LLAMA_API llama_token llama_sample_token(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates);
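
Finally, a hedged end-to-end sketch (not part of this commit) of building a `llama_token_data_array` from the last logits and running it through the sampler chain, including the new `llama_sample_min_p`. It assumes `ctx` and `model` from the KV-cache example above; the other names follow llama_cpp.py:

```python
import ctypes

n_vocab = llama_cpp.llama_n_vocab(model)
logits = llama_cpp.llama_get_logits(ctx)  # float* over the last evaluated token

data = (llama_cpp.llama_token_data * n_vocab)(
    *[llama_cpp.llama_token_data(id=i, logit=logits[i], p=0.0) for i in range(n_vocab)]
)
candidates = llama_cpp.llama_token_data_array()
candidates.data = ctypes.cast(data, ctypes.POINTER(llama_cpp.llama_token_data))
candidates.size = n_vocab
candidates.sorted = False
candidates_p = ctypes.byref(candidates)

llama_cpp.llama_sample_top_k(ctx, candidates_p, 40, 1)
llama_cpp.llama_sample_min_p(ctx, candidates_p, 0.05, 1)  # new binding in this commit
llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.95, 1)
llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.8)
token = llama_cpp.llama_sample_token(ctx, candidates_p)
```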

vendor/llama.cpp (+1 -1, submodule pointer updated)
