
Commit d808fd4

Update llama.cpp
1 parent 53861c9 commit d808fd4

File tree

2 files changed: +63 -36 lines changed

llama_cpp/llama_cpp.py
vendor/llama.cpp

llama_cpp/llama_cpp.py

+62-35
@@ -243,6 +243,7 @@ class llama_token_data_array(Structure):
 # llama_token * token;
 # float * embd;
 # llama_pos * pos;
+# int32_t * n_seq_id;
 # llama_seq_id ** seq_id;
 # int8_t * logits;

@@ -262,6 +263,7 @@ class llama_batch(Structure):
         ("token", POINTER(llama_token)),
         ("embd", c_float_p),
         ("pos", POINTER(llama_pos)),
+        ("n_seq_id", POINTER(c_int32)),
         ("seq_id", POINTER(POINTER(llama_seq_id))),
         ("logits", POINTER(c_int8)),
         ("all_pos_0", llama_pos),
@@ -312,7 +314,7 @@ class llama_model_params(Structure):


 # // Keep the booleans together to avoid misalignment during copy-by-value.
-# bool mul_mat_q; // if true, use experimental mul_mat_q kernels
+# bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
 # bool f16_kv; // use fp16 for KV cache, fp32 otherwise
 # bool logits_all; // the llama_eval() call computes all logits, not just the last one
 # bool embedding; // embedding mode only
@@ -349,6 +351,7 @@ class llama_context_params(Structure):
 # bool allow_requantize; // allow quantizing non-f32/f16 tensors
 # bool quantize_output_tensor; // quantize output.weight
 # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+# bool pure; // disable k-quant mixtures and quantize all tensors to the same type
 # } llama_model_quantize_params;
 class llama_model_quantize_params(Structure):
     _fields_ = [
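
Illustrative only: how the new `pure` flag could be used from the bindings, assuming it is also exposed in the ctypes `_fields_` of `llama_model_quantize_params` and that `llama_model_quantize_default_params()` and `llama_model_quantize()` are bound as in upstream llama.cpp. The file paths and the ftype constant name are hypothetical:

```python
import llama_cpp

params = llama_cpp.llama_model_quantize_default_params()
params.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M  # target quantization type (assumed constant name)
params.pure = True  # quantize every tensor to ftype, skipping k-quant mixtures

ret = llama_cpp.llama_model_quantize(
    b"models/7B/ggml-model-f16.gguf",     # hypothetical input path
    b"models/7B/ggml-model-q4_k_m.gguf",  # hypothetical output path
    params,
)
assert ret == 0, "quantization failed"
```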
@@ -777,26 +780,21 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
 _lib.llama_get_kv_cache_token_count.restype = c_int


-# // Remove all tokens data of cells in [c0, c1)
-# // c0 < 0 : [0, c1]
-# // c1 < 0 : [c0, inf)
-# LLAMA_API void llama_kv_cache_tokens_rm(
-# struct llama_context * ctx,
-# int32_t c0,
-# int32_t c1);
-def llama_kv_cache_tokens_rm(
-    ctx: llama_context_p, c0: Union[c_int32, int], c1: Union[c_int32, int]
-):
-    return _lib.llama_kv_cache_tokens_rm(ctx, c0, c1)
+# // Clear the KV cache
+# LLAMA_API void llama_kv_cache_clear(
+# struct llama_context * ctx);
+def llama_kv_cache_clear(ctx: llama_context_p):
+    return _lib.llama_kv_cache_clear(ctx)


-_lib.llama_kv_cache_tokens_rm.argtypes = [llama_context_p, c_int32, c_int32]
-_lib.llama_kv_cache_tokens_rm.restype = None
+_lib.llama_kv_cache_clear.argtypes = [llama_context_p]
+_lib.llama_kv_cache_clear.restype = None


 # // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-# // p0 < 0 : [0, p1]
-# // p1 < 0 : [p0, inf)
+# // seq_id < 0 : match any sequence
+# // p0 < 0 : [0, p1]
+# // p1 < 0 : [p0, inf)
 # LLAMA_API void llama_kv_cache_seq_rm(
 # struct llama_context * ctx,
 # llama_seq_id seq_id,
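
A hedged usage sketch (not part of the diff) of the replacement API: `llama_kv_cache_clear` drops everything, while the existing `llama_kv_cache_seq_rm` covers the per-range cases that `llama_kv_cache_tokens_rm` used to handle. The model path is hypothetical; the function names are the bindings' own:

```python
import llama_cpp

# Hypothetical GGUF model path.
model_params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_load_model_from_file(b"models/7B/ggml-model-q4_0.gguf", model_params)
ctx_params = llama_cpp.llama_context_default_params()
ctx = llama_cpp.llama_new_context_with_model(model, ctx_params)

# ... decode some batches ...

# Drop every cached token, regardless of sequence (replaces the removed llama_kv_cache_tokens_rm).
llama_cpp.llama_kv_cache_clear(ctx)

# Or remove only positions [32, inf) of sequence 0, e.g. to roll back part of one sequence.
llama_cpp.llama_kv_cache_seq_rm(ctx, 0, 32, -1)

llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
```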
@@ -1502,7 +1500,7 @@ def llama_sample_classifier_free_guidance(
 _lib.llama_sample_classifier_free_guidance.restype = None


-# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+# /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 # LLAMA_API void llama_sample_softmax(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates);
@@ -1519,7 +1517,7 @@ def llama_sample_softmax(
 _lib.llama_sample_softmax.restype = None


-# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+# /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
 # LLAMA_API void llama_sample_top_k(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,
@@ -1543,7 +1541,7 @@ def llama_sample_top_k(
 _lib.llama_sample_top_k.restype = None


-# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+# /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
 # LLAMA_API void llama_sample_top_p(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,
@@ -1567,7 +1565,31 @@ def llama_sample_top_p(
 _lib.llama_sample_top_p.restype = None


-# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+# /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+# LLAMA_API void llama_sample_min_p(
+# struct llama_context * ctx,
+# llama_token_data_array * candidates,
+# float p,
+# size_t min_keep);
+def llama_sample_min_p(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    p: Union[c_float, float],
+    min_keep: Union[c_size_t, int],
+):
+    return _lib.llama_sample_min_p(ctx, candidates, p, min_keep)
+
+
+_lib.llama_sample_min_p.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    c_float,
+    c_size_t,
+]
+_lib.llama_sample_min_p.restype = None
+
+
+# /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
 # LLAMA_API void llama_sample_tail_free(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,
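
The min-p rule behind the new binding is easy to state: keep only tokens whose probability is at least `p` times the most probable token's probability, but never fewer than `min_keep` tokens. A pure-Python reference sketch (not part of the bindings) of that filter:

```python
import math

def min_p_filter(logits, p=0.05, min_keep=1):
    """Return the indices of tokens that survive min-p filtering."""
    m = max(logits)
    exps = [math.exp(x - m) for x in logits]  # numerically stable softmax
    total = sum(exps)
    probs = [e / total for e in exps]
    cutoff = p * max(probs)
    kept = [i for i, q in enumerate(probs) if q >= cutoff]
    if len(kept) < min_keep:
        # fall back to the min_keep most probable tokens
        kept = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)[:min_keep]
    return kept

print(min_p_filter([2.0, 1.9, 0.1, -3.0], p=0.3))  # -> [0, 1]
```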
@@ -1591,7 +1613,7 @@ def llama_sample_tail_free(
 _lib.llama_sample_tail_free.restype = None


-# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+# /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
 # LLAMA_API void llama_sample_typical(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,
@@ -1656,7 +1678,11 @@ def llama_sample_temperature(
 _lib.llama_sample_temperature.restype = None


-# LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+# /// @details Apply constraints from grammar
+# LLAMA_API void llama_sample_grammar(
+# struct llama_context * ctx,
+# llama_token_data_array * candidates,
+# const struct llama_grammar * grammar);
 def llama_sample_grammar(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
@@ -1673,12 +1699,12 @@ def llama_sample_grammar(
 _lib.llama_sample_grammar.restype = None


-# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
-# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+# /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+# /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+# /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+# /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+# /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
 # LLAMA_API llama_token llama_sample_token_mirostat(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,
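
Worth noting for callers of this binding: `mu` is in/out state that the sampler updates on every step, so it has to live in a ctypes `c_float` that persists across calls. A hedged sketch, assuming `ctx` and a prepared `candidates_p` pointer to a `llama_token_data_array` (see the sampling-chain sketch after the last hunk):

```python
from ctypes import byref, c_float

tau, eta, m = 5.0, 0.1, 100
mu = c_float(2.0 * tau)  # per the docs above, mu starts at 2 * tau and is updated in place

token = llama_cpp.llama_sample_token_mirostat(
    ctx, candidates_p, tau, eta, m, byref(mu)
)
```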
@@ -1708,11 +1734,11 @@ def llama_sample_token_mirostat(
 _lib.llama_sample_token_mirostat.restype = llama_token


-# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+# /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+# /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+# /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+# /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
 # LLAMA_API llama_token llama_sample_token_mirostat_v2(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates,
@@ -1739,7 +1765,8 @@ def llama_sample_token_mirostat_v2(
 _lib.llama_sample_token_mirostat_v2.restype = llama_token


-# @details Selects the token with the highest probability.
+# /// @details Selects the token with the highest probability.
+# /// Does not compute the token probabilities. Use llama_sample_softmax() instead.
 # LLAMA_API llama_token llama_sample_token_greedy(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates);
@@ -1757,7 +1784,7 @@ def llama_sample_token_greedy(
 _lib.llama_sample_token_greedy.restype = llama_token


-# @details Randomly selects a token from the candidates based on their probabilities.
+# /// @details Randomly selects a token from the candidates based on their probabilities.
 # LLAMA_API llama_token llama_sample_token(
 # struct llama_context * ctx,
 # llama_token_data_array * candidates);
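
Finally, a hedged end-to-end sketch (not part of this commit) of building a `llama_token_data_array` from the last logits and running it through the sampler chain, including the new `llama_sample_min_p`. It assumes `ctx` and `model` from the KV-cache example above; the other names follow llama_cpp.py:

```python
import ctypes

n_vocab = llama_cpp.llama_n_vocab(model)
logits = llama_cpp.llama_get_logits(ctx)  # float* over the last evaluated token

data = (llama_cpp.llama_token_data * n_vocab)(
    *[llama_cpp.llama_token_data(id=i, logit=logits[i], p=0.0) for i in range(n_vocab)]
)
candidates = llama_cpp.llama_token_data_array()
candidates.data = ctypes.cast(data, ctypes.POINTER(llama_cpp.llama_token_data))
candidates.size = n_vocab
candidates.sorted = False
candidates_p = ctypes.byref(candidates)

llama_cpp.llama_sample_top_k(ctx, candidates_p, 40, 1)
llama_cpp.llama_sample_min_p(ctx, candidates_p, 0.05, 1)  # new binding in this commit
llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.95, 1)
llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.8)
token = llama_cpp.llama_sample_token(ctx, candidates_p)
```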

vendor/llama.cpp (+1 -1, submodule pointer updated)
