
Commit 40f74e4

llama : add option to render special/control tokens (#6807)
* make : fix common dep on llama.h
* llama : add option to render special tokens
* readme : add API change notice

ggml-ci

* swift : fix build
1 parent b9cc76d commit 40f74e4
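
To make the new behavior concrete, here is a hedged sketch (not part of the commit) assuming a loaded `llama_model * model`: with the added trailing argument set to `true`, control tokens such as EOS are rendered as their literal text; with `false` they are skipped, as before.

```cpp
// Sketch only. The exact text depends on the model's vocab
// (e.g. "</s>" on many SentencePiece-based vocabs).
#include <cstdio>
#include "llama.h"

void demo(const llama_model * model) {
    const llama_token eos = llama_token_eos(model);
    char buf[64];  // assumed large enough for this demo

    int32_t n = llama_token_to_piece(model, eos, buf, (int32_t) sizeof(buf), /*special=*/true);
    printf("rendered:   '%.*s' (%d bytes)\n", (int) n, buf, (int) n);

    n = llama_token_to_piece(model, eos, buf, (int32_t) sizeof(buf), /*special=*/false);
    printf("suppressed: %d bytes written\n", (int) n);  // 0 for control tokens
}
```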

7 files changed (+25, -20 lines)

Makefile (+1, -1)

@@ -699,7 +699,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
 COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o

 common.o: common/common.cpp $(COMMON_H_DEPS)

README.md (+1)

@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Recent API changes

+- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
 - [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
 - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
 - [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017

common/common.cpp (+2, -2)

@@ -2328,10 +2328,10 @@ std::vector<llama_token> llama_tokenize(

 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
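
Because the common helper now forwards `special = true`, example programs that print pieces through it will show control tokens verbatim. A hedged illustration (not part of the commit; assumes a loaded `llama_context * ctx`):

```cpp
// Sketch only: shows the observable effect of the common wrapper passing `true`.
#include <cstdio>
#include "common.h"
#include "llama.h"

void show_bos(llama_context * ctx) {
    const llama_token bos = llama_token_bos(llama_get_model(ctx));
    // Before this commit the wrapper returned "" for control tokens;
    // it now returns their stored text, e.g. "<s>" for a SentencePiece vocab.
    printf("BOS renders as: %s\n", llama_token_to_piece(ctx, bos).c_str());
}
```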

examples/batched.swift/Sources/main.swift (+3, -2)

@@ -229,15 +229,16 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {

 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
     var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
     if nTokens < 0 {
         let actualTokensCount = -Int(nTokens)
         result = .init(repeating: 0, count: actualTokensCount)
         let check = llama_token_to_piece(
             model,
             token,
             &result,
-            Int32(result.count)
+            Int32(result.count),
+            false
         )
         assert(check == actualTokensCount)
     } else {

examples/llama.swiftui/llama.cpp.swift/LibLlama.swift (+2, -2)

@@ -322,15 +322,15 @@ actor LlamaContext {
         defer {
             result.deallocate()
         }
-        let nTokens = llama_token_to_piece(model, token, result, 8)
+        let nTokens = llama_token_to_piece(model, token, result, 8, false)

         if nTokens < 0 {
             let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
             newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
             defer {
                 newResult.deallocate()
             }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
             let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
             return Array(bufferPointer)
         } else {

llama.cpp (+13, -12)

@@ -1600,12 +1600,12 @@ struct llama_mlock {
 };
 using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

-static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     }
     else {

@@ -13312,7 +13312,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c

     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id);
+        const std::string piece = llama_token_to_piece(ctx, id, false);
+
         if (llama_token_is_eog(&ctx->model, id)) {
             if (!allow_eog) {
                 candidates->data[i].logit = -INFINITY;

@@ -13512,7 +13513,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }

-    const std::string piece = llama_token_to_piece(ctx, token);
+    const std::string piece = llama_token_to_piece(ctx, token, false);

     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar->partial_utf8);

@@ -16991,7 +16992,7 @@ static std::string llama_decode_text(const std::string & text) {
 }

 // does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
+int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
             case LLAMA_VOCAB_TYPE_WPM:

@@ -17006,7 +17007,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
-            } else if (llama_is_user_defined_token(model->vocab, token)) {
+            } else if (
+                    (llama_is_user_defined_token(model->vocab, token)) ||
+                    (llama_is_control_token     (model->vocab, token) && special)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 if (length < (int) result.length()) {
                     return -(int) result.length();

@@ -17019,8 +17022,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, "\xe2\x96\x85", 3);
                 return 3;
-            } else if (llama_is_control_token(model->vocab, token)) {
-                ;
             } else if (llama_is_byte_token(model->vocab, token)) {
                 if (length < 1) {
                     return -1;

@@ -17041,15 +17042,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
-            } else if (llama_is_user_defined_token(model->vocab, token)) {
+            } else if (
+                    (llama_is_user_defined_token(model->vocab, token)) ||
+                    (llama_is_control_token     (model->vocab, token) && special)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 if (length < (int) result.length()) {
                     return -(int) result.length();
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
-            } else if (llama_is_control_token(model->vocab, token)) {
-                ;
             }
             break;
         }

llama.h (+3, -1)

@@ -828,11 +828,13 @@ extern "C" {
     // Uses the vocabulary in the provided context.
     // Does not write null terminator to the buffer.
     // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    // @param special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_token_to_piece(
         const struct llama_model * model,
         llama_token token,
         char * buf,
-        int32_t length);
+        int32_t length,
+        bool special);

     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
