Skip to content

Commit dd0d159

Browse files
author
jaime-m-p
committed
Fix special tokens rtrim
1 parent 5b61c04 commit dd0d159

File tree

1 file changed

+27
-4
lines changed

1 file changed

+27
-4
lines changed

llama.cpp

Lines changed: 27 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4582,7 +4582,8 @@ static void llm_load_vocab(
                         (t.first == "<|eot_id|>" ||
                          t.first == "<|im_end|>" ||
                          t.first == "<|end|>" ||
-                         t.first == "<end_of_turn>"
+                         t.first == "<end_of_turn>" ||
+                         t.first == "<|endoftext|>"
                         )
                     ) {
                     vocab.special_eot_id = t.second;
@@ -12800,6 +12801,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
         output.push_back(vocab.special_bos_id);
     }
 
+    static const bool rtrim = true; //TODO: as param
+    bool is_prev_special = false;
+    bool special_token_rtrim = false;
+
     for (const auto & fragment : fragment_buffer) {
         if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
             // without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -12809,9 +12814,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
             // and passing 'add space prefix' as bool argument
             //
             auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-            if (&fragment == &fragment_buffer.front()) {
-                if (vocab.add_space_prefix) {
-                    raw_text = " " + raw_text; // prefix with space if the first token is not special
+
+            if (special_token_rtrim) {
+                uint num_whitespaces = 0;
+                while (isspace(raw_text[num_whitespaces])) {
+                    num_whitespaces++;
+                }
+                if(num_whitespaces == raw_text.size()) {
+                    continue; // skip if all whitespaces
+                }
+                raw_text = raw_text.substr(num_whitespaces);
+            }
+
+            if(vocab.add_space_prefix) {
+                if (!output.size() || is_prev_special) { // prefix with space if first token
+                    raw_text = " " + raw_text;
                 }
             }
@@ -12823,6 +12840,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
             tokenizer.tokenize(raw_text, output);
         } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
             output.push_back(fragment.token);
+            is_prev_special = true;
+            // phi-3 special tokens without rtrim, works fine for llama-spm too
+            special_token_rtrim = rtrim
+                && fragment.token != vocab.special_bos_id
+                && fragment.token != vocab.special_unk_id
+                && fragment.token != vocab.special_eos_id;
         }
     }
 

0 commit comments

Comments
 (0)