@@ -4582,7 +4582,8 @@ static void llm_load_vocab(
4582
4582
(t.first == "<|eot_id|>" ||
4583
4583
t.first == "<|im_end|>" ||
4584
4584
t.first == "<|end|>" ||
4585
- t.first == "<end_of_turn>"
4585
+ t.first == "<end_of_turn>" ||
4586
+ t.first == "<|endoftext|>"
4586
4587
)
4587
4588
) {
4588
4589
vocab.special_eot_id = t.second;
@@ -12800,6 +12801,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12800
12801
output.push_back(vocab.special_bos_id);
12801
12802
}
12802
12803
12804
+ static const bool rtrim = true; //TODO: as param
12805
+ bool is_prev_special = false;
12806
+ bool special_token_rtrim = false;
12807
+
12803
12808
for (const auto & fragment : fragment_buffer) {
12804
12809
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
12805
12810
// without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -12809,9 +12814,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12809
12814
// and passing 'add space prefix' as bool argument
12810
12815
//
12811
12816
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
12812
- if (&fragment == &fragment_buffer.front()) {
12813
- if (vocab.add_space_prefix) {
12814
- raw_text = " " + raw_text; // prefix with space if the first token is not special
12817
+
12818
+ if (special_token_rtrim) {
12819
+ uint num_whitespaces = 0;
12820
+ while (isspace(raw_text[num_whitespaces])) {
12821
+ num_whitespaces++;
12822
+ }
12823
+ if(num_whitespaces == raw_text.size()) {
12824
+ continue; // skip if all whitespaces
12825
+ }
12826
+ raw_text = raw_text.substr(num_whitespaces);
12827
+ }
12828
+
12829
+ if(vocab.add_space_prefix) {
12830
+ if (!output.size() || is_prev_special) { // prefix with space if first token
12831
+ raw_text = " " + raw_text;
12815
12832
}
12816
12833
}
12817
12834
@@ -12823,6 +12840,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
12823
12840
tokenizer.tokenize(raw_text, output);
12824
12841
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
12825
12842
output.push_back(fragment.token);
12843
+ is_prev_special = true;
12844
+ // phi-3 special tokens without rtrim, works fine for llama-spm too
12845
+ special_token_rtrim = rtrim
12846
+ && fragment.token != vocab.special_bos_id
12847
+ && fragment.token != vocab.special_unk_id
12848
+ && fragment.token != vocab.special_eos_id;
12826
12849
}
12827
12850
}
12828
12851
0 commit comments