@@ -117,7 +117,7 @@ class LLM:
         disable_async_output_proc: Disable async output processing.
             This may result in lower performance.
         hf_token: The token to use as HTTP bearer authorization for remote files
-            . If `True`, will use the token generated when running
+            . If `True`, will use the token generated when running
             `huggingface-cli login` (stored in `~/.huggingface`).
         hf_overrides: If a dictionary, contains arguments to be forwarded to the
             HuggingFace config. If a callable, it is called to update the
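For context, a hedged sketch of how the documented `hf_token` argument is used; the model name and token handling here are placeholders, not part of the change:

```python
from vllm import LLM

# hf_token=True reuses the token cached by `huggingface-cli login`;
# a literal token string can be passed instead for gated/private repos.
llm = LLM(model="meta-llama/Llama-2-7b-hf", hf_token=True)
```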
@@ -251,8 +251,12 @@ def __init__(
         self.request_counter = Counter()
         self.default_sampling_params: Union[dict[str, Any], None] = None
 
-    def get_tokenizer(self) -> AnyTokenizer:
-        return self.llm_engine.get_tokenizer_group().tokenizer
+    def get_tokenizer(
+        self,
+        lora_request: Optional[LoRARequest] = None,
+    ) -> AnyTokenizer:
+        return self.llm_engine.get_tokenizer_group().get_lora_tokenizer(
+            lora_request)
 
     def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
         tokenizer_group = self.llm_engine.get_tokenizer_group()
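The reworked `get_tokenizer` now resolves the tokenizer through the tokenizer group, so callers can pass a `LoRARequest` to get the adapter-specific tokenizer, or omit it to fall back to the base one. A minimal usage sketch, assuming a LoRA-enabled engine; the model name, adapter name, and adapter path are placeholders:

```python
from vllm import LLM
from vllm.lora.request import LoRARequest

# Placeholder model/adapter; any LoRA-enabled setup works the same way.
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)
lora_request = LoRARequest("sql_adapter", 1, "/path/to/sql_adapter")

base_tokenizer = llm.get_tokenizer()              # no request: base tokenizer
lora_tokenizer = llm.get_tokenizer(lora_request)  # adapter-specific tokenizer
```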
@@ -712,7 +716,7 @@ def chat(
                 cast(list[ChatCompletionMessageParam], messages)
             ]
 
-        tokenizer = self.get_tokenizer()
+        tokenizer = self.get_tokenizer(lora_request)
         model_config = self.llm_engine.get_model_config()
         resolved_content_format = resolve_chat_template_content_format(
             chat_template,
@@ -735,9 +739,8 @@ def chat(
                 content_format=resolved_content_format,
             )
 
-            prompt_data: Union[str, list[int]]
             if isinstance(tokenizer, MistralTokenizer):
-                prompt_data = apply_mistral_chat_template(
+                prompt_token_ids = apply_mistral_chat_template(
                     tokenizer,
                     messages=msgs,
                     chat_template=chat_template,
@@ -746,7 +749,7 @@ def chat(
                     continue_final_message=continue_final_message,
                 )
             else:
-                prompt_data = apply_hf_chat_template(
+                prompt_str = apply_hf_chat_template(
                     tokenizer,
                     trust_remote_code=model_config.trust_remote_code,
                     conversation=conversation,
@@ -755,12 +758,12 @@ def chat(
                     add_generation_prompt=add_generation_prompt,
                     continue_final_message=continue_final_message,
                 )
+                # Special tokens are already included in chat templates so
+                # should not be added by the tokenizer in this case.
+                prompt_token_ids = tokenizer.encode(prompt_str,
+                                                    add_special_tokens=False)
 
-            prompt: Union[TokensPrompt, TextPrompt]
-            if is_list_of(prompt_data, int):
-                prompt = TokensPrompt(prompt_token_ids=prompt_data)
-            else:
-                prompt = TextPrompt(prompt=prompt_data)
+            prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)
 
             if mm_data is not None:
                 prompt["multi_modal_data"] = mm_data
@@ -1059,8 +1062,6 @@ def _embedding_score(
         if len(encoded_output_1) == 1:
             encoded_output_1 = encoded_output_1 * len(encoded_output_2)
 
-        scores: list[PoolingRequestOutput] = []
-
         scores = _cosine_similarity(tokenizer=tokenizer,
                                     embed_1=encoded_output_1,
                                     embed_2=encoded_output_2)