Commit a7595e1
Prepared the test to run vLLM with Llama-2 7B YaRN model
1 parent 21fdd8e commit a7595e1

File tree: 3 files changed, +22 −14 lines changed


tests/yarn/model_llama2_7b_vllm.py

Lines changed: 3 additions & 2 deletions
@@ -5,15 +5,16 @@
 from vllm import LLM, SamplingParams
 from verification_prompt import PROMPT

-MODEL_ID = 'Llama2/Llama-2-7B-fp16'
+# MODEL_ID = 'Llama2/Llama-2-7B-fp16'
+MODEL_ID = 'NousResearch/Yarn-Llama-2-7b-64k'
 MODEL_DIR = os.path.expanduser(f'~/models/{MODEL_ID}')


 class Model(BaseModel):
     def __init__(self):
         super().__init__()
         self.model_id = MODEL_ID
-        self.llm = LLM(model=MODEL_DIR,
+        self.llm = LLM(model=MODEL_DIR,  # Use MODEL_ID here to download the model using HF
                        # tokenizer='hf-internal-testing/llama-tokenizer',
                        tensor_parallel_size=2,
                        swap_space=8,
tests/yarn/model_llama2_7b_yarn.py

Lines changed: 1 addition & 3 deletions
@@ -17,16 +17,14 @@ def __init__(self):
         self.model_id = MODEL_ID
         self.pipeline = transformers.pipeline(
             "text-generation",
-            model=MODEL_DIR,
+            model=MODEL_DIR,  # Use MODEL_ID here to download the model using HF
             torch_dtype=torch.bfloat16,
             device_map="auto",
             trust_remote_code=True,
         )

     @property
     def max_context_size(self) -> int:
-        # FIXME: If you run out of VRAM, then limit the context size here
-        # return 8192
         return self.pipeline.model.base_model.config.max_position_embeddings

     def generate(self, prompt: str, *, n: int, max_new_tokens: int) -> List[str]:
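Also not part of the commit: the generate method is truncated by the hunk, so here is a sketch of how it might drive the transformers text-generation pipeline. The pipeline keyword arguments shown are standard, but which of them the real implementation uses is an assumption.

    def generate(self, prompt: str, *, n: int, max_new_tokens: int) -> List[str]:
        # The text-generation pipeline returns a list of dicts with a 'generated_text' key
        results = self.pipeline(prompt,
                                num_return_sequences=n,
                                max_new_tokens=max_new_tokens,
                                do_sample=True,
                                return_full_text=False)
        return [result['generated_text'] for result in results]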

tests/yarn/pass_key_evaluator.py

Lines changed: 18 additions & 9 deletions
@@ -1,5 +1,5 @@
 import random
-from typing import Tuple, Iterable
+from typing import Tuple, Iterable, Optional

 from base_model import BaseModel

@@ -56,23 +56,32 @@ def evaluate(self, max_tokens: int, resolution: int = 100, n: int = 10) -> Itera
         yield key_position, prefix_token_count, success_count


-def evaluate_vllm():
-    from model_llama2_7b_vllm import Model
-    # from model_llama2_7b_yarn import Model
-
-    model = Model()
-
+def evaluate_vllm(model: BaseModel, context_size_limit: Optional[int] = None):
     context_size = model.max_context_size
+    if context_size_limit is not None:
+        context_size = context_size_limit
+
     print(f'Model: {model.model_id}')
     print(f'Model context size: {context_size}')

     evaluator = PassKeyEvaluator(model)
-    for result in evaluator.evaluate(context_size, 100, 3):
+    for result in evaluator.evaluate(context_size, 100, 2):
         print(result)


 def main():
-    evaluate_vllm()
+    # Select the model to test here
+    from model_llama2_7b_vllm import Model
+    # from model_llama2_7b_yarn import Model
+    model = Model()
+
+    # If you run out of VRAM, then pass a smaller context size here
+
+    # Limited to 8k
+    evaluate_vllm(model, 8192)
+
+    # Unlimited
+    # evaluate_vllm(model)


 if __name__ == '__main__':
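The PassKeyEvaluator internals are not shown in this diff. As a hedged illustration of the technique the test exercises, a single pass-key check might look roughly like the sketch below: hide a random number in long filler text and count how many of n completions repeat it back. Only the evaluate(...) signature and the yielded (key_position, prefix_token_count, success_count) tuple come from the diff; the helper name, prompt wording, and token budget are hypothetical, and the sketch assumes the module-level imports already in the file (random, BaseModel).

    def _check_pass_key(model: BaseModel, prefix: str, suffix: str, n: int) -> int:
        # Hide a random pass key between two blocks of filler text,
        # then ask the model to retrieve it
        pass_key = random.randint(10000, 99999)
        prompt = (f'{prefix}\n'
                  f'The pass key is {pass_key}. Remember it.\n'
                  f'{suffix}\n'
                  f'What is the pass key?')
        completions = model.generate(prompt, n=n, max_new_tokens=16)
        # Count how many of the n completions contain the hidden key
        return sum(str(pass_key) in completion for completion in completions)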
