```diff
     "What is annapurna labs?",
 ]
 
-# Create a sampling params object.
-sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)
-
-# Create an LLM.
-llm = LLM(
-    model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
-    speculative_config={
-        "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
-        "num_speculative_tokens": 5,
-        "max_model_len": 2048,
-    },
-    max_num_seqs=4,
-    # The max_model_len and block_size arguments are required to be same as
-    # max sequence length when targeting neuron device.
-    # Currently, this is a known limitation in continuous batching support
-    # in neuronx-distributed-inference.
-    max_model_len=2048,
-    block_size=2048,
-    # The device can be automatically detected when AWS Neuron SDK is installed.
-    # The device argument can be either unspecified for automated detection,
-    # or explicitly assigned.
-    device="neuron",
-    tensor_parallel_size=32,
-    override_neuron_config={
-        "enable_eagle_speculation": True,
-        "enable_fused_speculation": True,
-    },
-)
-
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}")
+
+def main():
+    # Create a sampling params object.
+    sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)
+
+    # Create an LLM.
+    llm = LLM(
+        model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
+        speculative_config={
+            "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
+            "num_speculative_tokens": 5,
+            "max_model_len": 2048,
+        },
+        max_num_seqs=4,
+        # The max_model_len and block_size arguments are required to be same as
+        # max sequence length when targeting neuron device.
+        # Currently, this is a known limitation in continuous batching support
+        # in neuronx-distributed-inference.
+        max_model_len=2048,
+        block_size=2048,
+        # The device can be automatically detected when AWS Neuron SDK is installed.
+        # The device argument can be either unspecified for automated detection,
+        # or explicitly assigned.
+        device="neuron",
+        tensor_parallel_size=32,
+        override_neuron_config={
+            "enable_eagle_speculation": True,
+            "enable_fused_speculation": True,
+        },
+    )
+
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()
```
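As a companion to the refactored example, the sketch below shows one hypothetical way to see what the EAGLE configuration buys you: build the engine once without `speculative_config` and once with it, and time a single `llm.generate` call in each mode. It is not part of this commit; the model paths, `tensor_parallel_size`, and Neuron flags are assumptions copied from the diff above, and holding two 70B engines in one process may exceed device memory, in which case the two runs would be executed as separate invocations.

```python
# Hypothetical benchmark sketch (not part of this commit): time generation
# with and without the EAGLE draft model. Paths and Neuron settings mirror
# the example above and are assumptions about the local environment.
import time

from vllm import LLM, SamplingParams

prompts = ["What is annapurna labs?"]
sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)

# Arguments shared by both engine builds, taken from the example above.
common_args = dict(
    model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
    max_num_seqs=4,
    max_model_len=2048,
    block_size=2048,
    device="neuron",
    tensor_parallel_size=32,
)


def timed_generate(llm: LLM) -> float:
    # Return elapsed wall-clock seconds for a single generate() call.
    start = time.perf_counter()
    llm.generate(prompts, sampling_params)
    return time.perf_counter() - start


# Baseline: the target model alone, no speculative decoding.
baseline = LLM(**common_args)
t_base = timed_generate(baseline)

# EAGLE speculative decoding, configured as in the example above.
eagle = LLM(
    speculative_config={
        "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
        "num_speculative_tokens": 5,
        "max_model_len": 2048,
    },
    override_neuron_config={
        "enable_eagle_speculation": True,
        "enable_fused_speculation": True,
    },
    **common_args,
)
t_eagle = timed_generate(eagle)

print(f"baseline: {t_base:.1f}s, eagle: {t_eagle:.1f}s")
```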