
Commit 2dc8170

reidliu41 authored and committed
[Misc] format and refactor some examples (vllm-project#16252)
Signed-off-by: reidliu41 <[email protected]>
Co-authored-by: reidliu41 <[email protected]>
1 parent 62b91aa commit 2dc8170

13 files changed: +190 -127 lines
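
Taken together, the diffs below apply one recurring pattern to each example: module-level code is wrapped in a main() function behind an if __name__ == "__main__": guard, the per-output message prints the prompt and the generated text on separate lines (a newline instead of ", " in the f-string), and print("-" * 50) separators frame the printed results. A minimal sketch of the resulting shape, assuming the standard vLLM offline API; the model name and prompt below are placeholders copied from the diffs, not a prescription:

from vllm import LLM, SamplingParams

# Placeholder prompt and sampling settings, taken from the diffs below.
prompts = ["Hello, my name is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


def main():
    # Work that previously ran at module level now lives inside main().
    llm = LLM(model="facebook/opt-125m")  # placeholder model from the diffs
    outputs = llm.generate(prompts, sampling_params)

    # Separator lines make each result easier to spot in the console output.
    print("-" * 50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        # Prompt and generated text are now printed on separate lines.
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 50)


if __name__ == "__main__":
    main()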

examples/offline_inference/mistral-small.py

Lines changed: 4 additions & 1 deletion
@@ -90,8 +90,9 @@ def run_simple_demo(args: argparse.Namespace):
         },
     ]
     outputs = llm.chat(messages, sampling_params=sampling_params)
-
+    print("-" * 50)
     print(outputs[0].outputs[0].text)
+    print("-" * 50)
 
 
 def run_advanced_demo(args: argparse.Namespace):
@@ -162,7 +163,9 @@ def run_advanced_demo(args: argparse.Namespace):
     ]
 
     outputs = llm.chat(messages=messages, sampling_params=sampling_params)
+    print("-" * 50)
     print(outputs[0].outputs[0].text)
+    print("-" * 50)
 
 
 def main():

examples/offline_inference/multilora_inference.py

Lines changed: 2 additions & 0 deletions
@@ -61,6 +61,7 @@ def process_requests(engine: LLMEngine,
     """Continuously process a list of prompts and handle the outputs."""
     request_id = 0
 
+    print("-" * 50)
     while test_prompts or engine.has_unfinished_requests():
         if test_prompts:
             prompt, sampling_params, lora_request = test_prompts.pop(0)
@@ -75,6 +76,7 @@ def process_requests(engine: LLMEngine,
         for request_output in request_outputs:
             if request_output.finished:
                 print(request_output)
+                print("-" * 50)
 
 
 def initialize_engine() -> LLMEngine:

examples/offline_inference/neuron.py

Lines changed: 33 additions & 24 deletions
@@ -12,27 +12,36 @@
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-# Create an LLM.
-llm = LLM(
-    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    max_num_seqs=8,
-    # The max_model_len and block_size arguments are required to be same as
-    # max sequence length when targeting neuron device.
-    # Currently, this is a known limitation in continuous batching support
-    # in transformers-neuronx.
-    # TODO(liangfu): Support paged-attention in transformers-neuronx.
-    max_model_len=1024,
-    block_size=1024,
-    # The device can be automatically detected when AWS Neuron SDK is installed.
-    # The device argument can be either unspecified for automated detection,
-    # or explicitly assigned.
-    device="neuron",
-    tensor_parallel_size=2)
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+def main():
+    # Create an LLM.
+    llm = LLM(
+        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        max_num_seqs=8,
+        # The max_model_len and block_size arguments are required to be same as
+        # max sequence length when targeting neuron device.
+        # Currently, this is a known limitation in continuous batching support
+        # in transformers-neuronx.
+        # TODO(liangfu): Support paged-attention in transformers-neuronx.
+        max_model_len=1024,
+        block_size=1024,
+        # ruff: noqa: E501
+        # The device can be automatically detected when AWS Neuron SDK is installed.
+        # The device argument can be either unspecified for automated detection,
+        # or explicitly assigned.
+        device="neuron",
+        tensor_parallel_size=2)
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+
+if __name__ == "__main__":
+    main()

examples/offline_inference/neuron_int8_quantization.py

Lines changed: 37 additions & 28 deletions
@@ -22,31 +22,40 @@
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-# Create an LLM.
-llm = LLM(
-    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    max_num_seqs=8,
-    # The max_model_len and block_size arguments are required to be same as
-    # max sequence length when targeting neuron device.
-    # Currently, this is a known limitation in continuous batching support
-    # in transformers-neuronx.
-    # TODO(liangfu): Support paged-attention in transformers-neuronx.
-    max_model_len=2048,
-    block_size=2048,
-    # The device can be automatically detected when AWS Neuron SDK is installed.
-    # The device argument can be either unspecified for automated detection,
-    # or explicitly assigned.
-    device="neuron",
-    quantization="neuron_quant",
-    override_neuron_config={
-        "cast_logits_dtype": "bfloat16",
-    },
-    tensor_parallel_size=2)
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+def main():
+    # Create an LLM.
+    llm = LLM(
+        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        max_num_seqs=8,
+        # The max_model_len and block_size arguments are required to be same as
+        # max sequence length when targeting neuron device.
+        # Currently, this is a known limitation in continuous batching support
+        # in transformers-neuronx.
+        # TODO(liangfu): Support paged-attention in transformers-neuronx.
+        max_model_len=2048,
+        block_size=2048,
+        # ruff: noqa: E501
+        # The device can be automatically detected when AWS Neuron SDK is installed.
+        # The device argument can be either unspecified for automated detection,
+        # or explicitly assigned.
+        device="neuron",
+        quantization="neuron_quant",
+        override_neuron_config={
+            "cast_logits_dtype": "bfloat16",
+        },
+        tensor_parallel_size=2)
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+
+if __name__ == "__main__":
+    main()

examples/offline_inference/prefix_caching.py

Lines changed: 59 additions & 52 deletions
@@ -31,55 +31,62 @@
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.0)
 
-# Create an LLM without prefix caching as a baseline.
-regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
-
-print("Results without `enable_prefix_caching`")
-
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = regular_llm.generate(generating_prompts, sampling_params)
-
-regular_generated_texts = []
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    regular_generated_texts.append(generated_text)
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-print("-" * 80)
-
-# Destroy the LLM object and free up the GPU memory.
-del regular_llm
-cleanup_dist_env_and_memory()
-
-# Create an LLM with prefix caching enabled.
-prefix_cached_llm = LLM(model="facebook/opt-125m",
-                        enable_prefix_caching=True,
-                        gpu_memory_utilization=0.4)
-
-# Warmup so that the shared prompt's KV cache is computed.
-prefix_cached_llm.generate(generating_prompts[0], sampling_params)
-
-# Generate with prefix caching.
-outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
-
-print("Results with `enable_prefix_caching`")
-
-cached_generated_texts = []
-# Print the outputs. You should see the same outputs as before.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    cached_generated_texts.append(generated_text)
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-print("-" * 80)
-
-# Compare the results and display the speedup
-generated_same = all([
-    regular_generated_texts[i] == cached_generated_texts[i]
-    for i in range(len(prompts))
-])
-print(f"Generated answers are the same: {generated_same}")
+
+def main():
+    # Create an LLM without prefix caching as a baseline.
+    regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
+
+    print("Results without `enable_prefix_caching`")
+
+    # ruff: noqa: E501
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = regular_llm.generate(generating_prompts, sampling_params)
+
+    regular_generated_texts = []
+    # Print the outputs.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        regular_generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+    print("-" * 50)
+
+    # Destroy the LLM object and free up the GPU memory.
+    del regular_llm
+    cleanup_dist_env_and_memory()
+
+    # Create an LLM with prefix caching enabled.
+    prefix_cached_llm = LLM(model="facebook/opt-125m",
+                            enable_prefix_caching=True,
+                            gpu_memory_utilization=0.4)
+
+    # Warmup so that the shared prompt's KV cache is computed.
+    prefix_cached_llm.generate(generating_prompts[0], sampling_params)
+
+    # Generate with prefix caching.
+    outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
+
+    print("Results with `enable_prefix_caching`")
+
+    cached_generated_texts = []
+    # Print the outputs. You should see the same outputs as before.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        cached_generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+    print("-" * 50)
+
+    # Compare the results and display the speedup
+    generated_same = all([
+        regular_generated_texts[i] == cached_generated_texts[i]
+        for i in range(len(prompts))
+    ])
+    print(f"Generated answers are the same: {generated_same}")
+
+
+if __name__ == "__main__":
+    main()

examples/offline_inference/reproduciblity.py

Lines changed: 14 additions & 7 deletions
@@ -19,8 +19,6 @@
 # because it is almost impossible to make the scheduling deterministic in the
 # online serving setting.
 
-llm = LLM(model="facebook/opt-125m", seed=SEED)
-
 prompts = [
     "Hello, my name is",
     "The president of the United States is",
@@ -29,8 +27,17 @@
 ]
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+def main():
+    llm = LLM(model="facebook/opt-125m", seed=SEED)
+    outputs = llm.generate(prompts, sampling_params)
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+
+if __name__ == "__main__":
+    main()

examples/offline_inference/rlhf.py

Lines changed: 6 additions & 2 deletions
@@ -85,11 +85,13 @@ def __init__(self, *args, **kwargs):
 
 outputs = ray.get(llm.generate.remote(prompts, sampling_params))
 
+print("-" * 50)
 for output in outputs:
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, "
+    print(f"Prompt: {prompt!r}\n"
          f"Generated text: {generated_text!r}")
+    print("-" * 50)
 
 # set up the communication between the training process
 # and the inference engine.
@@ -120,8 +122,10 @@ def __init__(self, *args, **kwargs):
 # use the updated model to generate texts, they will be nonsense
 # because the weights are all zeros.
 outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
+print("-" * 50)
 for output in outputs_updated:
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, "
+    print(f"Prompt: {prompt!r}\n"
          f"Generated text: {generated_text!r}")
+    print("-" * 50)

examples/offline_inference/simple_profiling.py

Lines changed: 3 additions & 1 deletion
@@ -32,10 +32,12 @@
 llm.stop_profile()
 
 # Print the outputs.
+print("-" * 50)
 for output in outputs:
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+    print("-" * 50)
 
 # Add a buffer to wait for profiler in the background process
 # (in case MP is on) to finish writing profiling output.

examples/offline_inference/torchrun_example.py

Lines changed: 3 additions & 1 deletion
@@ -36,11 +36,13 @@
 outputs = llm.generate(prompts, sampling_params)
 
 # all ranks will have the same outputs
+print("-" * 50)
 for output in outputs:
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, "
+    print(f"Prompt: {prompt!r}\n"
          f"Generated text: {generated_text!r}")
+    print("-" * 50)
 """
 Further tips:
