[Misc] format and refactor some examples #16252

Merged
merged 2 commits on Apr 8, 2025
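The refactor applies one pattern across the examples below: module-level inference code moves into a main() function guarded by if __name__ == "__main__", and printed outputs are framed by "-" * 50 separator lines. A minimal sketch of the resulting layout (the model name and prompts here are placeholders for illustration, not taken from any single example):

from vllm import LLM, SamplingParams

# Placeholder prompts and model, purely to illustrate the refactored layout.
prompts = [
    "Hello, my name is",
    "The capital of France is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


def main():
    llm = LLM(model="facebook/opt-125m")
    outputs = llm.generate(prompts, sampling_params)

    # Separator lines frame each printed output, as in the refactored examples.
    print("-" * 50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 50)


if __name__ == "__main__":
    main()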
5 changes: 4 additions & 1 deletion examples/offline_inference/mistral-small.py
@@ -90,8 +90,9 @@ def run_simple_demo(args: argparse.Namespace):
},
]
outputs = llm.chat(messages, sampling_params=sampling_params)

print("-" * 50)
print(outputs[0].outputs[0].text)
print("-" * 50)


def run_advanced_demo(args: argparse.Namespace):
@@ -162,7 +163,9 @@ def run_advanced_demo(args: argparse.Namespace):
]

outputs = llm.chat(messages=messages, sampling_params=sampling_params)
print("-" * 50)
print(outputs[0].outputs[0].text)
print("-" * 50)


def main():
2 changes: 2 additions & 0 deletions examples/offline_inference/multilora_inference.py
@@ -61,6 +61,7 @@ def process_requests(engine: LLMEngine,
"""Continuously process a list of prompts and handle the outputs."""
request_id = 0

print("-" * 50)
while test_prompts or engine.has_unfinished_requests():
if test_prompts:
prompt, sampling_params, lora_request = test_prompts.pop(0)
@@ -75,6 +76,7 @@ def process_requests(engine: LLMEngine,
for request_output in request_outputs:
if request_output.finished:
print(request_output)
print("-" * 50)


def initialize_engine() -> LLMEngine:
57 changes: 33 additions & 24 deletions examples/offline_inference/neuron.py
@@ -12,27 +12,36 @@
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(
model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
max_num_seqs=8,
# The max_model_len and block_size arguments are required to be same as
# max sequence length when targeting neuron device.
# Currently, this is a known limitation in continuous batching support
# in transformers-neuronx.
# TODO(liangfu): Support paged-attention in transformers-neuronx.
max_model_len=1024,
block_size=1024,
# The device can be automatically detected when AWS Neuron SDK is installed.
# The device argument can be either unspecified for automated detection,
# or explicitly assigned.
device="neuron",
tensor_parallel_size=2)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

def main():
# Create an LLM.
llm = LLM(
model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
max_num_seqs=8,
# The max_model_len and block_size arguments are required to be same as
# max sequence length when targeting neuron device.
# Currently, this is a known limitation in continuous batching support
# in transformers-neuronx.
# TODO(liangfu): Support paged-attention in transformers-neuronx.
max_model_len=1024,
block_size=1024,
# ruff: noqa: E501
# The device can be automatically detected when AWS Neuron SDK is installed.
# The device argument can be either unspecified for automated detection,
# or explicitly assigned.
device="neuron",
tensor_parallel_size=2)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)


if __name__ == "__main__":
main()
65 changes: 37 additions & 28 deletions examples/offline_inference/neuron_int8_quantization.py
@@ -22,31 +22,40 @@
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(
model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
max_num_seqs=8,
# The max_model_len and block_size arguments are required to be same as
# max sequence length when targeting neuron device.
# Currently, this is a known limitation in continuous batching support
# in transformers-neuronx.
# TODO(liangfu): Support paged-attention in transformers-neuronx.
max_model_len=2048,
block_size=2048,
# The device can be automatically detected when AWS Neuron SDK is installed.
# The device argument can be either unspecified for automated detection,
# or explicitly assigned.
device="neuron",
quantization="neuron_quant",
override_neuron_config={
"cast_logits_dtype": "bfloat16",
},
tensor_parallel_size=2)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

def main():
# Create an LLM.
llm = LLM(
model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
max_num_seqs=8,
# The max_model_len and block_size arguments are required to be same as
# max sequence length when targeting neuron device.
# Currently, this is a known limitation in continuous batching support
# in transformers-neuronx.
# TODO(liangfu): Support paged-attention in transformers-neuronx.
max_model_len=2048,
block_size=2048,
# ruff: noqa: E501
# The device can be automatically detected when AWS Neuron SDK is installed.
# The device argument can be either unspecified for automated detection,
# or explicitly assigned.
device="neuron",
quantization="neuron_quant",
override_neuron_config={
"cast_logits_dtype": "bfloat16",
},
tensor_parallel_size=2)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)


if __name__ == "__main__":
main()
111 changes: 59 additions & 52 deletions examples/offline_inference/prefix_caching.py
@@ -31,55 +31,62 @@
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0)

# Create an LLM without prefix caching as a baseline.
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)

print("Results without `enable_prefix_caching`")

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = regular_llm.generate(generating_prompts, sampling_params)

regular_generated_texts = []
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
regular_generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

print("-" * 80)

# Destroy the LLM object and free up the GPU memory.
del regular_llm
cleanup_dist_env_and_memory()

# Create an LLM with prefix caching enabled.
prefix_cached_llm = LLM(model="facebook/opt-125m",
enable_prefix_caching=True,
gpu_memory_utilization=0.4)

# Warmup so that the shared prompt's KV cache is computed.
prefix_cached_llm.generate(generating_prompts[0], sampling_params)

# Generate with prefix caching.
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)

print("Results with `enable_prefix_caching`")

cached_generated_texts = []
# Print the outputs. You should see the same outputs as before.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
cached_generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

print("-" * 80)

# Compare the results and display the speedup
generated_same = all([
regular_generated_texts[i] == cached_generated_texts[i]
for i in range(len(prompts))
])
print(f"Generated answers are the same: {generated_same}")

def main():
# Create an LLM without prefix caching as a baseline.
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)

print("Results without `enable_prefix_caching`")

# ruff: noqa: E501
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = regular_llm.generate(generating_prompts, sampling_params)

regular_generated_texts = []
# Print the outputs.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
regular_generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)

# Destroy the LLM object and free up the GPU memory.
del regular_llm
cleanup_dist_env_and_memory()

# Create an LLM with prefix caching enabled.
prefix_cached_llm = LLM(model="facebook/opt-125m",
enable_prefix_caching=True,
gpu_memory_utilization=0.4)

# Warmup so that the shared prompt's KV cache is computed.
prefix_cached_llm.generate(generating_prompts[0], sampling_params)

# Generate with prefix caching.
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)

print("Results with `enable_prefix_caching`")

cached_generated_texts = []
# Print the outputs. You should see the same outputs as before.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
cached_generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)

# Compare the results and display the speedup
generated_same = all([
regular_generated_texts[i] == cached_generated_texts[i]
for i in range(len(prompts))
])
print(f"Generated answers are the same: {generated_same}")


if __name__ == "__main__":
main()
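As an aside, the comment above mentions displaying the speedup, but the refactored script only verifies that the cached and uncached runs produce the same text. If one also wanted to report timings, a rough sketch (the timed_generate helper is hypothetical, not part of this PR) could wrap each generate call:

import time

def timed_generate(llm, prompts, sampling_params, label):
    # Hypothetical helper: time one generate call and report elapsed wall-clock time.
    start = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params)
    print(f"{label}: {time.perf_counter() - start:.2f}s")
    return outputs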
21 changes: 14 additions & 7 deletions examples/offline_inference/reproduciblity.py
@@ -19,8 +19,6 @@
# because it is almost impossible to make the scheduling deterministic in the
# online serving setting.

llm = LLM(model="facebook/opt-125m", seed=SEED)

prompts = [
"Hello, my name is",
"The president of the United States is",
@@ -29,8 +27,17 @@
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

def main():
llm = LLM(model="facebook/opt-125m", seed=SEED)
outputs = llm.generate(prompts, sampling_params)
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)


if __name__ == "__main__":
main()
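For context on what this example exercises: with a fixed seed, repeated offline runs over the same prompts should generate identical text. A rough sketch of such a check, assuming the same LLM import, SEED, prompts, and sampling_params as above (illustrative only; in practice the first engine should be freed before building the second):

def check_reproducibility():
    # Two engines built with the same seed should produce identical generations
    # for the same prompts in offline inference.
    texts = []
    for _ in range(2):
        llm = LLM(model="facebook/opt-125m", seed=SEED)
        outputs = llm.generate(prompts, sampling_params)
        texts.append([o.outputs[0].text for o in outputs])
        # Free the engine before building the next one (in a real script, use
        # cleanup_dist_env_and_memory() as the prefix-caching example does).
        del llm
    print(f"Runs match: {texts[0] == texts[1]}")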
8 changes: 6 additions & 2 deletions examples/offline_inference/rlhf.py
@@ -85,11 +85,13 @@ def __init__(self, *args, **kwargs):

outputs = ray.get(llm.generate.remote(prompts, sampling_params))

print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, "
print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}")
print("-" * 50)

# set up the communication between the training process
# and the inference engine.
@@ -120,8 +122,10 @@ def __init__(self, *args, **kwargs):
# use the updated model to generate texts, they will be nonsense
# because the weights are all zeros.
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
print("-" * 50)
for output in outputs_updated:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, "
print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}")
print("-" * 50)
4 changes: 3 additions & 1 deletion examples/offline_inference/simple_profiling.py
@@ -32,10 +32,12 @@
llm.stop_profile()

# Print the outputs.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)

# Add a buffer to wait for profiler in the background process
# (in case MP is on) to finish writing profiling output.
4 changes: 3 additions & 1 deletion examples/offline_inference/torchrun_example.py
@@ -36,11 +36,13 @@
outputs = llm.generate(prompts, sampling_params)

# all ranks will have the same outputs
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, "
print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}")
print("-" * 50)
"""
Further tips:
