# SPDX-License-Identifier: Apache-2.0
"""
Example of deploying DeepSeek R1 or V3 with Ray Serve LLM.
See the Ray Serve LLM documentation at:
https://docs.ray.io/en/latest/serve/llm/serving-llms.html

Run `python3 ray_serve_deepseek.py` to deploy the model.
"""

from ray import serve
from ray.serve.llm import LLMConfig, LLMRouter, LLMServer

llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="deepseek",
        # Change this to the local path where the model was downloaded.
        model_source="/path/to/the/model",
    ),
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,
            max_replicas=1,
        ),
    ),
    # Change this to the accelerator type of your nodes.
    accelerator_type="H100",
    runtime_env=dict(env_vars=dict(VLLM_USE_V1="1")),
    # Customize engine arguments as needed (e.g. vLLM engine kwargs).
    # tensor_parallel_size=8 with pipeline_parallel_size=2 shards the model
    # across 16 GPUs in total, e.g. two nodes with 8 H100s each.
    engine_kwargs=dict(
        tensor_parallel_size=8,
        pipeline_parallel_size=2,
        gpu_memory_utilization=0.92,
        dtype="auto",
        max_num_seqs=40,
        max_model_len=16384,
        enable_chunked_prefill=True,
        enable_prefix_caching=True,
        trust_remote_code=True,
    ),
)

# Deploy the application: wrap the LLM engine in a Serve deployment and
# route OpenAI-compatible requests to it through the LLM router.
deployment = LLMServer.as_deployment(
    llm_config.get_serve_options(name_prefix="vLLM:")).bind(llm_config)
llm_app = LLMRouter.as_deployment().bind([deployment])
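
# Note: recent Ray releases also offer a one-step builder for the same
# OpenAI-compatible app. This is a hedged alternative sketch, assuming your
# Ray version exposes `build_openai_app` in `ray.serve.llm`:
#
#   from ray.serve.llm import build_openai_app
#
#   llm_app = build_openai_app({"llm_configs": [llm_config]})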
serve.run(llm_app)
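
# Once the deployment is ready, the router serves an OpenAI-compatible API.
# A minimal query sketch, assuming the default Ray Serve HTTP address
# (http://localhost:8000) and the `openai` client package are available:
#
#   from openai import OpenAI
#
#   client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
#   response = client.chat.completions.create(
#       model="deepseek",
#       messages=[{"role": "user", "content": "Hello!"}],
#   )
#   print(response.choices[0].message.content)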