Add `vllm serve` to wrap `vllm.entrypoints.openai.api_server` (#4167)
Documentation (OpenAI-compatible server guide):

@@ -4,7 +4,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://plat

 You can start the server using Python, or using [Docker](deploying_with_docker.rst):
 ```bash
-python -m vllm.entrypoints.openai.api_server --model mistralai/Mistral-7B-Instruct-v0.2 --dtype auto --api-key token-abc123
+vllm serve mistralai/Mistral-7B-Instruct-v0.2 --dtype auto --api-key token-abc123
 ```

 To call the server, you can use the official OpenAI Python client library, or any other HTTP client.
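As an aside on the doc text above: a minimal sketch of calling the server with the official OpenAI Python client, assuming the `vllm serve` command shown above is running locally on the default port (8000) and the `openai` v1+ package is installed:

```python
from openai import OpenAI

# Point the client at the local vLLM server instead of api.openai.com.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",  # must match the --api-key passed to `vllm serve`
)

completion = client.chat.completions.create(
    model="mistralai/Mistral-7B-Instruct-v0.2",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(completion.choices[0].message.content)
```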
@@ -95,8 +95,7 @@ template, or the template in string form. Without a chat template, the server wi
 and all chat requests will error.

 ```bash
-python -m vllm.entrypoints.openai.api_server \
-  --model ... \
+vllm serve ... \
   --chat-template ./path-to-chat-template.jinja
 ```
Review comment: Based on #4709, the …
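For context on the `--chat-template` flag changed above: a chat template is a Jinja file that turns a list of role/content messages into a single prompt string. The toy example below is purely illustrative (it is not any model's real template) and assumes the `jinja2` package is available:

```python
from jinja2 import Template

# A deliberately simple, made-up chat template: one "role: content" line per
# message, followed by a cue for the assistant to respond.
TOY_TEMPLATE = (
    "{% for message in messages %}"
    "{{ message['role'] }}: {{ message['content'] }}\n"
    "{% endfor %}"
    "assistant:"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# Render the template the way a server-side templating step conceptually would.
print(Template(TOY_TEMPLATE).render(messages=messages))
```

Saving such a template to `./path-to-chat-template.jinja` is what the `--chat-template` flag above expects.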
vllm/entrypoints/openai/api_server.py:

@@ -7,7 +7,7 @@
 import fastapi
 import uvicorn
-from fastapi import Request
+from fastapi import APIRouter, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -26,6 +26,8 @@

 TIMEOUT_KEEP_ALIVE = 5  # seconds

+engine: AsyncLLMEngine = None

Review comment: shouldn't this be optional?

+engine_args: AsyncEngineArgs = None
 openai_serving_chat: OpenAIServingChat = None
 openai_serving_completion: OpenAIServingCompletion = None
 logger = init_logger(__name__)
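On the review question above: the annotation being suggested would presumably look like the sketch below (an illustration only, assuming these module-level globals really do stay `None` until `run_server()` populates them):

```python
from typing import Optional

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# Optional[...] makes explicit that these globals start out as None and are
# only assigned once the server has actually built the engine.
engine: Optional[AsyncLLMEngine] = None
engine_args: Optional[AsyncEngineArgs] = None
```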
@@ -45,45 +47,33 @@ async def _force_log():
     yield


-app = fastapi.FastAPI(lifespan=lifespan)
-
-
-def parse_args():
-    parser = make_arg_parser()
-    return parser.parse_args()
+router = APIRouter()


 # Add prometheus asgi middleware to route /metrics requests
 metrics_app = make_asgi_app()
-app.mount("/metrics", metrics_app)
+router.mount("/metrics", metrics_app)


-@app.exception_handler(RequestValidationError)
-async def validation_exception_handler(_, exc):
-    err = openai_serving_chat.create_error_response(message=str(exc))
-    return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST)
-
-
-@app.get("/health")
+@router.get("/health")
 async def health() -> Response:
     """Health check."""
     await openai_serving_chat.engine.check_health()
     return Response(status_code=200)


-@app.get("/v1/models")
+@router.get("/v1/models")
 async def show_available_models():
     models = await openai_serving_chat.show_available_models()
     return JSONResponse(content=models.model_dump())


-@app.get("/version")
+@router.get("/version")
 async def show_version():
     ver = {"version": vllm.__version__}
     return JSONResponse(content=ver)


-@app.post("/v1/chat/completions")
+@router.post("/v1/chat/completions")
 async def create_chat_completion(request: ChatCompletionRequest,
                                  raw_request: Request):
     generator = await openai_serving_chat.create_chat_completion(
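The refactor in this hunk can be hard to follow line by line: endpoints move from the module-level `app` onto a module-level `APIRouter`, which a later `build_app()` attaches to a freshly created `FastAPI` instance. A stripped-down, generic sketch of that pattern (the names `build_app` and `/ping` here are illustrative, not vLLM's actual code):

```python
import fastapi
import uvicorn
from fastapi import APIRouter
from fastapi.responses import Response

# Routes are declared once, on a router that is independent of any app instance.
router = APIRouter()


@router.get("/ping")
async def ping() -> Response:
    return Response(status_code=200)


def build_app() -> fastapi.FastAPI:
    # The app is created on demand, so configuration (CORS, middleware, ...)
    # can depend on runtime arguments instead of import-time globals.
    app = fastapi.FastAPI()
    app.include_router(router)  # registers every @router.* endpoint above
    return app


if __name__ == "__main__":
    uvicorn.run(build_app(), host="0.0.0.0", port=8000)
```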
@@ -98,7 +88,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
     return JSONResponse(content=generator.model_dump())


-@app.post("/v1/completions")
+@router.post("/v1/completions")
 async def create_completion(request: CompletionRequest, raw_request: Request):
     generator = await openai_serving_completion.create_completion(
         request, raw_request)
@@ -112,8 +102,10 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     return JSONResponse(content=generator.model_dump())


-if __name__ == "__main__":
-    args = parse_args()
+def build_app(args):
+    app = fastapi.FastAPI(lifespan=lifespan)
+    app.include_router(router)
+    app.root_path = args.root_path

     app.add_middleware(
         CORSMiddleware,
@@ -123,6 +115,12 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
         allow_headers=args.allowed_headers,
     )

+    @app.exception_handler(RequestValidationError)
+    async def validation_exception_handler(_, exc):
+        err = openai_serving_chat.create_error_response(message=str(exc))
+        return JSONResponse(err.model_dump(),
+                            status_code=HTTPStatus.BAD_REQUEST)
+
     if token := os.environ.get("VLLM_API_KEY") or args.api_key:

         @app.middleware("http")
@@ -146,13 +144,21 @@ async def authentication(request: Request, call_next):
                 raise ValueError(f"Invalid middleware {middleware}. "
                                  f"Must be a function or a class.")

+    return app
+
+
+def run_server(args):
+    app = build_app(args)
+
     logger.info(f"vLLM API server version {vllm.__version__}")
     logger.info(f"args: {args}")

     if args.served_model_name is not None:
         served_model_names = args.served_model_name
     else:
         served_model_names = [args.model]

+    global engine_args, engine, openai_serving_chat, openai_serving_completion
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngine.from_engine_args(
         engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
@@ -163,7 +169,6 @@ async def authentication(request: Request, call_next):
     openai_serving_completion = OpenAIServingCompletion(
         engine, served_model_names, args.lora_modules)

-    app.root_path = args.root_path
     uvicorn.run(app,
                 host=args.host,
                 port=args.port,
@@ -173,3 +178,11 @@
                 ssl_certfile=args.ssl_certfile,
                 ssl_ca_certs=args.ssl_ca_certs,
                 ssl_cert_reqs=args.ssl_cert_reqs)
+
+
+if __name__ == "__main__":
+    # NOTE(simon):
+    # This section should be in sync with vllm/scripts.py for CLI entrypoints.

Review comment: any way to add a simple regression test for this?

+    parser = make_arg_parser()
+    args = parser.parse_args()
+    run_server(args)
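On the reviewer's question about a regression test: one possible shape for it is sketched below. It only checks that both entrypoints accept `--help` and exit cleanly, so no model is loaded; the test names, their placement, and the assumption that the `vllm` console script is installed in the test environment are all hypothetical, not part of this PR:

```python
import subprocess
import sys


def test_openai_api_server_help():
    # The module entrypoint should still parse arguments on its own.
    result = subprocess.run(
        [sys.executable, "-m", "vllm.entrypoints.openai.api_server", "--help"],
        capture_output=True,
        text=True,
    )
    assert result.returncode == 0


def test_vllm_serve_help():
    # The new CLI wrapper should expose the same arguments under `vllm serve`.
    result = subprocess.run(["vllm", "serve", "--help"],
                            capture_output=True,
                            text=True)
    assert result.returncode == 0
```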
vllm/scripts.py (new file):

@@ -0,0 +1,29 @@
+# The CLI entrypoint to vLLM.
+import argparse
+
+from vllm.entrypoints.openai.api_server import run_server
+from vllm.entrypoints.openai.cli_args import make_arg_parser
+
+
+def main():
+    parser = argparse.ArgumentParser(description="vLLM CLI")
+    subparsers = parser.add_subparsers()
+
+    serve_parser = subparsers.add_parser(
+        "serve",
+        help="Start the vLLM OpenAI Compatible API server",
+        usage="vllm serve <model_tag> [options]")
+    make_arg_parser(serve_parser)
+    # Override the `--model` optional argument, make it positional.
+    serve_parser.add_argument("model", type=str, help="The model tag to serve")

Review comment: what happens with `vllm serve --model`?

+    serve_parser.set_defaults(func=run_server)
+
+    args = parser.parse_args()
+    if hasattr(args, "func"):

Review comment: this part of the code is confusing. Add a comment to explain what it does?

+        args.func(args)
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    main()
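On the review question about `if hasattr(args, "func")`: `set_defaults(func=...)` only takes effect when the `serve` subparser is actually selected, so a bare `vllm` invocation produces an `args` namespace without a `func` attribute and falls through to the help text. A self-contained toy sketch of the same argparse idiom, with the explanatory comments the reviewer asked for (the `greet` subcommand is hypothetical):

```python
import argparse


def greet(args):
    print(f"Hello, {args.name}!")


def main():
    parser = argparse.ArgumentParser(prog="demo")
    subparsers = parser.add_subparsers()

    greet_parser = subparsers.add_parser("greet")
    greet_parser.add_argument("name")
    # Attach the handler for this subcommand; it only ends up on `args`
    # when the user actually typed `demo greet ...`.
    greet_parser.set_defaults(func=greet)

    args = parser.parse_args()
    if hasattr(args, "func"):
        # A subcommand was given, so dispatch to its handler.
        args.func(args)
    else:
        # No subcommand: nothing set `func`, fall back to the help text.
        parser.print_help()


if __name__ == "__main__":
    main()
```

Separately, for `vllm serve` to exist as a shell command at all, the package presumably also registers a console-script entry point pointing at `vllm.scripts:main` (e.g. in setup.py); that change is not visible in this excerpt.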