Merge pull request #263 from QwertyJack/main
Add vLLM's health and metrics endpoints
jeffreymeetkai committed Sep 4, 2024
2 parents f76fd97 + 26643ed commit 2ade257
Showing 1 changed file with 13 additions and 0 deletions.
server_vllm.py
@@ -24,9 +24,11 @@
 
 import fastapi
 import uvicorn
+import vllm.entrypoints.openai.api_server as vllm_api_server
 from fastapi import Request
 from fastapi.middleware.cors import CORSMiddleware
 from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.entrypoints.openai.api_server import health, mount_metrics
 from vllm.entrypoints.openai.protocol import ModelCard, ModelList, ModelPermission
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -46,6 +48,12 @@
 app = fastapi.FastAPI()
 
 
+@app.get("/health")
+async def _health():
+    """Health check."""
+    return await health()
+
+
 @app.get("/v1/models")
 async def show_available_models():
     """Show available models."""
@@ -128,6 +136,8 @@ async def create_chat_completion(raw_request: Request):
     else:
         from vllm.engine.async_llm_engine import AsyncLLMEngine
 
+    mount_metrics(app)
+
     app.add_middleware(
         CORSMiddleware,
         allow_origins=args.allowed_origins,
@@ -154,6 +164,9 @@ async def create_chat_completion(raw_request: Request):
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     engine_model_config = asyncio.run(engine.get_model_config())
 
+    # Adapt to vLLM's health endpoint
+    vllm_api_server.async_engine_client = engine
+
     uvicorn.run(
         app,
         host=args.host,
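Note on the wiring: the `health()` imported from `vllm.entrypoints.openai.api_server` takes no engine argument; in vLLM of this vintage it reads a module-level `async_engine_client` and awaits its `check_health()`, which is why the final hunk assigns the local `engine` to `vllm_api_server.async_engine_client` before `uvicorn.run`. A paraphrased sketch of that handler (an approximation, not verbatim vLLM source):

    from fastapi import Response

    async def health() -> Response:
        """Health check."""
        # async_engine_client is a module-level global inside
        # vllm.entrypoints.openai.api_server; check_health() raises
        # if the engine has died, so a 200 means the engine is alive.
        await async_engine_client.check_health()
        return Response(status_code=200)

Without that assignment, the global would be unset in this out-of-tree server and `/health` would fail.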

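A quick smoke test for the two new endpoints once the server is running, assuming it listens on localhost:8000 (the host, port, and `requests` dependency here are illustrative, not part of this commit):

    import requests

    BASE = "http://localhost:8000"  # match the host/port the server was started with

    # /health comes from the wrapper added above: 200 once the engine is healthy.
    print(requests.get(f"{BASE}/health").status_code)

    # /metrics is mounted by vLLM's mount_metrics(app) and serves
    # Prometheus text-format metrics.
    print(requests.get(f"{BASE}/metrics").text.splitlines()[:5])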