init
This commit is contained in:
0
entrypoints/cli/__init__.py
Normal file
0
entrypoints/cli/__init__.py
Normal file
0
entrypoints/cli/benchmark/__init__.py
Normal file
0
entrypoints/cli/benchmark/__init__.py
Normal file
39
entrypoints/cli/benchmark/base.py
Normal file
39
entrypoints/cli/benchmark/base.py
Normal file
@@ -0,0 +1,39 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
class BenchmarkSubcommandBase(CLISubcommand):
    """Common scaffolding shared by the `vllm bench <name>` subcommands.

    Subclasses set ``self.name`` and override ``help``, ``add_cli_args``
    and ``cmd``; ``subparser_init`` then builds the parser from those.
    """

    @property
    def help(self) -> str:
        """One-line summary, used as both help and description text."""
        raise NotImplementedError

    def add_cli_args(self, parser: argparse.ArgumentParser) -> None:
        """Register this benchmark's command-line options on ``parser``."""
        raise NotImplementedError

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Execute the benchmark.

        Args:
            args: Parsed command-line namespace for this subcommand.
        """
        raise NotImplementedError

    def subparser_init(
            self,
            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        """Create the parser for this benchmark and add its options.

        Args:
            subparsers: Sub-parser collection the new parser is added to.

        Returns:
            The new parser, after ``add_cli_args`` has been applied to it.
        """
        parser_options = {
            "help": self.help,
            "description": self.help,
            "usage": f"vllm bench {self.name} [options]",
        }
        bench_parser = subparsers.add_parser(self.name, **parser_options)
        self.add_cli_args(bench_parser)
        return bench_parser
|
||||
30
entrypoints/cli/benchmark/latency.py
Normal file
30
entrypoints/cli/benchmark/latency.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.benchmarks.latency import add_cli_args, main
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
|
||||
|
||||
class BenchmarkLatencySubcommand(BenchmarkSubcommandBase):
    """Handler for `vllm bench latency`.

    Argument registration and execution are delegated to the
    ``add_cli_args`` and ``main`` functions imported from
    ``vllm.benchmarks.latency``.
    """

    def __init__(self) -> None:
        # The name doubles as the sub-parser keyword on the command line.
        self.name = "latency"
        super().__init__()

    @property
    def help(self) -> str:
        """Short description shown in the `vllm bench` help output."""
        summary = "Benchmark the latency of a single batch of requests."
        return summary

    def add_cli_args(self, parser: argparse.ArgumentParser) -> None:
        """Add the latency benchmark options to ``parser``."""
        # Resolves to the module-level import, not to this method.
        add_cli_args(parser)

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run the latency benchmark with the parsed arguments."""
        main(args)
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
    """Return the subcommand instances contributed by this module."""
    latency_cmd = BenchmarkLatencySubcommand()
    return [latency_cmd]
|
||||
54
entrypoints/cli/benchmark/main.py
Normal file
54
entrypoints/cli/benchmark/main.py
Normal file
@@ -0,0 +1,54 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
import vllm.entrypoints.cli.benchmark.latency
|
||||
import vllm.entrypoints.cli.benchmark.serve
|
||||
import vllm.entrypoints.cli.benchmark.throughput
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
# Modules that contribute `vllm bench` subcommands. Each one is asked for
# its subcommand instances via `cmd_init()` when the bench parser is built.
BENCHMARK_CMD_MODULES = [
    vllm.entrypoints.cli.benchmark.latency,
    vllm.entrypoints.cli.benchmark.serve,
    vllm.entrypoints.cli.benchmark.throughput,
]
|
||||
|
||||
|
||||
class BenchmarkSubcommand(CLISubcommand):
    """ The `bench` subcommand for the vLLM CLI.

    Acts as a container: the actual benchmarks are nested subcommands
    collected from ``BENCHMARK_CMD_MODULES`` and selected through the
    ``bench_type`` argument.
    """

    def __init__(self):
        self.name = "bench"
        # Populated by subparser_init(); created here so that validate()
        # is safe to call even if the parser has not been built yet.
        self.cmds: dict[str, CLISubcommand] = {}
        super().__init__()

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Forward to the handler chosen by the nested benchmark parser."""
        args.dispatch_function(args)

    def validate(self, args: argparse.Namespace) -> None:
        """Delegate validation to the selected benchmark, if it is known.

        Args:
            args: Parsed namespace; ``args.bench_type`` names the benchmark.
        """
        selected = self.cmds.get(args.bench_type)
        if selected is not None:
            selected.validate(args)

    def subparser_init(
            self,
            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        """Create the `bench` parser and register every benchmark under it.

        Args:
            subparsers: Top-level sub-parser collection of the CLI.

        Returns:
            The `bench` parser, with one nested sub-parser per benchmark.
        """
        bench_parser = subparsers.add_parser(
            "bench",
            help="vLLM bench subcommand.",
            description="vLLM bench subcommand.",
            usage="vllm bench <bench_type> [options]")
        bench_subparsers = bench_parser.add_subparsers(required=True,
                                                       dest="bench_type")
        # Start from a clean registry so a repeated call never keeps
        # entries from an earlier parser.
        self.cmds = {}
        for cmd_module in BENCHMARK_CMD_MODULES:
            for cmd in cmd_module.cmd_init():
                # Each nested parser records its own handler, which
                # BenchmarkSubcommand.cmd() later invokes.
                cmd.subparser_init(bench_subparsers).set_defaults(
                    dispatch_function=cmd.cmd)
                self.cmds[cmd.name] = cmd
        return bench_parser
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
    """Return the subcommand instances contributed by this module."""
    bench_cmd = BenchmarkSubcommand()
    return [bench_cmd]
|
||||
30
entrypoints/cli/benchmark/serve.py
Normal file
30
entrypoints/cli/benchmark/serve.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.benchmarks.serve import add_cli_args, main
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
|
||||
|
||||
class BenchmarkServingSubcommand(BenchmarkSubcommandBase):
    """Handler for `vllm bench serve`.

    Argument registration and execution are delegated to the
    ``add_cli_args`` and ``main`` functions imported from
    ``vllm.benchmarks.serve``.
    """

    def __init__(self) -> None:
        # The name doubles as the sub-parser keyword on the command line.
        self.name = "serve"
        super().__init__()

    @property
    def help(self) -> str:
        """Short description shown in the `vllm bench` help output."""
        summary = "Benchmark the online serving throughput."
        return summary

    def add_cli_args(self, parser: argparse.ArgumentParser) -> None:
        """Add the serving benchmark options to ``parser``."""
        # Resolves to the module-level import, not to this method.
        add_cli_args(parser)

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run the serving benchmark with the parsed arguments."""
        main(args)
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
    """Return the subcommand instances contributed by this module."""
    serving_cmd = BenchmarkServingSubcommand()
    return [serving_cmd]
|
||||
30
entrypoints/cli/benchmark/throughput.py
Normal file
30
entrypoints/cli/benchmark/throughput.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.benchmarks.throughput import add_cli_args, main
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
|
||||
|
||||
class BenchmarkThroughputSubcommand(BenchmarkSubcommandBase):
    """Handler for `vllm bench throughput`.

    Argument registration and execution are delegated to the
    ``add_cli_args`` and ``main`` functions imported from
    ``vllm.benchmarks.throughput``.
    """

    def __init__(self) -> None:
        # The name doubles as the sub-parser keyword on the command line.
        self.name = "throughput"
        super().__init__()

    @property
    def help(self) -> str:
        """Short description shown in the `vllm bench` help output."""
        summary = "Benchmark offline inference throughput."
        return summary

    def add_cli_args(self, parser: argparse.ArgumentParser) -> None:
        """Add the throughput benchmark options to ``parser``."""
        # Resolves to the module-level import, not to this method.
        add_cli_args(parser)

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run the throughput benchmark with the parsed arguments."""
        main(args)
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
    """Return the subcommand instances contributed by this module."""
    throughput_cmd = BenchmarkThroughputSubcommand()
    return [throughput_cmd]
|
||||
35
entrypoints/cli/collect_env.py
Normal file
35
entrypoints/cli/collect_env.py
Normal file
@@ -0,0 +1,35 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
|
||||
from vllm.collect_env import main as collect_env_main
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
class CollectEnvSubcommand(CLISubcommand):
    """Handler for `vllm collect-env`, which takes no options of its own."""

    def __init__(self) -> None:
        self.name = "collect-env"
        super().__init__()

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Collect information about the environment.

        ``args`` is accepted for interface compatibility but is not used.
        """
        collect_env_main()

    def subparser_init(
            self,
            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        """Register the `collect-env` parser and return it."""
        summary = "Start collecting environment information."
        return subparsers.add_parser("collect-env",
                                     help=summary,
                                     description=summary,
                                     usage="vllm collect-env")
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
    """Return the subcommand instances contributed by this module."""
    collect_env_cmd = CollectEnvSubcommand()
    return [collect_env_cmd]
|
||||
65
entrypoints/cli/main.py
Normal file
65
entrypoints/cli/main.py
Normal file
@@ -0,0 +1,65 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# The CLI entrypoint to vLLM.
|
||||
import signal
|
||||
import sys
|
||||
|
||||
import vllm.entrypoints.cli.benchmark.main
|
||||
import vllm.entrypoints.cli.collect_env
|
||||
import vllm.entrypoints.cli.openai
|
||||
import vllm.entrypoints.cli.run_batch
|
||||
import vllm.entrypoints.cli.serve
|
||||
import vllm.version
|
||||
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
# Modules that contribute top-level `vllm` subcommands. main() calls each
# module's `cmd_init()` in this order to obtain its subcommand instances.
CMD_MODULES = [
    vllm.entrypoints.cli.openai,
    vllm.entrypoints.cli.serve,
    vllm.entrypoints.cli.benchmark.main,
    vllm.entrypoints.cli.collect_env,
    vllm.entrypoints.cli.run_batch,
]
|
||||
|
||||
|
||||
def register_signal_handlers():
    """Install handlers that exit with status 0 on SIGINT and SIGTSTP."""

    def _exit_cleanly(signum, stack_frame):
        # Same effect as sys.exit(0): unwind through SystemExit with code 0.
        raise SystemExit(0)

    signal.signal(signal.SIGINT, _exit_cleanly)
    signal.signal(signal.SIGTSTP, _exit_cleanly)
|
||||
|
||||
|
||||
def main():
    """Build the top-level `vllm` parser, then validate and dispatch.

    Every module in ``CMD_MODULES`` contributes subcommands; each one
    registers its own sub-parser and handler. If no subcommand was given
    on the command line, the general help text is printed instead.
    """
    cli_env_setup()

    parser = FlexibleArgumentParser(
        description="vLLM CLI",
        epilog=VLLM_SUBCMD_PARSER_EPILOG,
    )
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version=vllm.version.__version__)
    subparsers = parser.add_subparsers(required=False, dest="subparser")

    # Maps subcommand name -> handler object, used for validation below.
    registry = {}
    for module in CMD_MODULES:
        for subcommand in module.cmd_init():
            sub_parser = subcommand.subparser_init(subparsers)
            sub_parser.set_defaults(dispatch_function=subcommand.cmd)
            registry[subcommand.name] = subcommand

    args = parser.parse_args()

    chosen = registry.get(args.subparser)
    if chosen is not None:
        chosen.validate(args)

    # dispatch_function is only present when a subcommand was selected.
    if not hasattr(args, "dispatch_function"):
        parser.print_help()
        return
    args.dispatch_function(args)
|
||||
|
||||
|
||||
# Run the CLI when this module is executed directly as a script.
if __name__ == "__main__":
    main()
|
||||
205
entrypoints/cli/openai.py
Normal file
205
entrypoints/cli/openai.py
Normal file
@@ -0,0 +1,205 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Commands that act as an interactive OpenAI API client
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from openai import OpenAI
|
||||
from openai.types.chat import ChatCompletionMessageParam
|
||||
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def _register_signal_handlers():
|
||||
|
||||
def signal_handler(sig, frame):
|
||||
sys.exit(0)
|
||||
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGTSTP, signal_handler)
|
||||
|
||||
|
||||
def _interactive_cli(args: argparse.Namespace) -> tuple[str, OpenAI]:
    """Prepare an OpenAI client and pick the model for an interactive run.

    Args:
        args: Parsed namespace providing ``url``, ``api_key`` and
            ``model_name``.

    Returns:
        A ``(model_name, client)`` pair. When no model name was given, the
        id of the first model reported by the server is used.
    """
    _register_signal_handlers()

    # An explicit --api-key wins; otherwise fall back to OPENAI_API_KEY and
    # finally to the placeholder "EMPTY".
    key = args.api_key or os.environ.get("OPENAI_API_KEY", "EMPTY")
    client = OpenAI(api_key=key, base_url=args.url)

    chosen_model = args.model_name
    if not chosen_model:
        chosen_model = client.models.list().data[0].id

    print(f"Using model: {chosen_model}")
    return chosen_model, client
|
||||
|
||||
|
||||
def chat(system_prompt: Optional[str], model_name: str,
         client: OpenAI) -> None:
    """Run an interactive chat loop against ``client`` until end of input.

    Args:
        system_prompt: Optional system message placed first in the history.
        model_name: Model identifier sent with every request.
        client: OpenAI client used to request chat completions.
    """
    history: list[ChatCompletionMessageParam] = (
        [] if system_prompt is None else [{
            "role": "system",
            "content": system_prompt
        }])

    print("Please enter a message for the chat model:")
    while True:
        try:
            user_text = input("> ")
        except EOFError:
            # End of input ends the session without an error.
            break
        history.append({"role": "user", "content": user_text})

        # The whole history is sent so the server sees the full context.
        completion = client.chat.completions.create(model=model_name,
                                                    messages=history)
        reply = completion.choices[0].message

        history.append(reply)  # type: ignore
        print(reply.content)
|
||||
|
||||
|
||||
def _add_query_options(
|
||||
parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
|
||||
parser.add_argument(
|
||||
"--url",
|
||||
type=str,
|
||||
default="http://localhost:8000/v1",
|
||||
help="url of the running OpenAI-Compatible RESTful API server")
|
||||
parser.add_argument(
|
||||
"--model-name",
|
||||
type=str,
|
||||
default=None,
|
||||
help=("The model name used in prompt completion, default to "
|
||||
"the first model in list models API call."))
|
||||
parser.add_argument(
|
||||
"--api-key",
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"API key for OpenAI services. If provided, this api key "
|
||||
"will overwrite the api key obtained through environment variables."
|
||||
))
|
||||
return parser
|
||||
|
||||
|
||||
class ChatCommand(CLISubcommand):
    """Handler for `vllm chat`.

    Sends chat completion requests to an already-running server, either
    for a single ``--quick`` message or as an interactive prompt loop.
    """

    def __init__(self) -> None:
        self.name = "chat"
        super().__init__()

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run a one-shot or interactive chat session.

        Args:
            args: Parsed namespace; reads ``system_prompt`` and ``quick``
                in addition to the connection options.
        """
        model_name, client = _interactive_cli(args)

        history: list[ChatCompletionMessageParam] = []
        if args.system_prompt is not None:
            history.append({"role": "system", "content": args.system_prompt})

        def next_reply():
            # The whole history is sent so the server sees the full context.
            completion = client.chat.completions.create(model=model_name,
                                                        messages=history)
            return completion.choices[0].message

        if args.quick:
            # One-shot mode: send the message, print the answer, and stop.
            history.append({"role": "user", "content": args.quick})
            print(next_reply().content)
            return

        print("Please enter a message for the chat model:")
        while True:
            try:
                user_text = input("> ")
            except EOFError:
                # End of input ends the session without an error.
                return
            history.append({"role": "user", "content": user_text})

            reply = next_reply()
            history.append(reply)  # type: ignore
            print(reply.content)

    def subparser_init(
            self,
            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        """Register the `chat` parser with its options and return it."""
        summary = "Generate chat completions via the running API server."
        chat_parser = subparsers.add_parser("chat",
                                            help=summary,
                                            description=summary,
                                            usage="vllm chat [options]")
        _add_query_options(chat_parser)

        system_prompt_help = (
            "The system prompt to be added to the chat template, "
            "used for models that support system prompts.")
        chat_parser.add_argument("--system-prompt",
                                 type=str,
                                 default=None,
                                 help=system_prompt_help)

        quick_help = ("Send a single prompt as MESSAGE "
                      "and print the response, then exit.")
        chat_parser.add_argument("-q",
                                 "--quick",
                                 type=str,
                                 metavar="MESSAGE",
                                 help=quick_help)
        return chat_parser
|
||||
|
||||
|
||||
class CompleteCommand(CLISubcommand):
    """The `complete` subcommand for the vLLM CLI.

    Sends text completion requests to an already-running server, either
    for a single ``--quick`` prompt or as an interactive prompt loop.
    """

    def __init__(self):
        self.name = "complete"
        super().__init__()

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run a one-shot or interactive completion session.

        Args:
            args: Parsed namespace; reads ``quick`` in addition to the
                connection options.
        """
        model_name, client = _interactive_cli(args)

        if args.quick:
            # One-shot mode: complete the prompt, print the text, and stop.
            completion = client.completions.create(model=model_name,
                                                   prompt=args.quick)
            print(completion.choices[0].text)
            return

        print("Please enter prompt to complete:")
        while True:
            try:
                input_prompt = input("> ")
            except EOFError:
                # End of input (Ctrl-D or exhausted piped stdin) ends the
                # session cleanly, matching the behavior of `vllm chat`.
                return
            completion = client.completions.create(model=model_name,
                                                   prompt=input_prompt)
            output = completion.choices[0].text
            print(output)

    def subparser_init(
            self,
            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        """Register the `complete` parser with its options and return it."""
        complete_parser = subparsers.add_parser(
            "complete",
            help=("Generate text completions based on the given prompt "
                  "via the running API server."),
            description=("Generate text completions based on the given prompt "
                         "via the running API server."),
            usage="vllm complete [options]")
        _add_query_options(complete_parser)
        complete_parser.add_argument(
            "-q",
            "--quick",
            type=str,
            metavar="PROMPT",
            help=
            "Send a single prompt and print the completion output, then exit.")
        return complete_parser
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
    """Return the client subcommands: `chat` first, then `complete`."""
    chat_cmd = ChatCommand()
    complete_cmd = CompleteCommand()
    return [chat_cmd, complete_cmd]
|
||||
62
entrypoints/cli/run_batch.py
Normal file
62
entrypoints/cli/run_batch.py
Normal file
@@ -0,0 +1,62 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
|
||||
from prometheus_client import start_http_server
|
||||
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.entrypoints.logger import logger
|
||||
from vllm.entrypoints.openai.run_batch import main as run_batch_main
|
||||
from vllm.entrypoints.openai.run_batch import make_arg_parser
|
||||
from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG,
|
||||
show_filtered_argument_or_group_from_help)
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.version import __version__ as VLLM_VERSION
|
||||
|
||||
|
||||
class RunBatchSubcommand(CLISubcommand):
    """Handler for `vllm run-batch`."""

    def __init__(self) -> None:
        self.name = "run-batch"
        super().__init__()

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Log the invocation, optionally expose metrics, and run the batch.

        Args:
            args: Parsed namespace; ``enable_metrics``, ``port`` and ``url``
                are read here, and the whole namespace is passed on to the
                batch runner.
        """
        logger.info("vLLM batch processing API version %s", VLLM_VERSION)
        logger.info("args: %s", args)

        # LLMEngine publishes its metrics through the Prometheus client, so
        # the metrics HTTP server is started here when requested.
        if not args.enable_metrics:
            logger.info("Prometheus metrics disabled")
        else:
            logger.info("Prometheus metrics enabled")
            start_http_server(port=args.port, addr=args.url)

        batch_job = run_batch_main(args)
        asyncio.run(batch_job)

    def subparser_init(
            self,
            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        """Register the `run-batch` parser with its options and return it."""
        description = ("Run batch prompts using vLLM's OpenAI-compatible API.\n"
                       "Supports local or HTTP input/output files.")
        usage = "vllm run-batch -i INPUT.jsonl -o OUTPUT.jsonl --model <model>"
        base_parser = subparsers.add_parser(
            "run-batch",
            help="Run batch prompts and write results to file.",
            description=description,
            usage=usage,
        )
        full_parser = make_arg_parser(base_parser)
        show_filtered_argument_or_group_from_help(full_parser, "run-batch")
        full_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG
        return full_parser
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
    """Return the subcommand instances contributed by this module."""
    run_batch_cmd = RunBatchSubcommand()
    return [run_batch_cmd]
|
||||
328
entrypoints/cli/serve.py
Normal file
328
entrypoints/cli/serve.py
Normal file
@@ -0,0 +1,328 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
|
||||
import uvloop
|
||||
import zmq
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm import AsyncEngineArgs
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.entrypoints.openai.api_server import (run_server, run_server_worker,
|
||||
setup_server)
|
||||
from vllm.entrypoints.openai.cli_args import (make_arg_parser,
|
||||
validate_parsed_serve_args)
|
||||
from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG,
|
||||
show_filtered_argument_or_group_from_help)
|
||||
from vllm.executor.multiproc_worker_utils import _add_prefix
|
||||
from vllm.logger import init_logger
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import FlexibleArgumentParser, get_tcp_uri, zmq_socket_ctx
|
||||
from vllm.v1.engine.coordinator import DPCoordinator
|
||||
from vllm.v1.engine.core import EngineCoreProc
|
||||
from vllm.v1.engine.core_client import CoreEngineProcManager
|
||||
from vllm.v1.executor.abstract import Executor
|
||||
from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
|
||||
from vllm.v1.utils import (APIServerProcessManager, CoreEngine,
|
||||
CoreEngineActorManager, EngineZmqAddresses,
|
||||
get_engine_client_zmq_addr,
|
||||
wait_for_completion_or_failure,
|
||||
wait_for_engine_startup)
|
||||
|
||||
# Module-level logger, created through vLLM's init_logger helper.
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class ServeSubcommand(CLISubcommand):
    """The `serve` subcommand for the vLLM CLI.

    Starts the OpenAI-compatible API server in one of three modes:
    headless engines only, several API server processes, or a single API
    server running in the current process.
    """

    def __init__(self):
        self.name = "serve"
        super().__init__()

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Pick the launch mode from ``args`` and start serving.

        Args:
            args: Parsed namespace; ``model_tag``, ``headless`` and
                ``api_server_count`` decide how the server is started.
        """
        # If model is specified in CLI (as positional arg), it takes precedence
        if hasattr(args, 'model_tag') and args.model_tag is not None:
            args.model = args.model_tag

        if args.headless or args.api_server_count < 1:
            run_headless(args)
        elif args.api_server_count > 1:
            run_multi_api_server(args)
        else:
            # Single API server (this process).
            uvloop.run(run_server(args))

    def validate(self, args: argparse.Namespace) -> None:
        """Validate the parsed `serve` arguments."""
        validate_parsed_serve_args(args)

    def subparser_init(
            self,
            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        """Register the `serve` parser with its options and return it.

        Args:
            subparsers: Top-level sub-parser collection of the CLI.

        Returns:
            The `serve` parser after the server options have been added by
            ``make_arg_parser``.
        """
        serve_parser = subparsers.add_parser(
            "serve",
            help="Start the vLLM OpenAI Compatible API server.",
            description="Start the vLLM OpenAI Compatible API server.",
            usage="vllm serve [model_tag] [options]")
        serve_parser.add_argument("model_tag",
                                  type=str,
                                  nargs='?',
                                  help="The model tag to serve "
                                  "(optional if specified in config)")
        serve_parser.add_argument(
            "--headless",
            action='store_true',
            default=False,
            help="Run in headless mode. See multi-node data parallel "
            "documentation for more details.")
        serve_parser.add_argument(
            '--data-parallel-start-rank',
            '-dpr',
            type=int,
            default=0,
            help='Starting data parallel rank for secondary nodes.')
        serve_parser.add_argument('--api-server-count',
                                  '-asc',
                                  type=int,
                                  default=1,
                                  help='How many API server processes to run.')
        # Adjacent literals are joined verbatim, so each fragment carries
        # its own trailing space to keep the rendered help text readable.
        serve_parser.add_argument(
            "--config",
            type=str,
            default='',
            required=False,
            help="Read CLI options from a config file. "
            "Must be a YAML with the following options: "
            "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference"
        )

        serve_parser = make_arg_parser(serve_parser)
        show_filtered_argument_or_group_from_help(serve_parser, "serve")
        serve_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG
        return serve_parser
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
    """Return the subcommand instances contributed by this module."""
    serve_cmd = ServeSubcommand()
    return [serve_cmd]
|
||||
|
||||
|
||||
def run_headless(args: argparse.Namespace):
    """Launch the local data parallel engines without an API server.

    Args:
        args: Parsed `serve` namespace.

    Raises:
        ValueError: If more than one API server was requested, if V1 is
            not enabled, or if ``data_parallel_size_local`` is not positive.
    """
    if args.api_server_count > 1:
        raise ValueError("api_server_count can't be set in headless mode")

    # Build the engine configuration from the parsed CLI namespace.
    cli_engine_args = AsyncEngineArgs.from_cli_args(args)
    config = cli_engine_args.create_engine_config(
        usage_context=UsageContext.OPENAI_API_SERVER)

    if not envs.VLLM_USE_V1:
        raise ValueError("Headless mode is only supported for V1")

    dp_config = config.parallel_config
    num_local_engines = dp_config.data_parallel_size_local
    # The RPC port is taken from the engine args rather than the config.
    head_address = get_tcp_uri(dp_config.data_parallel_master_ip,
                               cli_engine_args.data_parallel_rpc_port)

    if num_local_engines <= 0:
        raise ValueError(
            "data_parallel_size_local must be > 0 in headless mode")

    # Turn SIGTERM and SIGINT into SystemExit so the `finally` clause
    # below still runs and shuts the engines down.
    def _on_shutdown_signal(signum, frame):
        logger.debug("Received %d signal.", signum)
        raise SystemExit

    signal.signal(signal.SIGTERM, _on_shutdown_signal)
    signal.signal(signal.SIGINT, _on_shutdown_signal)

    logger.info(
        "Launching %d data parallel engine(s) in headless mode, "
        "with head node address %s.", num_local_engines, head_address)

    # Start the engine processes.
    manager = CoreEngineProcManager(
        target_fn=EngineCoreProc.run_engine_core,
        local_engine_count=num_local_engines,
        start_index=args.data_parallel_start_rank,
        local_start_index=0,
        vllm_config=config,
        on_head_node=False,
        handshake_address=head_address,
        executor_class=Executor.get_class(config),
        log_stats=not cli_engine_args.disable_log_stats,
    )

    try:
        manager.join_first()
    finally:
        logger.info("Shutting down.")
        manager.close()
|
||||
|
||||
|
||||
def run_multi_api_server(args: argparse.Namespace):
    """Start the engines and the requested number of API server processes.

    Builds the engine configuration, allocates one input and one output
    ZMQ address per API server, optionally starts a DP coordinator, then
    launches engines and API servers and waits until they finish or fail.

    Args:
        args: Parsed `serve` namespace; must not be in headless mode and
            must request at least one API server.

    Raises:
        ValueError: With more than one API server, if V1 is not enabled or
            VLLM_ALLOW_RUNTIME_LORA_UPDATING is set.
    """

    assert not args.headless
    num_api_servers = args.api_server_count
    assert num_api_servers > 0

    if num_api_servers > 1:
        setup_multiprocess_prometheus()

    listen_address, sock = setup_server(args)

    engine_args = AsyncEngineArgs.from_cli_args(args)
    usage_context = UsageContext.OPENAI_API_SERVER
    vllm_config = engine_args.create_engine_config(usage_context=usage_context)
    model_config = vllm_config.model_config

    if num_api_servers > 1:
        if not envs.VLLM_USE_V1:
            raise ValueError("api_server_count > 1 is only supported for V1")

        if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
            raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used "
                             "with api_server_count > 1")

        # NOTE(review): "Multi-model" in the message below probably means
        # "Multi-modal"; left unchanged here because it is runtime text.
        if model_config.is_multimodal_model and not (
                model_config.disable_mm_preprocessor_cache):
            logger.warning(
                "Multi-model preprocessor cache will be disabled for"
                " api_server_count > 1")
            model_config.disable_mm_preprocessor_cache = True

    parallel_config = vllm_config.parallel_config

    assert parallel_config.data_parallel_rank == 0

    dp_size = parallel_config.data_parallel_size
    local_engine_count = parallel_config.data_parallel_size_local
    host = parallel_config.data_parallel_master_ip
    # True when every data parallel engine is local to this node.
    local_only = local_engine_count == dp_size

    # Set up input and output addresses.
    input_addresses = [
        get_engine_client_zmq_addr(local_only, host)
        for _ in range(num_api_servers)
    ]
    output_addresses = [
        get_engine_client_zmq_addr(local_only, host)
        for _ in range(num_api_servers)
    ]

    addresses = EngineZmqAddresses(
        inputs=input_addresses,
        outputs=output_addresses,
    )

    # Set up coordinator for dp > 1.
    coordinator = None
    stats_update_address = None
    if dp_size > 1:
        coordinator = DPCoordinator(parallel_config)
        addresses.coordinator_input, addresses.coordinator_output = (
            coordinator.get_engine_socket_addresses())
        stats_update_address = coordinator.get_stats_publish_address()
        logger.info("Started DP Coordinator process (PID: %d)",
                    coordinator.proc.pid)

    # Ray backend: engines are managed by an actor manager, and no
    # handshake socket is opened on this path.
    if parallel_config.data_parallel_backend == "ray":
        logger.info("Starting ray-based data parallel backend")

        engine_actor_manager = CoreEngineActorManager(
            vllm_config=vllm_config,
            addresses=addresses,
            executor_class=Executor.get_class(vllm_config),
            log_stats=not engine_args.disable_log_stats,
        )
        # Start API servers using the manager
        api_server_manager = APIServerProcessManager(
            target_server_fn=run_api_server_worker_proc,
            listen_address=listen_address,
            sock=sock,
            args=args,
            num_servers=num_api_servers,
            input_addresses=input_addresses,
            output_addresses=output_addresses,
            stats_update_address=stats_update_address)

        wait_for_completion_or_failure(api_server_manager=api_server_manager,
                                       engine_manager=engine_actor_manager,
                                       coordinator=coordinator)
        return

    handshake_address = get_engine_client_zmq_addr(
        local_only, host, parallel_config.data_parallel_rpc_port)

    with zmq_socket_ctx(handshake_address, zmq.ROUTER,
                        bind=True) as handshake_socket:

        # Start local engines.
        if not local_engine_count:
            local_engine_manager = None
        else:
            local_engine_manager = CoreEngineProcManager(
                EngineCoreProc.run_engine_core,
                vllm_config=vllm_config,
                executor_class=Executor.get_class(vllm_config),
                log_stats=not engine_args.disable_log_stats,
                handshake_address=handshake_address,
                on_head_node=True,
                local_engine_count=local_engine_count,
                start_index=0,
                local_start_index=0)

        # Start API servers using the manager
        api_server_manager = APIServerProcessManager(
            target_server_fn=run_api_server_worker_proc,
            listen_address=listen_address,
            sock=sock,
            args=args,
            num_servers=num_api_servers,
            input_addresses=input_addresses,
            output_addresses=output_addresses,
            stats_update_address=stats_update_address)

        # Wait for engine handshakes to complete.
        # Engines with index below local_engine_count are marked as local.
        core_engines = [
            CoreEngine(index=i, local=(i < local_engine_count))
            for i in range(dp_size)
        ]
        wait_for_engine_startup(
            handshake_socket,
            addresses,
            core_engines,
            parallel_config,
            vllm_config.cache_config,
            local_engine_manager,
            coordinator.proc if coordinator else None,
        )

    # NOTE(review): presumably this runs after the handshake socket context
    # has exited; the original indentation was not preserved - confirm.
    # Wait for API servers
    wait_for_completion_or_failure(api_server_manager=api_server_manager,
                                   engine_manager=local_engine_manager,
                                   coordinator=coordinator)
|
||||
|
||||
|
||||
def run_api_server_worker_proc(listen_address,
                               sock,
                               args,
                               client_config=None,
                               **uvicorn_kwargs) -> None:
    """Entrypoint for individual API server worker processes.

    Tags this process's stdout and stderr with its name and PID, then
    runs the server worker coroutine to completion under uvloop.
    """
    import multiprocessing

    # Prefix both output streams so lines from different workers can be
    # told apart.
    worker_name = multiprocessing.current_process().name
    worker_pid = os.getpid()
    _add_prefix(sys.stdout, worker_name, worker_pid)
    _add_prefix(sys.stderr, worker_name, worker_pid)

    worker = run_server_worker(listen_address, sock, args, client_config,
                               **uvicorn_kwargs)
    uvloop.run(worker)
|
||||
25
entrypoints/cli/types.py
Normal file
25
entrypoints/cli/types.py
Normal file
@@ -0,0 +1,25 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
class CLISubcommand:
    """Interface implemented by every vLLM CLI subcommand handler."""

    # Keyword that selects this subcommand on the command line.
    name: str

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Execute the subcommand; must be overridden by subclasses."""
        raise NotImplementedError("Subclasses should implement this method")

    def validate(self, args: argparse.Namespace) -> None:
        """Check the parsed arguments; the default accepts everything."""
        return None

    def subparser_init(
            self,
            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        """Register and return this subcommand's parser; must be overridden."""
        raise NotImplementedError("Subclasses should implement this method")
|
||||
Reference in New Issue
Block a user