update
This commit is contained in:
13
vllm_old/entrypoints/cli/__init__.py
Normal file
13
vllm_old/entrypoints/cli/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand
|
||||
from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand
|
||||
from vllm.entrypoints.cli.benchmark.sweep import BenchmarkSweepSubcommand
|
||||
from vllm.entrypoints.cli.benchmark.throughput import BenchmarkThroughputSubcommand
|
||||
|
||||
__all__: list[str] = [
|
||||
"BenchmarkLatencySubcommand",
|
||||
"BenchmarkServingSubcommand",
|
||||
"BenchmarkSweepSubcommand",
|
||||
"BenchmarkThroughputSubcommand",
|
||||
]
|
||||
BIN
vllm_old/entrypoints/cli/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
vllm_old/entrypoints/cli/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm_old/entrypoints/cli/__pycache__/collect_env.cpython-312.pyc
Normal file
BIN
vllm_old/entrypoints/cli/__pycache__/collect_env.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm_old/entrypoints/cli/__pycache__/main.cpython-312.pyc
Normal file
BIN
vllm_old/entrypoints/cli/__pycache__/main.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm_old/entrypoints/cli/__pycache__/openai.cpython-312.pyc
Normal file
BIN
vllm_old/entrypoints/cli/__pycache__/openai.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm_old/entrypoints/cli/__pycache__/run_batch.cpython-312.pyc
Normal file
BIN
vllm_old/entrypoints/cli/__pycache__/run_batch.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm_old/entrypoints/cli/__pycache__/serve.cpython-312.pyc
Normal file
BIN
vllm_old/entrypoints/cli/__pycache__/serve.cpython-312.pyc
Normal file
Binary file not shown.
BIN
vllm_old/entrypoints/cli/__pycache__/types.cpython-312.pyc
Normal file
BIN
vllm_old/entrypoints/cli/__pycache__/types.cpython-312.pyc
Normal file
Binary file not shown.
0
vllm_old/entrypoints/cli/benchmark/__init__.py
Normal file
0
vllm_old/entrypoints/cli/benchmark/__init__.py
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
25
vllm_old/entrypoints/cli/benchmark/base.py
Normal file
25
vllm_old/entrypoints/cli/benchmark/base.py
Normal file
@@ -0,0 +1,25 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
|
||||
|
||||
class BenchmarkSubcommandBase(CLISubcommand):
    """Common parent of the individual `vllm bench <bench_type>` subcommands.

    Both hooks defined here only raise ``NotImplementedError``; a concrete
    benchmark is expected to override them.
    """

    # Short description of the benchmark; declared here, assigned by subclasses.
    help: str

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Execute the benchmark.

        Args:
            args: The parsed command-line arguments for the benchmark.

        Raises:
            NotImplementedError: Always, unless a subclass overrides this hook.
        """
        raise NotImplementedError()

    @classmethod
    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
        """Register the benchmark's command-line options on ``parser``.

        Args:
            parser: The argument parser that should receive the options.

        Raises:
            NotImplementedError: Always, unless a subclass overrides this hook.
        """
        raise NotImplementedError()
|
||||
21
vllm_old/entrypoints/cli/benchmark/latency.py
Normal file
21
vllm_old/entrypoints/cli/benchmark/latency.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.benchmarks.latency import add_cli_args, main
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
|
||||
|
||||
class BenchmarkLatencySubcommand(BenchmarkSubcommandBase):
    """`vllm bench latency`: thin adapter over `vllm.benchmarks.latency`."""

    name = "latency"
    help = "Benchmark the latency of a single batch of requests."

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run the latency benchmark by handing ``args`` to its ``main``."""
        main(args)

    @classmethod
    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
        """Let the latency benchmark module register its options on ``parser``."""
        add_cli_args(parser)
|
||||
56
vllm_old/entrypoints/cli/benchmark/main.py
Normal file
56
vllm_old/entrypoints/cli/benchmark/main.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import typing
|
||||
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
else:
|
||||
FlexibleArgumentParser = argparse.ArgumentParser
|
||||
|
||||
|
||||
class BenchmarkSubcommand(CLISubcommand):
    """Top-level `bench` subcommand; fans out to one parser per benchmark type."""

    name = "bench"
    help = "vLLM bench subcommand."

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Call whatever callable is stored in ``args.dispatch_function``."""
        handler = args.dispatch_function
        handler(args)

    def validate(self, args: argparse.Namespace) -> None:
        """Accept any arguments; `bench` performs no validation of its own."""
        return None

    def subparser_init(
        self, subparsers: argparse._SubParsersAction
    ) -> FlexibleArgumentParser:
        """Create the `bench` parser plus one nested parser per benchmark.

        Every direct subclass of ``BenchmarkSubcommandBase`` gets its own
        sub-parser whose ``dispatch_function`` default is that subclass's
        ``cmd``.

        Args:
            subparsers: The sub-parser action of the parent parser.

        Returns:
            The parser created for `bench`.
        """
        top_level = subparsers.add_parser(
            self.name,
            description=self.help,
            usage=f"vllm {self.name} <bench_type> [options]",
        )
        nested = top_level.add_subparsers(required=True, dest="bench_type")

        # Only classes that have been imported by now show up here.
        for bench_cls in BenchmarkSubcommandBase.__subclasses__():
            qualified_name = f"{self.name} {bench_cls.name}"
            child = nested.add_parser(
                bench_cls.name,
                help=bench_cls.help,
                description=bench_cls.help,
                usage=f"vllm {qualified_name} [options]",
            )
            child.set_defaults(dispatch_function=bench_cls.cmd)
            bench_cls.add_cli_args(child)
            child.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(subcmd=qualified_name)

        return top_level
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
    """Return the subcommand instances this module contributes (just `bench`)."""
    subcommands: list[CLISubcommand] = [BenchmarkSubcommand()]
    return subcommands
|
||||
21
vllm_old/entrypoints/cli/benchmark/serve.py
Normal file
21
vllm_old/entrypoints/cli/benchmark/serve.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.benchmarks.serve import add_cli_args, main
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
|
||||
|
||||
class BenchmarkServingSubcommand(BenchmarkSubcommandBase):
    """`vllm bench serve`: thin adapter over `vllm.benchmarks.serve`."""

    name = "serve"
    help = "Benchmark the online serving throughput."

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run the serving benchmark by handing ``args`` to its ``main``."""
        main(args)

    @classmethod
    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
        """Let the serving benchmark module register its options on ``parser``."""
        add_cli_args(parser)
|
||||
21
vllm_old/entrypoints/cli/benchmark/sweep.py
Normal file
21
vllm_old/entrypoints/cli/benchmark/sweep.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.benchmarks.sweep.cli import add_cli_args, main
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
|
||||
|
||||
class BenchmarkSweepSubcommand(BenchmarkSubcommandBase):
    """`vllm bench sweep`: thin adapter over `vllm.benchmarks.sweep.cli`."""

    name = "sweep"
    help = "Benchmark for a parameter sweep."

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run the parameter sweep by handing ``args`` to its ``main``."""
        main(args)

    @classmethod
    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
        """Let the sweep CLI module register its options on ``parser``."""
        add_cli_args(parser)
|
||||
21
vllm_old/entrypoints/cli/benchmark/throughput.py
Normal file
21
vllm_old/entrypoints/cli/benchmark/throughput.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
|
||||
from vllm.benchmarks.throughput import add_cli_args, main
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
|
||||
|
||||
class BenchmarkThroughputSubcommand(BenchmarkSubcommandBase):
    """`vllm bench throughput`: thin adapter over `vllm.benchmarks.throughput`."""

    name = "throughput"
    help = "Benchmark offline inference throughput."

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run the throughput benchmark by handing ``args`` to its ``main``."""
        main(args)

    @classmethod
    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
        """Let the throughput benchmark module register its options on ``parser``."""
        add_cli_args(parser)
|
||||
38
vllm_old/entrypoints/cli/collect_env.py
Normal file
38
vllm_old/entrypoints/cli/collect_env.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import typing
|
||||
|
||||
from vllm.collect_env import main as collect_env_main
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
else:
|
||||
FlexibleArgumentParser = argparse.ArgumentParser
|
||||
|
||||
|
||||
class CollectEnvSubcommand(CLISubcommand):
    """`vllm collect-env`: runs the environment collection entry point."""

    name = "collect-env"

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run `vllm.collect_env.main`; ``args`` is accepted but not used."""
        collect_env_main()

    def subparser_init(
        self, subparsers: argparse._SubParsersAction
    ) -> FlexibleArgumentParser:
        """Register the option-less `collect-env` parser.

        Args:
            subparsers: The sub-parser action of the parent parser.

        Returns:
            The newly created parser.
        """
        # The same sentence is used for the listing help and the description.
        summary = "Start collecting environment information."
        collect_env_parser = subparsers.add_parser(
            "collect-env",
            help=summary,
            description=summary,
            usage="vllm collect-env",
        )
        return collect_env_parser
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
    """Return the subcommand instances this module contributes (`collect-env`)."""
    subcommands: list[CLISubcommand] = [CollectEnvSubcommand()]
    return subcommands
|
||||
79
vllm_old/entrypoints/cli/main.py
Normal file
79
vllm_old/entrypoints/cli/main.py
Normal file
@@ -0,0 +1,79 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""The CLI entrypoints of vLLM
|
||||
|
||||
Note that all future modules must be lazily loaded within main
|
||||
to avoid certain eager import breakage."""
|
||||
|
||||
import importlib.metadata
|
||||
import sys
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def main():
    """Build the `vllm` command-line parser, parse ``sys.argv`` and dispatch.

    Each command module contributes subcommands through its ``cmd_init()``.
    The chosen subcommand is validated and then its ``dispatch_function`` is
    called. When no subcommand was selected, the top-level help is printed.
    """
    # Deliberately imported inside the function so nothing is loaded eagerly
    # when this module itself is imported.
    import vllm.entrypoints.cli.benchmark.main
    import vllm.entrypoints.cli.collect_env
    import vllm.entrypoints.cli.openai
    import vllm.entrypoints.cli.run_batch
    import vllm.entrypoints.cli.serve
    from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG, cli_env_setup
    from vllm.utils.argparse_utils import FlexibleArgumentParser

    # Registration order of the subcommands.
    command_modules = (
        vllm.entrypoints.cli.openai,
        vllm.entrypoints.cli.serve,
        vllm.entrypoints.cli.benchmark.main,
        vllm.entrypoints.cli.collect_env,
        vllm.entrypoints.cli.run_batch,
    )

    cli_env_setup()

    # For 'vllm bench *': use CPU instead of UnspecifiedPlatform by default
    if sys.argv[1:2] == ["bench"]:
        logger.debug(
            "Bench command detected, must ensure current platform is not "
            "UnspecifiedPlatform to avoid device type inference error"
        )
        from vllm import platforms

        if platforms.current_platform.is_unspecified():
            from vllm.platforms.cpu import CpuPlatform

            platforms.current_platform = CpuPlatform()
            logger.info(
                "Unspecified platform detected, switching to CPU Platform instead."
            )

    parser = FlexibleArgumentParser(
        description="vLLM CLI",
        epilog=VLLM_SUBCMD_PARSER_EPILOG.format(subcmd="[subcommand]"),
    )
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version=importlib.metadata.version("vllm"),
    )
    subparsers = parser.add_subparsers(required=False, dest="subparser")

    # Subcommand name -> instance, used to look up the validator after parsing.
    registry = {}
    for module in command_modules:
        for subcommand in module.cmd_init():
            subcommand_parser = subcommand.subparser_init(subparsers)
            subcommand_parser.set_defaults(dispatch_function=subcommand.cmd)
            registry[subcommand.name] = subcommand

    args = parser.parse_args()
    if args.subparser in registry:
        registry[args.subparser].validate(args)

    if not hasattr(args, "dispatch_function"):
        # No subcommand was given on the command line.
        parser.print_help()
        return

    args.dispatch_function(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
256
vllm_old/entrypoints/cli/openai.py
Normal file
256
vllm_old/entrypoints/cli/openai.py
Normal file
@@ -0,0 +1,256 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from openai import OpenAI
|
||||
from openai.types.chat import ChatCompletionMessageParam
|
||||
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
else:
|
||||
FlexibleArgumentParser = argparse.ArgumentParser
|
||||
|
||||
|
||||
def _register_signal_handlers():
    """Make SIGINT (and SIGTSTP, where it exists) end the process with status 0.

    ``signal.SIGTSTP`` is only defined on POSIX platforms. Referencing it
    unconditionally raises ``AttributeError`` elsewhere (e.g. on Windows), so
    it is registered only when the ``signal`` module exposes it.
    """

    def signal_handler(sig, frame):
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    sigtstp = getattr(signal, "SIGTSTP", None)
    if sigtstp is not None:
        signal.signal(sigtstp, signal_handler)
|
||||
|
||||
|
||||
def _interactive_cli(args: argparse.Namespace) -> tuple[str, OpenAI]:
    """Prepare an interactive session: signal handlers, client and model name.

    The API key is ``args.api_key`` when truthy, otherwise the
    ``OPENAI_API_KEY`` environment variable, otherwise the literal ``"EMPTY"``.
    The model is ``args.model_name`` when truthy, otherwise the first entry
    returned by the server's model listing.

    Args:
        args: Parsed arguments providing ``url``, ``api_key`` and ``model_name``.

    Returns:
        A ``(model_name, client)`` pair.
    """
    _register_signal_handlers()

    client = OpenAI(
        api_key=args.api_key or os.environ.get("OPENAI_API_KEY", "EMPTY"),
        base_url=args.url,
    )

    # Only query the server for its models when no name was supplied.
    chosen_model = args.model_name or client.models.list().data[0].id

    print(f"Using model: {chosen_model}")
    return chosen_model, client
|
||||
|
||||
|
||||
def _print_chat_stream(stream) -> str:
    """Echo a streamed chat completion to stdout and return the full text.

    Args:
        stream: An iterable of chunks. Each chunk has a ``choices`` sequence
            whose first element exposes ``delta.content``.

    Returns:
        The concatenation of every non-empty ``delta.content`` seen.
    """
    pieces: list[str] = []
    for chunk in stream:
        # Some chunks carry no choices at all (e.g. a usage-only chunk);
        # indexing them would raise IndexError, so skip them.
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        if content:
            pieces.append(content)
            print(content, end="", flush=True)
    # Terminate the line that the streamed fragments were written on.
    print()
    return "".join(pieces)
|
||||
|
||||
|
||||
def _print_completion_stream(stream) -> str:
    """Echo a streamed text completion to stdout and return the full text.

    Args:
        stream: An iterable of chunks. Each chunk has a ``choices`` sequence
            whose first element exposes ``text``.

    Returns:
        The concatenation of every ``text`` value that was not ``None``.
    """
    pieces: list[str] = []
    for chunk in stream:
        # Some chunks carry no choices at all (e.g. a usage-only chunk);
        # indexing them would raise IndexError, so skip them.
        if not chunk.choices:
            continue
        text = chunk.choices[0].text
        if text is not None:
            pieces.append(text)
            print(text, end="", flush=True)
    # Terminate the line that the streamed fragments were written on.
    print()
    return "".join(pieces)
|
||||
|
||||
|
||||
def chat(system_prompt: str | None, model_name: str, client: OpenAI) -> None:
    """Run an interactive chat loop against ``client`` until end-of-input.

    Every user line and every streamed reply is appended to the running
    conversation, so each request carries the full history.

    Args:
        system_prompt: Optional system message placed first in the conversation.
        model_name: Model identifier sent with every request.
        client: OpenAI client used to issue the chat completion requests.
    """
    history: list[ChatCompletionMessageParam] = (
        [] if system_prompt is None else [{"role": "system", "content": system_prompt}]
    )

    print("Please enter a message for the chat model:")
    while True:
        try:
            user_text = input("> ")
        except EOFError:
            # End of input (e.g. Ctrl-D) finishes the session.
            return
        history.append({"role": "user", "content": user_text})

        reply = _print_chat_stream(
            client.chat.completions.create(
                model=model_name, messages=history, stream=True
            )
        )
        history.append({"role": "assistant", "content": reply})
|
||||
|
||||
|
||||
def _add_query_options(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    """Add the `--url`, `--model-name` and `--api-key` options to ``parser``.

    Args:
        parser: The parser to extend in place.

    Returns:
        The same ``parser`` object, to allow chaining.
    """
    # (flag, default, help text); all three options take a string value.
    string_options = (
        (
            "--url",
            "http://localhost:8000/v1",
            "url of the running OpenAI-Compatible RESTful API server",
        ),
        (
            "--model-name",
            None,
            "The model name used in prompt completion, default to "
            "the first model in list models API call.",
        ),
        (
            "--api-key",
            None,
            "API key for OpenAI services. If provided, this api key "
            "will overwrite the api key obtained through environment variables.",
        ),
    )
    for flag, default, help_text in string_options:
        parser.add_argument(flag, type=str, default=default, help=help_text)
    return parser
|
||||
|
||||
|
||||
class ChatCommand(CLISubcommand):
    """The `chat` subcommand for the vLLM CLI."""

    name = "chat"

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Send one quick message, or start an interactive chat session.

        With ``--quick MESSAGE`` a single request is sent (preceded by the
        system prompt, if any), the reply is streamed to stdout and the
        command returns. Otherwise the interactive loop of ``chat`` is used.

        Args:
            args: Parsed arguments; reads ``system_prompt`` and ``quick`` plus
                the connection options consumed by ``_interactive_cli``.
        """
        model_name, client = _interactive_cli(args)
        system_prompt = args.system_prompt

        if args.quick:
            conversation: list[ChatCompletionMessageParam] = []
            if system_prompt is not None:
                conversation.append({"role": "system", "content": system_prompt})
            conversation.append({"role": "user", "content": args.quick})

            stream = client.chat.completions.create(
                model=model_name, messages=conversation, stream=True
            )
            _print_chat_stream(stream)
            return

        # The interactive loop lives in `chat`; reuse it instead of keeping a
        # second copy of the same code here.
        chat(system_prompt, model_name, client)

    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        """Add CLI arguments for the chat command.

        Args:
            parser: The parser to extend in place.

        Returns:
            The same ``parser`` object.
        """
        _add_query_options(parser)
        parser.add_argument(
            "--system-prompt",
            type=str,
            default=None,
            help=(
                "The system prompt to be added to the chat template, "
                "used for models that support system prompts."
            ),
        )
        parser.add_argument(
            "-q",
            "--quick",
            type=str,
            metavar="MESSAGE",
            help=("Send a single prompt as MESSAGE and print the response, then exit."),
        )
        return parser

    def subparser_init(
        self, subparsers: argparse._SubParsersAction
    ) -> FlexibleArgumentParser:
        """Create the `chat` parser and populate it with its options.

        Args:
            subparsers: The sub-parser action of the parent parser.

        Returns:
            The fully configured `chat` parser.
        """
        parser = subparsers.add_parser(
            "chat",
            help="Generate chat completions via the running API server.",
            description="Generate chat completions via the running API server.",
            usage="vllm chat [options]",
        )
        return ChatCommand.add_cli_args(parser)
|
||||
|
||||
|
||||
class CompleteCommand(CLISubcommand):
    """`vllm complete`: text completions against a running API server."""

    name = "complete"

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Complete one quick prompt, or prompt interactively until end-of-input.

        Args:
            args: Parsed arguments; reads ``max_tokens`` and ``quick`` plus the
                connection options consumed by ``_interactive_cli``.
        """
        model_name, client = _interactive_cli(args)

        request_kwargs = {"model": model_name, "stream": True}
        # A falsy max_tokens (unset or 0) is not forwarded to the server.
        if args.max_tokens:
            request_kwargs["max_tokens"] = args.max_tokens

        def complete(prompt: str) -> None:
            """Request a streamed completion for ``prompt`` and echo it."""
            _print_completion_stream(
                client.completions.create(prompt=prompt, **request_kwargs)
            )

        if args.quick:
            complete(args.quick)
            return

        print("Please enter prompt to complete:")
        while True:
            try:
                typed_prompt = input("> ")
            except EOFError:
                # End of input (e.g. Ctrl-D) finishes the session.
                return
            complete(typed_prompt)

    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        """Register the options of the complete command on ``parser``.

        Args:
            parser: The parser to extend in place.

        Returns:
            The same ``parser`` object.
        """
        _add_query_options(parser)
        parser.add_argument(
            "--max-tokens",
            type=int,
            help="Maximum number of tokens to generate per output sequence.",
        )
        parser.add_argument(
            "-q",
            "--quick",
            type=str,
            metavar="PROMPT",
            help="Send a single prompt and print the completion output, then exit.",
        )
        return parser

    def subparser_init(
        self, subparsers: argparse._SubParsersAction
    ) -> FlexibleArgumentParser:
        """Create the `complete` parser and populate it with its options.

        Args:
            subparsers: The sub-parser action of the parent parser.

        Returns:
            The fully configured `complete` parser.
        """
        # The same sentence is used for the listing help and the description.
        summary = (
            "Generate text completions based on the given prompt "
            "via the running API server."
        )
        complete_parser = subparsers.add_parser(
            "complete",
            help=summary,
            description=summary,
            usage="vllm complete [options]",
        )
        return CompleteCommand.add_cli_args(complete_parser)
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
    """Return the subcommand instances of this module (`chat`, `complete`)."""
    subcommands: list[CLISubcommand] = [ChatCommand(), CompleteCommand()]
    return subcommands
|
||||
68
vllm_old/entrypoints/cli/run_batch.py
Normal file
68
vllm_old/entrypoints/cli/run_batch.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import importlib.metadata
|
||||
import typing
|
||||
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
from vllm.logger import init_logger
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
else:
|
||||
FlexibleArgumentParser = argparse.ArgumentParser
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class RunBatchSubcommand(CLISubcommand):
    """`vllm run-batch`: process a batch of prompts via the OpenAI-style API."""

    name = "run-batch"

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Log the run, optionally start the metrics server, then run the batch.

        Args:
            args: Parsed arguments; reads ``enable_metrics`` and, when it is
                set, ``port`` and ``url`` for the metrics HTTP server.
        """
        from vllm.entrypoints.openai.run_batch import main as run_batch_main

        vllm_version = importlib.metadata.version("vllm")
        logger.info("vLLM batch processing API version %s", vllm_version)
        logger.info("args: %s", args)

        if not args.enable_metrics:
            logger.info("Prometheus metrics disabled")
        else:
            # Start the Prometheus metrics server.
            # LLMEngine uses the Prometheus client
            # to publish metrics at the /metrics endpoint.
            from prometheus_client import start_http_server

            logger.info("Prometheus metrics enabled")
            start_http_server(port=args.port, addr=args.url)

        asyncio.run(run_batch_main(args))

    def subparser_init(
        self, subparsers: argparse._SubParsersAction
    ) -> FlexibleArgumentParser:
        """Create the `run-batch` parser and populate it via ``make_arg_parser``.

        Args:
            subparsers: The sub-parser action of the parent parser.

        Returns:
            The parser returned by ``make_arg_parser``, with its epilog set.
        """
        from vllm.entrypoints.openai.run_batch import make_arg_parser

        bare_parser = subparsers.add_parser(
            self.name,
            help="Run batch prompts and write results to file.",
            description=(
                "Run batch prompts using vLLM's OpenAI-compatible API.\n"
                "Supports local or HTTP input/output files."
            ),
            usage="vllm run-batch -i INPUT.jsonl -o OUTPUT.jsonl --model <model>",
        )
        configured_parser = make_arg_parser(bare_parser)
        configured_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(subcmd=self.name)
        return configured_parser
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
    """Return the subcommand instances this module contributes (`run-batch`)."""
    subcommands: list[CLISubcommand] = [RunBatchSubcommand()]
    return subcommands
|
||||
249
vllm_old/entrypoints/cli/serve.py
Normal file
249
vllm_old/entrypoints/cli/serve.py
Normal file
@@ -0,0 +1,249 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import signal
|
||||
|
||||
import uvloop
|
||||
|
||||
import vllm
|
||||
import vllm.envs as envs
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.entrypoints.openai.api_server import (
|
||||
run_server,
|
||||
run_server_worker,
|
||||
setup_server,
|
||||
)
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
|
||||
from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
|
||||
from vllm.logger import init_logger
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.network_utils import get_tcp_uri
|
||||
from vllm.utils.system_utils import decorate_logs, set_process_title
|
||||
from vllm.v1.engine.core import EngineCoreProc
|
||||
from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines
|
||||
from vllm.v1.executor import Executor
|
||||
from vllm.v1.executor.multiproc_executor import MultiprocExecutor
|
||||
from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
|
||||
from vllm.v1.utils import APIServerProcessManager, wait_for_completion_or_failure
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
DESCRIPTION = """Launch a local OpenAI-compatible API server to serve LLM
|
||||
completions via HTTP. Defaults to Qwen/Qwen3-0.6B if no model is specified.
|
||||
|
||||
Search by using: `--help=<ConfigGroup>` to explore options by section (e.g.,
|
||||
--help=ModelConfig, --help=Frontend)
|
||||
Use `--help=all` to show all available flags at once.
|
||||
"""
|
||||
|
||||
|
||||
class ServeSubcommand(CLISubcommand):
    """`vllm serve`: launch the OpenAI-compatible API server."""

    name = "serve"

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Pick the launch mode from ``args`` and start serving.

        Headless mode is used when ``args.headless`` is set or the API server
        count is below one; more than one API server uses the multi-server
        launcher; otherwise a single server runs in this process.

        Args:
            args: Parsed `serve` arguments.
        """
        # If model is specified in CLI (as positional arg), it takes precedence
        positional_model = getattr(args, "model_tag", None)
        if positional_model is not None:
            args.model = positional_model

        if args.headless or args.api_server_count < 1:
            run_headless(args)
        elif args.api_server_count > 1:
            run_multi_api_server(args)
        else:
            # Single API server (this process).
            uvloop.run(run_server(args))

    def validate(self, args: argparse.Namespace) -> None:
        """Check the parsed arguments with ``validate_parsed_serve_args``."""
        validate_parsed_serve_args(args)

    def subparser_init(
        self, subparsers: argparse._SubParsersAction
    ) -> FlexibleArgumentParser:
        """Create the `serve` parser and populate it via ``make_arg_parser``.

        Args:
            subparsers: The sub-parser action of the parent parser.

        Returns:
            The parser returned by ``make_arg_parser``, with its epilog set.
        """
        bare_parser = subparsers.add_parser(
            self.name, description=DESCRIPTION, usage="vllm serve [model_tag] [options]"
        )
        configured_parser = make_arg_parser(bare_parser)
        configured_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(subcmd=self.name)
        return configured_parser
|
||||
|
||||
|
||||
def cmd_init() -> list[CLISubcommand]:
    """Return the subcommand instances this module contributes (`serve`)."""
    subcommands: list[CLISubcommand] = [ServeSubcommand()]
    return subcommands
|
||||
|
||||
|
||||
def run_headless(args: argparse.Namespace):
    """Run engine processes only, without an API server in this process.

    When ``node_rank_within_dp`` is positive, a ``MultiprocExecutor`` is
    created and its worker monitor is run inline. Otherwise the local
    data-parallel engines are launched and this function blocks in
    ``join_first()``, closing the engine manager afterwards.

    Args:
        args: Parsed `serve` arguments.

    Raises:
        ValueError: If ``api_server_count`` is above one, if
            ``data_parallel_hybrid_lb`` is set, or if the local data-parallel
            size is not positive.
    """
    if args.api_server_count > 1:
        raise ValueError("api_server_count can't be set in headless mode")

    # Create the EngineConfig.
    engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
    vllm_config = engine_args.create_engine_config(
        usage_context=UsageContext.OPENAI_API_SERVER, headless=True
    )

    if engine_args.data_parallel_hybrid_lb:
        raise ValueError("data_parallel_hybrid_lb is not applicable in headless mode")

    parallel_config = vllm_config.parallel_config
    num_local_engines = parallel_config.data_parallel_size_local
    if num_local_engines <= 0:
        raise ValueError("data_parallel_size_local must be > 0 in headless mode")

    shutdown_requested = False

    # Catch SIGTERM and SIGINT to allow graceful shutdown.
    def signal_handler(signum, frame):
        nonlocal shutdown_requested
        logger.debug("Received %d signal.", signum)
        if shutdown_requested:
            # Only the first signal triggers the exit.
            return
        shutdown_requested = True
        raise SystemExit

    for handled_signal in (signal.SIGTERM, signal.SIGINT):
        signal.signal(handled_signal, signal_handler)

    if parallel_config.node_rank_within_dp > 0:
        from vllm.version import __version__ as VLLM_VERSION

        # Run headless workers (for multi-node PP/TP).
        head_node_address = (
            f"{parallel_config.master_addr}:{parallel_config.master_port}"
        )
        logger.info(
            "Launching vLLM (v%s) headless multiproc executor, "
            "with head node address %s for torch.distributed process group.",
            VLLM_VERSION,
            head_node_address,
        )

        executor = MultiprocExecutor(vllm_config, monitor_workers=False)
        executor.start_worker_monitor(inline=True)
        return

    handshake_address = get_tcp_uri(
        parallel_config.data_parallel_master_ip,
        parallel_config.data_parallel_rpc_port,
    )

    logger.info(
        "Launching %d data parallel engine(s) in headless mode, "
        "with head node address %s.",
        num_local_engines,
        handshake_address,
    )

    # Create the engines.
    engine_manager = CoreEngineProcManager(
        target_fn=EngineCoreProc.run_engine_core,
        local_engine_count=num_local_engines,
        start_index=parallel_config.data_parallel_rank,
        local_start_index=0,
        vllm_config=vllm_config,
        local_client=False,
        handshake_address=handshake_address,
        executor_class=Executor.get_class(vllm_config),
        log_stats=not engine_args.disable_log_stats,
    )

    try:
        engine_manager.join_first()
    finally:
        # Runs on normal return and on SystemExit raised by the signal handler.
        logger.info("Shutting down.")
        engine_manager.close()
|
||||
|
||||
|
||||
def run_multi_api_server(args: argparse.Namespace):
    """Launch the core engines plus ``api_server_count`` API server processes.

    The API servers are started inside the ``launch_core_engines`` context,
    except for data-parallel ranks above zero in external/hybrid load-balancing
    mode, where they are started after the context exits. The function then
    blocks in ``wait_for_completion_or_failure``.

    Args:
        args: Parsed `serve` arguments; must not be headless.

    Raises:
        ValueError: If more than one API server is requested while
            ``VLLM_ALLOW_RUNTIME_LORA_UPDATING`` is enabled.
    """
    assert not args.headless
    num_api_servers: int = args.api_server_count
    assert num_api_servers > 0

    has_multiple_servers = num_api_servers > 1
    if has_multiple_servers:
        setup_multiprocess_prometheus()

    listen_address, sock = setup_server(args)

    engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
    engine_args._api_process_count = num_api_servers
    engine_args._api_process_rank = -1

    vllm_config = engine_args.create_engine_config(
        usage_context=UsageContext.OPENAI_API_SERVER
    )

    if has_multiple_servers and envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
        raise ValueError(
            "VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used with api_server_count > 1"
        )

    executor_class = Executor.get_class(vllm_config)
    log_stats = not engine_args.disable_log_stats

    parallel_config = vllm_config.parallel_config
    dp_rank = parallel_config.data_parallel_rank
    external_dp_lb = parallel_config.data_parallel_external_lb
    hybrid_dp_lb = parallel_config.data_parallel_hybrid_lb
    assert external_dp_lb or hybrid_dp_lb or dp_rank == 0

    # For dp ranks > 0 in external/hybrid DP LB modes, we must delay the
    # start of the API servers until the local engine is started
    # (after the launcher context manager exits),
    # since we get the front-end stats update address from the coordinator
    # via the handshake with the local engine.
    delay_api_servers = dp_rank != 0 and (external_dp_lb or hybrid_dp_lb)

    api_server_manager: APIServerProcessManager | None = None

    with launch_core_engines(
        vllm_config, executor_class, log_stats, num_api_servers
    ) as (local_engine_manager, coordinator, addresses):
        # Construct common args for the APIServerProcessManager up-front.
        manager_kwargs = {
            "target_server_fn": run_api_server_worker_proc,
            "listen_address": listen_address,
            "sock": sock,
            "args": args,
            "num_servers": num_api_servers,
            "input_addresses": addresses.inputs,
            "output_addresses": addresses.outputs,
            "stats_update_address": (
                coordinator.get_stats_publish_address() if coordinator else None
            ),
        }

        if not delay_api_servers:
            # Start API servers using the manager.
            api_server_manager = APIServerProcessManager(**manager_kwargs)

    # Start API servers now if they weren't already started.
    if api_server_manager is None:
        manager_kwargs["stats_update_address"] = (
            addresses.frontend_stats_publish_address
        )
        api_server_manager = APIServerProcessManager(**manager_kwargs)

    # Wait for API servers
    wait_for_completion_or_failure(
        api_server_manager=api_server_manager,
        engine_manager=local_engine_manager,
        coordinator=coordinator,
    )
|
||||
|
||||
|
||||
def run_api_server_worker_proc(
    listen_address, sock, args, client_config=None, **uvicorn_kwargs
) -> None:
    """Entry point executed inside each API server worker process.

    Args:
        listen_address: Forwarded unchanged to ``run_server_worker``.
        sock: Forwarded unchanged to ``run_server_worker``.
        args: Forwarded unchanged to ``run_server_worker``.
        client_config: Optional mapping; its ``"client_index"`` entry
            (default 0) is used as the server index in the process title.
            A falsy value is replaced by an empty dict.
        **uvicorn_kwargs: Extra keyword arguments forwarded to
            ``run_server_worker``.
    """
    if not client_config:
        client_config = {}
    worker_index = client_config.get("client_index", 0)

    # Set process title and add process-specific prefix to stdout and stderr.
    set_process_title("APIServer", str(worker_index))
    decorate_logs()

    worker_coro = run_server_worker(
        listen_address, sock, args, client_config, **uvicorn_kwargs
    )
    uvloop.run(worker_coro)
|
||||
29
vllm_old/entrypoints/cli/types.py
Normal file
29
vllm_old/entrypoints/cli/types.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import typing
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
else:
|
||||
FlexibleArgumentParser = argparse.ArgumentParser
|
||||
|
||||
|
||||
class CLISubcommand:
    """Interface implemented by every `vllm` command-line subcommand.

    ``cmd`` and ``subparser_init`` must be overridden; ``validate`` is an
    optional hook that accepts everything by default.
    """

    # Name of the subcommand; declared here, assigned by subclasses.
    name: str

    def subparser_init(
        self, subparsers: argparse._SubParsersAction
    ) -> FlexibleArgumentParser:
        """Create and return this subcommand's argument parser.

        Args:
            subparsers: The sub-parser action of the parent parser.

        Raises:
            NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError("Subclasses should implement this method")

    def validate(self, args: argparse.Namespace) -> None:
        """Check the parsed arguments; the default accepts everything."""
        return None

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Execute the subcommand with the parsed arguments.

        Raises:
            NotImplementedError: Always, unless overridden.
        """
        raise NotImplementedError("Subclasses should implement this method")
|
||||
Reference in New Issue
Block a user