Upgrade to vllm 0.17.0 corex v4.1 overlay
This commit is contained in:
@@ -1,12 +1,19 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand
|
||||
from vllm.entrypoints.cli.benchmark.mm_processor import (
|
||||
BenchmarkMMProcessorSubcommand,
|
||||
)
|
||||
from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand
|
||||
from vllm.entrypoints.cli.benchmark.startup import BenchmarkStartupSubcommand
|
||||
from vllm.entrypoints.cli.benchmark.sweep import BenchmarkSweepSubcommand
|
||||
from vllm.entrypoints.cli.benchmark.throughput import BenchmarkThroughputSubcommand
|
||||
|
||||
# Keep this package init import-free.
|
||||
#
|
||||
# The `vllm` console script imports `vllm.entrypoints.cli.main`, which causes
|
||||
# Python to import this package before loading the `main` submodule.
|
||||
# Eagerly importing benchmark subcommands here makes every `vllm serve ...`
|
||||
# startup depend on optional benchmark-only modules.
|
||||
#
|
||||
# Benchmark subcommands are loaded on demand in
|
||||
# `vllm.entrypoints.cli.benchmark.main`.
|
||||
__all__: list[str] = [
|
||||
"BenchmarkLatencySubcommand",
|
||||
"BenchmarkMMProcessorSubcommand",
|
||||
"BenchmarkServingSubcommand",
|
||||
"BenchmarkStartupSubcommand",
|
||||
"BenchmarkSweepSubcommand",
|
||||
"BenchmarkThroughputSubcommand",
|
||||
]
|
||||
|
||||
@@ -2,8 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import importlib
|
||||
import logging
|
||||
import typing
|
||||
|
||||
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
|
||||
@@ -15,30 +13,6 @@ if typing.TYPE_CHECKING:
|
||||
else:
|
||||
FlexibleArgumentParser = argparse.ArgumentParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _load_benchmark_subcommands() -> None:
    """Import every benchmark subcommand module, skipping unavailable ones.

    Each dotted path listed below is passed to ``importlib.import_module``.
    When an import raises ``ModuleNotFoundError`` a warning naming the module
    and the error is logged and the remaining modules are still attempted.
    Any other exception is not handled here and propagates to the caller.
    """
    subcommand_modules = (
        "vllm.entrypoints.cli.benchmark.latency",
        "vllm.entrypoints.cli.benchmark.mm_processor",
        "vllm.entrypoints.cli.benchmark.serve",
        "vllm.entrypoints.cli.benchmark.startup",
        "vllm.entrypoints.cli.benchmark.sweep",
        "vllm.entrypoints.cli.benchmark.throughput",
    )
    skip_message = (
        "Skipping benchmark subcommand module %s because an optional "
        "dependency could not be imported: %r"
    )

    for dotted_path in subcommand_modules:
        try:
            importlib.import_module(dotted_path)
        except ModuleNotFoundError as exc:
            # Best-effort load: report the failure and move on to the next
            # module instead of aborting.
            logger.warning(skip_message, dotted_path, exc)
|
||||
|
||||
|
||||
class BenchmarkSubcommand(CLISubcommand):
|
||||
"""The `bench` subcommand for the vLLM CLI."""
|
||||
@@ -64,8 +38,6 @@ class BenchmarkSubcommand(CLISubcommand):
|
||||
)
|
||||
bench_subparsers = bench_parser.add_subparsers(required=True, dest="bench_type")
|
||||
|
||||
_load_benchmark_subcommands()
|
||||
|
||||
for cmd_cls in BenchmarkSubcommandBase.__subclasses__():
|
||||
cmd_subparser = bench_subparsers.add_parser(
|
||||
cmd_cls.name,
|
||||
|
||||
@@ -220,6 +220,12 @@ def run_multi_api_server(args: argparse.Namespace):
|
||||
num_api_servers: int = args.api_server_count
|
||||
assert num_api_servers > 0
|
||||
|
||||
if num_api_servers > 1 and getattr(args, "use_gpu_for_pooling_score", False):
|
||||
# TODO(wentao): remove this once well tested
|
||||
raise ValueError(
|
||||
"--use-gpu-for-pooling-score cannot be used with api_server_count > 1 now"
|
||||
)
|
||||
|
||||
if num_api_servers > 1:
|
||||
setup_multiprocess_prometheus()
|
||||
|
||||
@@ -246,8 +252,12 @@ def run_multi_api_server(args: argparse.Namespace):
|
||||
|
||||
api_server_manager: APIServerProcessManager | None = None
|
||||
|
||||
from vllm.v1.engine.utils import get_engine_zmq_addresses
|
||||
|
||||
addresses = get_engine_zmq_addresses(vllm_config, num_api_servers)
|
||||
|
||||
with launch_core_engines(
|
||||
vllm_config, executor_class, log_stats, num_api_servers
|
||||
vllm_config, executor_class, log_stats, addresses, num_api_servers
|
||||
) as (local_engine_manager, coordinator, addresses):
|
||||
# Construct common args for the APIServerProcessManager up-front.
|
||||
api_server_manager_kwargs = dict(
|
||||
|
||||
Reference in New Issue
Block a user