Upgrade to vllm 0.17.0 corex v4.1 overlay

This commit is contained in:
2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

View File

@@ -1,12 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand
from vllm.entrypoints.cli.benchmark.mm_processor import (
BenchmarkMMProcessorSubcommand,
)
from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand
from vllm.entrypoints.cli.benchmark.startup import BenchmarkStartupSubcommand
from vllm.entrypoints.cli.benchmark.sweep import BenchmarkSweepSubcommand
from vllm.entrypoints.cli.benchmark.throughput import BenchmarkThroughputSubcommand
# Keep this package init import-free.
#
# The `vllm` console script imports `vllm.entrypoints.cli.main`, which causes
# Python to import this package before loading the `main` submodule.
# Eagerly importing benchmark subcommands here makes every `vllm serve ...`
# startup depend on optional benchmark-only modules.
#
# Benchmark subcommands are loaded on demand in
# `vllm.entrypoints.cli.benchmark.main`.
# Map each public name to the submodule that defines it.  The classes are
# intentionally NOT imported eagerly: this package __init__ must stay
# import-free so that `vllm serve ...` startup does not pull in optional
# benchmark-only dependencies.
_NAME_TO_MODULE: dict[str, str] = {
    "BenchmarkLatencySubcommand": "vllm.entrypoints.cli.benchmark.latency",
    "BenchmarkMMProcessorSubcommand": "vllm.entrypoints.cli.benchmark.mm_processor",
    "BenchmarkServingSubcommand": "vllm.entrypoints.cli.benchmark.serve",
    "BenchmarkStartupSubcommand": "vllm.entrypoints.cli.benchmark.startup",
    "BenchmarkSweepSubcommand": "vllm.entrypoints.cli.benchmark.sweep",
    "BenchmarkThroughputSubcommand": "vllm.entrypoints.cli.benchmark.throughput",
}

# Same six names, same order as before (dicts preserve insertion order).
__all__: list[str] = list(_NAME_TO_MODULE)


def __getattr__(name: str):
    """Lazily resolve the names advertised in ``__all__`` (PEP 562).

    Without this hook ``__all__`` lists attributes this module never
    defines, so ``from vllm.entrypoints.cli.benchmark import *`` (and any
    direct attribute access) would raise even though the submodules exist.
    The import cost is paid only on first access, keeping package import
    free of optional benchmark dependencies.

    Raises:
        AttributeError: if ``name`` is not one of the known subcommands.
    """
    try:
        module_name = _NAME_TO_MODULE[name]
    except KeyError:
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}"
        ) from None
    # Local import so merely importing this package stays side-effect free.
    from importlib import import_module

    return getattr(import_module(module_name), name)

View File

@@ -2,8 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import importlib
import logging
import typing
from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
@@ -15,30 +13,6 @@ if typing.TYPE_CHECKING:
else:
FlexibleArgumentParser = argparse.ArgumentParser
logger = logging.getLogger(__name__)
def _load_benchmark_subcommands() -> None:
    """Best-effort import of every benchmark subcommand module.

    Importing a submodule is what makes its subcommand class visible via
    ``BenchmarkSubcommandBase.__subclasses__()``; a module whose optional
    dependency is missing is skipped with a warning rather than aborting
    the whole CLI.
    """
    subcommand_modules = (
        "vllm.entrypoints.cli.benchmark.latency",
        "vllm.entrypoints.cli.benchmark.mm_processor",
        "vllm.entrypoints.cli.benchmark.serve",
        "vllm.entrypoints.cli.benchmark.startup",
        "vllm.entrypoints.cli.benchmark.sweep",
        "vllm.entrypoints.cli.benchmark.throughput",
    )
    for mod in subcommand_modules:
        try:
            importlib.import_module(mod)
        except ModuleNotFoundError as exc:
            # Benchmark extras are optional; one missing dependency should
            # not take down the other `vllm bench` subcommands.
            logger.warning(
                "Skipping benchmark subcommand module %s because an optional "
                "dependency could not be imported: %r",
                mod,
                exc,
            )
class BenchmarkSubcommand(CLISubcommand):
"""The `bench` subcommand for the vLLM CLI."""
@@ -64,8 +38,6 @@ class BenchmarkSubcommand(CLISubcommand):
)
bench_subparsers = bench_parser.add_subparsers(required=True, dest="bench_type")
_load_benchmark_subcommands()
for cmd_cls in BenchmarkSubcommandBase.__subclasses__():
cmd_subparser = bench_subparsers.add_parser(
cmd_cls.name,

View File

@@ -220,6 +220,12 @@ def run_multi_api_server(args: argparse.Namespace):
num_api_servers: int = args.api_server_count
assert num_api_servers > 0
if num_api_servers > 1 and getattr(args, "use_gpu_for_pooling_score", False):
# TODO(wentao): remove this once well tested
raise ValueError(
"--use-gpu-for-pooling-score cannot be used with api_server_count > 1 now"
)
if num_api_servers > 1:
setup_multiprocess_prometheus()
@@ -246,8 +252,12 @@ def run_multi_api_server(args: argparse.Namespace):
api_server_manager: APIServerProcessManager | None = None
from vllm.v1.engine.utils import get_engine_zmq_addresses
addresses = get_engine_zmq_addresses(vllm_config, num_api_servers)
with launch_core_engines(
vllm_config, executor_class, log_stats, num_api_servers
vllm_config, executor_class, log_stats, addresses, num_api_servers
) as (local_engine_manager, coordinator, addresses):
# Construct common args for the APIServerProcessManager up-front.
api_server_manager_kwargs = dict(