From 382c29f3e1a3201c5bedd588f50fbe55dad2d919 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Sun, 14 Sep 2025 12:20:25 +0800 Subject: [PATCH] [BugFix] Fix world size bug in model_runner (#2915) - Fix world size bug in model_runner to make sure ep>=16 runs with MC2 - Enable e2e test for vl Co-Authored-By: whx-sjtu <2952154980@qq.com> Co-Authored-By: Icey <1790571317@qq.com> - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/3e903b6cb4292ca1425a37cb809c1e3cddfdadcb Signed-off-by: wangxiyuan --- tests/e2e/singlecard/test_vlm.py | 17 +++++++++++------ tests/ut/worker/test_model_runner_v1.py | 2 +- vllm_ascend/worker/model_runner_v1.py | 2 +- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/e2e/singlecard/test_vlm.py b/tests/e2e/singlecard/test_vlm.py index 5fe27f6..fb01abb 100644 --- a/tests/e2e/singlecard/test_vlm.py +++ b/tests/e2e/singlecard/test_vlm.py @@ -22,17 +22,16 @@ Run `pytest tests/test_offline_inference.py`. """ import os -import pytest from vllm import SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from tests.e2e.conftest import VllmRunner +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" -@pytest.mark.skip(reason="fix me") def test_multimodal_vl(prompt_template): image = ImageAsset("cherry_blossom") \ .pil_image.convert("RGB") @@ -52,9 +51,12 @@ def test_multimodal_vl(prompt_template): "fps": 1, }, enforce_eager=True) as vllm_model: - vllm_model.generate_greedy(prompts=prompts, - images=images, - max_tokens=64) + outputs = vllm_model.generate_greedy(prompts=prompts, + images=images, + max_tokens=64) + assert len(outputs) == len(prompts) + for _, output_str in outputs: + assert output_str, "Generated output should not be empty." 
def test_multimodal_audio(): @@ -86,4 +88,7 @@ def test_multimodal_audio(): dtype="bfloat16", limit_mm_per_prompt={"audio": 2}, gpu_memory_utilization=0.9) as runner: - runner.generate(inputs, sampling_params=sampling_params) + outputs = runner.generate(inputs, sampling_params=sampling_params) + + assert outputs is not None, "Generated outputs should not be None." + assert len(outputs) > 0, "Generated outputs should not be empty." diff --git a/tests/ut/worker/test_model_runner_v1.py b/tests/ut/worker/test_model_runner_v1.py index eb83d30..5f7ad90 100644 --- a/tests/ut/worker/test_model_runner_v1.py +++ b/tests/ut/worker/test_model_runner_v1.py @@ -57,7 +57,7 @@ def test_select_moe_comm_method(soc_version, enable_expert_parallel, mock_runner = MagicMock(spec=NPUModelRunner) mock_runner.parallel_config = MagicMock() mock_runner.parallel_config.enable_expert_parallel = enable_expert_parallel - mock_runner.parallel_config.world_size = world_size + mock_runner.parallel_config.world_size_across_dp = world_size mock_runner.mc2_tokens_capacity = mc2_tokens_capacity # Patch the helper functions diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index ab8f593..b4261fa 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1539,7 +1539,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): if not self.parallel_config.enable_expert_parallel: moe_comm_method = "allgather" elif soc_version in {AscendSocVersion.A2}: - if num_tokens <= self.mc2_tokens_capacity and self.parallel_config.world_size >= 16: + if num_tokens <= self.mc2_tokens_capacity and self.parallel_config.world_size_across_dp >= 16: moe_comm_method = "mc2" else: moe_comm_method = "allgather"