From cd1c69ee0bddc91dc73cf67d69379e798021a447 Mon Sep 17 00:00:00 2001 From: Yizhou <136800916+yiz-liu@users.noreply.github.com> Date: Tue, 9 Dec 2025 21:37:38 +0800 Subject: [PATCH] [Fix] Add extra warmup run count for MC2 on specific SoC version (#4843) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? We didn’t account for this earlier because we didn’t have A3 in CI, but now that we do, this test case needs a few extra tweaks — please take a look at `profile_run`. Signed-off-by: Yizhou Liu Co-authored-by: Mengqing Cao --- tests/e2e/multicard/test_aclgraph_capture_replay.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/e2e/multicard/test_aclgraph_capture_replay.py b/tests/e2e/multicard/test_aclgraph_capture_replay.py index ff4777a1..4375e825 100644 --- a/tests/e2e/multicard/test_aclgraph_capture_replay.py +++ b/tests/e2e/multicard/test_aclgraph_capture_replay.py @@ -25,6 +25,8 @@ import pytest import torch from vllm.utils.network_utils import get_open_port +from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type + MODELS = [ "Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8", @@ -212,6 +214,10 @@ def test_aclgraph_capture_replay_dp2( # Part A: Warmup runs (Profile run + 2 runs per captured graph) warmup_runs = 1 + (2 * max_batch_sizes) + soc_version = get_ascend_device_type() + if soc_version in {AscendDeviceType._910_93} and "DeepSeek" in model: + # An extra warmup run is needed for MC2 warmup here + warmup_runs += 1 # Part B: Alignment padding (Empty runs to hit the 32-step boundary) padding_runs = aligned_steps - total_steps