xc-llm-ascend/tests/e2e/multicard/2-cards/test_qwen3_performance.py

# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.vllm_bench import run_vllm_bench_case

# Model checkpoints this performance test is parametrized over.
MODELS: list[str] = ["Qwen/Qwen3-8B"]

# Prompt batch sent in the single sanity-check completion request.
prompts: list[str] = ["San Francisco is a"]

# Extra keyword arguments forwarded to the sanity-check completion request.
api_keyword_args: dict[str, Any] = {"max_tokens": 10}

# Benchmark settings handed to run_vllm_bench_case.
vllm_bench_cases: dict[str, Any] = {
    "dataset-name": "random",
    "num_prompts": 500,
    "request_rate": 20,
    "random_input_len": 128,
    "max_concurrency": 40,
    "random_output_len": 100,
    # Set explicitly: upstream vLLM removed the default temperature value.
    "temperature": 0.0,
}

# NOTE: Any change to the baseline throughput must be approved by team members.
# The original baseline was 1600.0; it was lowered to 1514.0 after the measured
# throughput dropped for reasons that are not yet understood.
baseline_throughput: float = 1514.0  # Qwen3-8B baseline, measured with num_prompts=500


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.asyncio
async def test_models(model: str) -> None:
    """Smoke-test a served model, then run the vLLM benchmark case against it.

    A ``RemoteOpenAIServer`` is started for ``model`` on a freshly picked port.
    One completion request is sent for ``prompts`` and the first returned
    choice must contain text. While the server context is still open,
    ``run_vllm_bench_case`` is called with ``vllm_bench_cases`` and
    ``baseline_throughput``.

    Args:
        model: Model name used to launch the server and to address requests.
    """
    port = get_open_port()

    # JSON payloads are passed to the server command line as literal strings.
    compilation_config = '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
    additional_config = '{"pa_shape_list":[48,64,72,80],"weight_prefetch_config":{"enabled":true}}'

    # Build the command line one option at a time; the order matches the
    # sequence the server has always been launched with.
    server_args = ["--async-scheduling"]
    server_args += ["--distributed-executor-backend", "mp"]
    server_args += ["--tensor-parallel-size", "1"]
    server_args += ["--port", str(port)]
    server_args += ["--max-model-len", "5500"]
    server_args += ["--max-num-batched-tokens", "40960"]
    server_args += ["--compilation-config", compilation_config]
    server_args += ["--additional-config", additional_config]
    server_args += ["--block-size", "128"]
    server_args += ["--trust-remote-code"]
    server_args += ["--gpu-memory-utilization", "0.9"]

    # Environment variables supplied to the server process.
    server_env = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"}

    # Work on a copy so the module-level defaults are never mutated.
    request_kwargs: dict[str, Any] = dict(api_keyword_args)

    with RemoteOpenAIServer(
        model,
        server_args,
        server_port=port,
        env_dict=server_env,
        auto_port=False,
    ) as server:
        completion = await server.get_async_client().completions.create(
            model=model,
            prompt=prompts,
            **request_kwargs,
        )
        first_choice: openai.types.CompletionChoice = completion.choices[0]
        assert first_choice.text, "empty response"

        # vllm bench test: target the same port while the server context is open.
        run_vllm_bench_case(model, port, vllm_bench_cases, baseline_throughput)
[Tests] Add qwen3-8b nightly test (#5597) ### What this PR does / why we need it? Add qwen3-8b nightly test - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/7157596103666ee7ccb7008acee8bff8a8ff1731 --------- Signed-off-by: wxsIcey <1790571317@qq.com> 2026-01-07 18:42:05 +08:00			`# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.`
			`# Copyright 2023 The vLLM team.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`# This file is a part of the vllm-ascend project.`
			`#`
			`from typing import Any`

			`import openai`
			`import pytest`
			`from vllm.utils.network_utils import get_open_port`

			`from tests.e2e.conftest import RemoteOpenAIServer`
			`from tools.vllm_bench import run_vllm_bench_case`

			`MODELS = [`
			`"Qwen/Qwen3-8B",`
			`]`

			`prompts = [`
			`"San Francisco is a",`
			`]`

			`api_keyword_args = {`
			`"max_tokens": 10,`
			`}`

			`vllm_bench_cases = {`
			`"dataset-name": "random",`
[Tests] move qwen3 performance test from nightly to e2e (#5980) ### What this PR does / why we need it? Move the qwen3 performance test from nightly to e2e to intercept performance degradation. - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2c24bc6996cb165fce92f780b388a5e39b3f4060 --------- Signed-off-by: wxsIcey <1790571317@qq.com> 2026-01-20 17:08:43 +08:00			`"num_prompts": 500,`
[Tests] Add qwen3-8b nightly test (#5597) ### What this PR does / why we need it? Add qwen3-8b nightly test - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/7157596103666ee7ccb7008acee8bff8a8ff1731 --------- Signed-off-by: wxsIcey <1790571317@qq.com> 2026-01-07 18:42:05 +08:00			`"request_rate": 20,`
			`"random_input_len": 128,`
			`"max_concurrency": 40,`
			`"random_output_len": 100,`
[Main2Main] Upgrade vllm commit to 0123 (#6169) ### What this PR does / why we need it? 1. ✅ Upgrade vllm commit to: 0115 (8471b27df97c3eb79f891802fc0e858f8f7ac6a0) Modify import paths due to the refactors： https://github.com/vllm-project/vllm/pull/32245 https://github.com/vllm-project/vllm/pull/32060 Test result: https://github.com/vllm-project/vllm-ascend/actions/runs/21034239336/job/60490156965?pr=5913 2. ✅Upgrade vllm commit to: 0119 (9a1f16da1e423ede2c2f52a9850cbfbb39cefe96) Fix `WorkerProc.__init__() missing 1 required positional argument: 'is_driver_worker'` due to https://github.com/vllm-project/vllm/pull/28506 Test result: https://github.com/vllm-project/vllm-ascend/actions/runs/21156263050/job/60841668755?5569 3. ✅Upgrade vllm commit to: 0120(148117ea2e689cd43df4be6892671a17cdae5833) 1. Add `skip_compiled` param in `set_forward_context` due to https://github.com/vllm-project/vllm/pull/30385 2. Modify `tests/ut/spec_decode/test_eagle_proposer.py` due to https://github.com/vllm-project/vllm/pull/24322 change `self.max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens + max_batch_size` 3. Modify UT import paths due to the refactors：https://github.com/vllm-project/vllm/pull/32060 Test result: https://github.com/vllm-project/vllm-ascend/actions/runs/21204851770/job/60999046946 4. ✅Upgrade vllm commit to: 0121(f23fb5a7c1b61350c5c40ca1115d3bf8cf2b8cc9) 1. vLLM switched `uses_mrope` from target to draft model config, making `positions`/`mrope_positions` mutually exclusive, breaking vllm-ascend's direct self.positions access and tests missing `draft_model_config.uses_mrope`. https://github.com/vllm-project/vllm/pull/32048 2. Moved bs_to_padded_graph_size from CompilationConfig to CudagraphDispatcher due to the refactor https://github.com/vllm-project/vllm/pull/30143 3. 
Remove unused `maybe_setup_kv_connector` due to https://github.com/vllm-project/vllm/pull/32077 Test result: https://github.com/vllm-project/vllm-ascend/actions/runs/21217728738/job/61043738834 6. ✅Upgrade vllm commit to: 0122(8ebf271bb6d1e7e9b1a55be73d755ef1a57dbbe5) Updating FusedMoEParallelConfig (added enable_eplb) and FusedMoEConfig due to https://github.com/vllm-project/vllm/pull/32414 Test result: https://github.com/vllm-project/vllm-ascend/actions/runs/21249922546/job/61148613054 8. ✅Upgrade vllm commit to: 0123(dc917cceb877dfd13f98c538c4c96158047d98bd) Setting temperature=0.0 due to the removal of the default temperature value in https://github.com/vllm-project/vllm/pull/32723 Test result: https://github.com/vllm-project/vllm-ascend/actions/runs/21280796875 ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.14.0 - vLLM main: https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60 --------- Signed-off-by: wjunLu <wjunlu217@gmail.com> Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com> Co-authored-by: wjunLu <wjunlu217@gmail.com> 2026-01-27 08:44:36 +08:00			`"temperature": 0.0,`
[Tests] Add qwen3-8b nightly test (#5597) ### What this PR does / why we need it? Add qwen3-8b nightly test - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/7157596103666ee7ccb7008acee8bff8a8ff1731 --------- Signed-off-by: wxsIcey <1790571317@qq.com> 2026-01-07 18:42:05 +08:00			`}`

[CI] Decrease Qwen3 dense model output throughput baseline to make ci happy (#6233) ### What this PR does / why we need it? As https://github.com/vllm-project/vllm-ascend/actions/runs/21327913593/job/61388195448 shows, I encountered two CI failures. The results consistently pointed to the same reduced throughput: 1600 -> 1514 - vLLM version: v0.14.1 - vLLM main: https://github.com/vllm-project/vllm/commit/d68209402ddab3f54a09bc1f4de9a9495a283b60 --------- Signed-off-by: wangli <wangli858794774@gmail.com> 2026-01-26 09:04:13 +08:00			`# NOTE: Any changes for the baseline throughput should be approved by team members.`
			`# The origin baseline: 1600.0. For some uncertain reasons, the throughput is decreased to 1514.0`
			`baseline_throughput = 1514.0 # baseline throughput for Qwen3-8B, measured with num_prompts=500`
[Tests] Add qwen3-8b nightly test (#5597) ### What this PR does / why we need it? Add qwen3-8b nightly test - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/7157596103666ee7ccb7008acee8bff8a8ff1731 --------- Signed-off-by: wxsIcey <1790571317@qq.com> 2026-01-07 18:42:05 +08:00

			`@pytest.mark.parametrize("model", MODELS)`
			`@pytest.mark.asyncio`
			`async def test_models(model: str) -> None:`
			`port = get_open_port()`
			`env_dict = {`
			`"TASK_QUEUE_ENABLE": "1",`
			`"HCCL_OP_EXPANSION_MODE": "AIV",`
			`}`
			`server_args = [`
			`"--async-scheduling",`
			`"--distributed-executor-backend",`
			`"mp",`
			`"--tensor-parallel-size",`
			`"1",`
			`"--port",`
			`str(port),`
			`"--max-model-len",`
			`"5500",`
			`"--max-num-batched-tokens",`
			`"40960",`
			`"--compilation-config",`
			`'{"cudagraph_mode": "FULL_DECODE_ONLY"}',`
			`"--additional-config",`
[Refactor] MLP weight prefetch for consistency with MoE Model's prefetching in terms of code and usage (#6442) ### What this PR does / why we need it? Refactor MLP weight prefetch for consistency with MoE Model's prefetching in terms of code and usage. The environment variables VLLM_ASCEND_ENABLE_PREFETCH_MLP, VLLM_ASCEND_MLP_DOWN_PREFETCH_SIZE and VLLM_ASCEND_MLP_GATE_UP_PREFETCH_SIZE are removed; usage is now as follows: --additional-config '{"weight_prefetch_config": { "enabled": true, "prefetch_ratio": {"mlp": { "gate_up": 1.0, "down": 1.0} }}}' ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.14.1 - vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd --------- Signed-off-by: leo-pony <nengjunma@outlook.com> 2026-02-04 09:08:18 +08:00			`'{"pa_shape_list":[48,64,72,80],"weight_prefetch_config":{"enabled":true}}',`
[Tests] Add qwen3-8b nightly test (#5597) ### What this PR does / why we need it? Add qwen3-8b nightly test - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/7157596103666ee7ccb7008acee8bff8a8ff1731 --------- Signed-off-by: wxsIcey <1790571317@qq.com> 2026-01-07 18:42:05 +08:00			`"--block-size",`
			`"128",`
			`"--trust-remote-code",`
			`"--gpu-memory-utilization",`
			`"0.9",`
			`]`

			`request_keyword_args: dict[str, Any] = {`
			`**api_keyword_args,`
			`}`
[Lint]Style: Convert `test/` to ruff format(Batch #1) (#6738) ### What this PR does / why we need it? Scope of Changes: \| File Path \| \| :--- \| \| `tests/e2e/310p/multicard/test_vl_model_multicard.py` \| \| `tests/e2e/310p/singlecard/test_vl_model_singlecard.py` \| \| `tests/e2e/310p/test_utils.py` \| \| `tests/e2e/conftest.py` \| \| `tests/e2e/model_utils.py` \| \| `tests/e2e/models/conftest.py` \| \| `tests/e2e/models/test_lm_eval_correctness.py` \| \| `tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py` \| \| `tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py` \| \| `tests/e2e/multicard/2-cards/test_data_parallel.py` \| \| `tests/e2e/multicard/2-cards/test_disaggregated_encoder.py` \| \| `tests/e2e/multicard/2-cards/test_expert_parallel.py` \| \| `tests/e2e/multicard/2-cards/test_external_launcher.py` \| \| `tests/e2e/multicard/2-cards/test_full_graph_mode.py` \| \| `tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py` \| \| `tests/e2e/multicard/2-cards/test_offline_inference_distributed.py` \| \| `tests/e2e/multicard/2-cards/test_offline_weight_load.py` \| \| `tests/e2e/multicard/2-cards/test_pipeline_parallel.py` \| \| `tests/e2e/multicard/2-cards/test_prefix_caching.py` \| \| `tests/e2e/multicard/2-cards/test_quantization.py` \| \| `tests/e2e/multicard/2-cards/test_qwen3_moe.py` \| \| `tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py` \| \| `tests/e2e/multicard/2-cards/test_qwen3_performance.py` \| \| `tests/e2e/multicard/2-cards/test_shared_expert_dp.py` \| \| `tests/e2e/multicard/2-cards/test_single_request_aclgraph.py` \| \| `tests/e2e/multicard/2-cards/test_sp_pass.py` \| ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? 
- vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/9562912cead1f11e8540fb91306c5cbda66f0007 Signed-off-by: MrZ20 <2609716663@qq.com> Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com> 2026-03-10 09:52:50 +08:00			`with RemoteOpenAIServer(model, server_args, server_port=port, env_dict=env_dict, auto_port=False) as server:`
[Tests] Add qwen3-8b nightly test (#5597) ### What this PR does / why we need it? Add qwen3-8b nightly test - vLLM version: v0.13.0 - vLLM main: https://github.com/vllm-project/vllm/commit/7157596103666ee7ccb7008acee8bff8a8ff1731 --------- Signed-off-by: wxsIcey <1790571317@qq.com> 2026-01-07 18:42:05 +08:00			`client = server.get_async_client()`
			`batch = await client.completions.create(`
			`model=model,`
			`prompt=prompts,`
			`**request_keyword_args,`
			`)`
			`choices: list[openai.types.CompletionChoice] = batch.choices`
			`assert choices[0].text, "empty response"`
			`# vllm bench test`
			`run_vllm_bench_case(model, port, vllm_bench_cases, baseline_throughput)`