xc-llm-ascend/tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py

import os
from unittest.mock import patch

from tests.e2e.conftest import VllmRunner
from vllm import SamplingParams
from vllm.sampling_params import RequestOutputKind


@patch.dict(os.environ, {"OMP_NUM_THREADS": "1"})
def test_qwen3_moe_routing_replay():
    """Smoke-test that generation returns routed-expert data for Qwen3-MoE.

    Starts ``Qwen/Qwen3-30B-A3B`` with ``tensor_parallel_size=2``, expert
    parallelism and ``enable_return_routed_experts=True``, generates up to
    five tokens for a single prompt, and checks that the first request
    output is finished, has non-empty text, and carries a non-empty
    ``routed_experts`` payload.
    """
    prompts = ["Hello, please introduce yourself."]

    # Engine options are collected up front so the runner call stays short.
    runner_options = {
        "tensor_parallel_size": 2,
        "enable_expert_parallel": True,
        "cudagraph_capture_sizes": [1, 2, 4, 8],
        "distributed_executor_backend": "mp",
        "enable_return_routed_experts": True,
    }

    with VllmRunner("Qwen/Qwen3-30B-A3B", **runner_options) as vllm_model:
        # FINAL_ONLY: request a single, final output per request.
        sampling_params = SamplingParams(
            max_tokens=5,
            temperature=0.8,
            top_p=0.95,
            output_kind=RequestOutputKind.FINAL_ONLY,
        )
        inputs = vllm_model.get_inputs(prompts=prompts)
        outputs = vllm_model.model.generate(
            prompts=inputs,
            sampling_params=sampling_params,
        )

        request_output = outputs[0]
        assert request_output.finished

        completion = request_output.outputs[0]
        assert len(completion.text) > 0
        # Routing replay: the completion must expose the experts it was routed to.
        assert completion.routed_experts.size > 0
[Feat] Support routing replay (#6696) ### What this PR does / why we need it? [Feat] Support routing replay same as https://github.com/vllm-project/vllm-ascend/pull/6666 resubmit because of DOC failure ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/9562912cead1f11e8540fb91306c5cbda66f0007 --------- Signed-off-by: liyongwen <1310439159@qq.com> Signed-off-by: Li-Yongwen <63399187+Li-Yongwen@users.noreply.github.com> Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com> 2026-02-26 10:22:47 +08:00			`import os`
			`from unittest.mock import patch`

			`from tests.e2e.conftest import VllmRunner`
			`from vllm import SamplingParams`
			`from vllm.sampling_params import RequestOutputKind`


			`@patch.dict(os.environ, {"OMP_NUM_THREADS": "1"})`
			`def test_qwen3_moe_routing_replay():`
			`prompts = [`
			`"Hello, please introduce yourself.",`
			`]`
			`with VllmRunner(`
			`"Qwen/Qwen3-30B-A3B",`
			`tensor_parallel_size=2,`
			`enable_expert_parallel=True,`
			`cudagraph_capture_sizes=[1, 2, 4, 8],`
			`distributed_executor_backend="mp",`
			`enable_return_routed_experts=True,`
			`) as vllm_model:`
			`sampling_params = SamplingParams(`
			`max_tokens=5,`
			`temperature=0.8,`
			`top_p=0.95,`
			`output_kind=RequestOutputKind.FINAL_ONLY`
			`)`
			`inputs = vllm_model.get_inputs(prompts=prompts)`
			`outputs = vllm_model.model.generate(prompts=inputs, sampling_params=sampling_params)`
			`assert outputs[0].finished`
			`assert len(outputs[0].outputs[0].text) > 0`
			`assert outputs[0].outputs[0].routed_experts.size > 0`