[Feat] Support routing replay (#6696)

### What this PR does / why we need it? [Feat] Support routing replay. This is the same change as https://github.com/vllm-project/vllm-ascend/pull/6666, resubmitted because of a DOC failure. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.15.0 - vLLM main: 9562912cea --------- Signed-off-by: liyongwen <1310439159@qq.com> Signed-off-by: Li-Yongwen <63399187+Li-Yongwen@users.noreply.github.com> Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2026-02-26 10:22:47 +08:00
parent a9cca0c5c4
commit 2870f7c8ad
7 changed files with 190 additions and 0 deletions
--- a/tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py
+++ b/tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py
@@ -0,0 +1,32 @@
+import os
+from unittest.mock import patch
+
+from tests.e2e.conftest import VllmRunner
+from vllm import SamplingParams
+from vllm.sampling_params import RequestOutputKind
+
+
+@patch.dict(os.environ, {"OMP_NUM_THREADS": "1"})
+def test_qwen3_moe_routing_replay():
+    """Smoke-test that generation returns routed-expert data when it is enabled."""
+    engine_kwargs = dict(
+        tensor_parallel_size=2,
+        enable_expert_parallel=True,
+        cudagraph_capture_sizes=[1, 2, 4, 8],
+        distributed_executor_backend="mp",
+        enable_return_routed_experts=True,
+    )
+    with VllmRunner("Qwen/Qwen3-30B-A3B", **engine_kwargs) as vllm_model:
+        sampling_params = SamplingParams(
+            max_tokens=5,
+            temperature=0.8,
+            top_p=0.95,
+            output_kind=RequestOutputKind.FINAL_ONLY,
+        )
+        inputs = vllm_model.get_inputs(prompts=["Hello, please introduce yourself."])
+        outputs = vllm_model.model.generate(prompts=inputs, sampling_params=sampling_params)
+        # Only one prompt is submitted, so the first result is the one to check.
+        first = outputs[0]
+        assert first.finished
+        assert len(first.outputs[0].text) > 0
+        assert first.outputs[0].routed_experts.size > 0