[CI] cleanup single/multi-card test (#5623)

1. speed up e2e light test. 2. create `2-cards` and `4-cards` folder in multicard 3. move ops to nightly 4. run test in Alphabetical Order - vLLM version: v0.13.0 - vLLM main: 8be6432bda Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2026-01-07 14:13:34 +08:00
parent 1afbc01ed4
commit 6f7a81cd9f
30 changed files with 114 additions and 117 deletions
--- a/tests/e2e/multicard/2-cards/test_expert_parallel.py
+++ b/tests/e2e/multicard/2-cards/test_expert_parallel.py
@@ -0,0 +1,34 @@
+import pytest
+
+from tests.e2e.conftest import VllmRunner
+from tests.e2e.model_utils import check_outputs_equal
+
+
+@pytest.mark.parametrize("model_name", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
+def test_deepseek_correctness_ep(model_name):
+    example_prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    max_tokens = 5
+
+    # FIXME: Really strange that chunked prefill might lead to different results, investigate further
+    with VllmRunner(model_name,
+                    cudagraph_capture_sizes=[1, 2, 4, 8],
+                    tensor_parallel_size=2) as vllm_model:
+        tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    with VllmRunner(model_name,
+                    tensor_parallel_size=2,
+                    cudagraph_capture_sizes=[1, 2, 4, 8],
+                    enable_expert_parallel=True) as vllm_model:
+        ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=ep_output,
+        outputs_1_lst=tp_output,
+        name_0="ep_output",
+        name_1="tp_output",
+    )