[CI] cleanup single/multi-card test (#5623)

1. Speed up the e2e light test.
2. Create `2-cards` and `4-cards` folders under the multicard tests.
3. Move ops tests to the nightly run.
4. Run tests in alphabetical order (one possible ordering hook is sketched below).
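A minimal sketch of one way to enforce alphabetical ordering via a standard pytest `conftest.py` hook; whether this PR enforces the order this way is not shown in the diff:

```python
# Hypothetical conftest.py snippet (illustrative, not taken from this PR).
def pytest_collection_modifyitems(session, config, items):
    # Sort collected test items by node id (file path + test name) so the
    # suite runs in alphabetical order.
    items.sort(key=lambda item: item.nodeid)
```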

- vLLM version: v0.13.0
- vLLM main:
8be6432bda

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2026-01-07 14:13:34 +08:00
committed by GitHub
parent 1afbc01ed4
commit 6f7a81cd9f
30 changed files with 114 additions and 117 deletions

View File

@@ -0,0 +1,240 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import gc
import math
import multiprocessing
import os
from typing import Any
from unittest.mock import patch
import pytest
import torch
from vllm.utils.network_utils import get_open_port
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
MODELS = [
# Offline data parallel mode is not supported/useful for dense models
# "Qwen/Qwen3-0.6B",
"vllm-ascend/DeepSeek-V2-Lite-W8A8",
]
def _install_spies(counters: dict[str, Any]) -> contextlib.ExitStack:
"""Installs thread-safe spies on NPU methods to track invocation counts."""
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
def make_spy(cls, method_name, counter):
original = getattr(cls, method_name)
def spy(self, *args, **kwargs):
with counter.get_lock():
counter.value += 1
return original(self, *args, **kwargs)
return spy
stack = contextlib.ExitStack()
hooks = [
(torch.npu.NPUGraph, "replay", counters["replay"]),
(torch.npu.NPUGraph, "__init__", counters["capture"]),
(NPUModelRunner, "execute_model", counters["exec_model"]),
(NPUModelRunner, "_dummy_run", counters["dummy_run"]),
]
for cls, method, counter in hooks:
stack.enter_context(
patch.object(cls, method, make_spy(cls, method, counter)))
return stack
def _run_worker_process(
rank: int,
local_rank: int,
world_size: int,
master_ip: str,
master_port: int,
counters: dict[str, Any],
model_path: str,
max_tokens: int,
):
"""Main entry point for the worker process."""
os.environ.update({
"VLLM_DP_RANK": str(rank),
"VLLM_DP_RANK_LOCAL": str(local_rank),
"VLLM_DP_SIZE": str(world_size),
"VLLM_DP_MASTER_IP": master_ip,
"VLLM_DP_MASTER_PORT": str(master_port),
})
# Import vLLM only after environment setup
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import (
destroy_distributed_environment, destroy_model_parallel)
# Apply hooks and run inference
with _install_spies(counters):
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Simple data sharding
chunk_size = len(prompts) // world_size
start_idx = rank * chunk_size
end_idx = start_idx + chunk_size if rank < world_size - 1 else len(
prompts)
local_prompts = prompts[start_idx:end_idx]
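# Worked example (illustrative): with 4 prompts and world_size=2,
# rank 0 handles prompts[0:2] and rank 1 handles prompts[2:4].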
llm = LLM(
model=model_path,
quantization="ascend" if "W8A8" in model_path else None,
enable_expert_parallel=True if "DeepSeek" in model_path else False,
trust_remote_code=True,
# vLLM enables async scheduling by default; remove the line below once vLLM >= 0.14.0.
async_scheduling=False,
)
# Expose model config to the main test process
counters["hidden_layers"].value = (
llm.llm_engine.model_config.hf_text_config.num_hidden_layers)
llm.generate(local_prompts,
SamplingParams(max_tokens=max_tokens, temperature=0.0))
# Explicit cleanup is mandatory in multi-process vLLM tests
del llm
destroy_model_parallel()
destroy_distributed_environment()
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()
# @patch.dict(os.environ, clear=["HCCL_OP_EXPANSION_MODE","VLLM_WORKER_MULTIPROC_METHOD"])
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [4, 36])
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
def test_models_aclgraph_capture_replay_metrics_dp2(
model: str,
max_tokens: int,
monkeypatch: pytest.MonkeyPatch,
) -> None:
# Counter doesn't work in default "spawn" mode
monkeypatch.delenv("VLLM_WORKER_MULTIPROC_METHOD", raising=False)
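# (Presumably, with "spawn" the worker processes re-import the modules and
# never see the spies installed by _install_spies, so the shared counters
# would stay at zero.)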
# Shared counters for cross-process assertion
counters = {
"replay": multiprocessing.Value("i", 0),
"capture": multiprocessing.Value("i", 0),
"exec_model": multiprocessing.Value("i", 0),
"dummy_run": multiprocessing.Value("i", 0),
"hidden_layers": multiprocessing.Value("i", -1),
}
dp_size = 2
port = get_open_port()
# Launch workers
workers = []
for rank in range(dp_size):
p = multiprocessing.Process(
target=_run_worker_process,
args=(rank, rank, dp_size, "127.0.0.1", port, counters, model,
max_tokens),
)
p.start()
workers.append(p)
# Supervision loop
for p in workers:
p.join(timeout=900)
if p.exitcode != 0:
for k in workers:
if k.is_alive():
k.kill()
raise RuntimeError(
f"Worker {p.pid} failed with exit code {p.exitcode}")
actual_capture = counters["capture"].value
actual_replay = counters["replay"].value
num_execute_model = counters["exec_model"].value
num_dummy_run = counters["dummy_run"].value
num_layers = counters["hidden_layers"].value
num_acl_graphs = num_layers + 1
num_comm_groups = sum(1 for s in [dp_size, 1]
if s > 1) # dp_size=2, tp_size=1
# Metric 1: Graph Capture (ACL Graph Construction)
# Ref: vllm_ascend.utils.update_aclgraph_sizes
max_batch_sizes = math.floor((1800 - num_comm_groups * 40) /
num_acl_graphs / (1 + num_comm_groups * 2))
expected_capture = max_batch_sizes * num_acl_graphs * dp_size
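# Illustrative numbers (assuming num_layers == 27 for DeepSeek-V2-Lite):
# num_acl_graphs = 28, num_comm_groups = 1,
# max_batch_sizes = floor(1760 / 28 / 3) = 20, so
# expected_capture = 20 * 28 * 2 = 1120.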
assert (
actual_capture == expected_capture
), f"Capture count mismatch. Expected: {expected_capture}, Got: {actual_capture}"
# Metric 2: Model Execution (NPUModelRunner.execute_model)
# vLLM Step Breakdown:
# 1. First step (prefill, 1 prompt)
# 2. Generation steps (max_tokens)
# 3. Final step (likely EOS/idle step), no replay here
total_steps = max_tokens + 1 # this includes the 1 and 2 above
expected_exec_model = (total_steps + 1) * dp_size
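# Illustrative numbers: with max_tokens=4, total_steps = 5 and
# expected_exec_model = (5 + 1) * 2 = 12.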
assert (
num_execute_model == expected_exec_model
), f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
# Metric 3: Dummy Runs (Warmup & Alignment)
# vLLM synchronizes globally every 32 steps.
# Ref: vllm.v1.engine.core.DPEngineCoreProc._has_global_unfinished_reqs
aligned_steps = (total_steps + 31) // 32 * 32
# Part A: Warmup runs (Profile run + 2 runs per captured graph)
warmup_runs = 1 + (2 * max_batch_sizes)
soc_version = get_ascend_device_type()
if soc_version in {AscendDeviceType.A3} and "DeepSeek" in model:
# An extra warmup run is needed for MC2 warmup here
warmup_runs += 1
# Part B: Alignment padding (Empty runs to hit the 32-step boundary)
padding_runs = aligned_steps - total_steps
expected_dummy_run = (warmup_runs + padding_runs) * dp_size
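# Illustrative numbers: with max_tokens=4 and max_batch_sizes=20,
# total_steps = 5, aligned_steps = 32, padding_runs = 27 and
# warmup_runs = 1 + 2 * 20 = 41 (42 on A3 with DeepSeek), so
# expected_dummy_run = (41 + 27) * 2 = 136.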
assert (
num_dummy_run == expected_dummy_run
), f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
# Metric 4: Graph Replay (Inference Execution)
# Replays happen for every aligned step across all graphs.
expected_replay = num_acl_graphs * aligned_steps * dp_size
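# Illustrative numbers: with num_acl_graphs = 28 and aligned_steps = 32,
# expected_replay = 28 * 32 * 2 = 1792.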
assert (
actual_replay == expected_replay
), f"Replay count mismatch. Expected: {expected_replay}, Got: {actual_replay}"

View File

@@ -0,0 +1,79 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Run data parallel (DP2) inference end-to-end via examples/offline_data_parallel.py.
Run `pytest tests/multicard/test_data_parallel.py`.
"""
import os
import subprocess
import sys
from unittest.mock import patch
import pytest
MODELS = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
def test_qwen3_inference_dp2(model, max_tokens):
moe_models = ["Qwen/Qwen3-30B-A3B", "vllm-ascend/Qwen3-30B-A3B-W8A8"]
quantization_models = ["vllm-ascend/Qwen3-30B-A3B-W8A8"]
script = "examples/offline_data_parallel.py"
env = os.environ.copy()
cmd = [
sys.executable,
script,
"--model",
model,
"--dp-size",
"2",
"--tp-size",
"1",
"--node-size",
"1",
"--node-rank",
"0",
"--trust-remote-code",
]
if model in moe_models:
cmd.append("--enable-expert-parallel")
if model in quantization_models:
cmd.append("--quantization")
cmd.append("ascend")
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600)
output = proc.stdout.decode(errors='ignore')
print(output)
assert "DP rank 0 needs to process" in output
assert "DP rank 1 needs to process" in output
assert "Generated text:" in output
assert proc.returncode == 0

View File

@@ -0,0 +1,34 @@
import pytest
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
@pytest.mark.parametrize("model_name", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
def test_deepseek_correctness_ep(model_name):
example_prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
max_tokens = 5
    # FIXME: Strangely, chunked prefill can lead to different results; investigate further.
with VllmRunner(model_name,
cudagraph_capture_sizes=[1, 2, 4, 8],
tensor_parallel_size=2) as vllm_model:
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model_name,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
enable_expert_parallel=True) as vllm_model:
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=ep_output,
outputs_1_lst=tp_output,
name_0="ep_output",
name_1="tp_output",
)

View File

@@ -0,0 +1,239 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Run the external launcher example (examples/offline_external_launcher.py) end-to-end.
Run `pytest tests/multicard/test_external_launcher.py`.
"""
import os
import subprocess
import sys
from pathlib import Path
from unittest.mock import patch
import pytest
import torch_npu
from modelscope import snapshot_download # type: ignore
MODELS = ["Qwen/Qwen3-0.6B"]
MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "500"})
def test_qwen3_external_launcher(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
sys.executable,
str(script),
"--model",
model,
"--tp-size",
"1",
"--node-size",
"1",
"--node-rank",
"0",
"--proc-per-node",
"2",
"--trust-remote-code",
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
print(output)
assert "TP RANKS: [0]" in output
assert "TP RANKS: [1]" in output
assert "Generated text:" in output
assert proc.returncode == 0
@pytest.mark.parametrize("model", MOE_MODELS)
def test_qwen3_moe_external_launcher_ep_tp2(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
sys.executable,
str(script), "--model", model, "--tp-size", "2", "--node-size", "1",
"--node-rank", "0", "--proc-per-node", "2", "--trust-remote-code",
"--enable-expert-parallel"
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
print(output)
assert "TP RANKS: [0, 1]" in output
assert "Generated text:" in output
assert proc.returncode == 0
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_external_launcher_with_sleepmode():
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
sys.executable,
str(script),
"--model",
"Qwen/Qwen3-8B",
"--tp-size",
"1",
"--node-size",
"1",
"--node-rank",
"0",
"--proc-per-node",
"2",
"--trust-remote-code",
"--enable-sleep-mode",
"--temperature",
"0",
"--model-weight-gib",
"16",
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=300,
)
output = proc.stdout.decode(errors='ignore')
print(output)
assert "Generated text:" in output
assert "Sleep and wake up successfully!!" in output
assert proc.returncode == 0
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_external_launcher_with_sleepmode_level2():
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
model_path = snapshot_download("Qwen/Qwen3-8B")
# TODO: Add moe model test
cmd = [
sys.executable,
str(script),
"--model",
model_path,
"--tp-size",
"1",
"--node-size",
"1",
"--node-rank",
"0",
"--proc-per-node",
"2",
"--trust-remote-code",
"--enable-sleep-mode",
"--temperature",
"0",
"--model-weight-gib",
"16",
"--sleep-mode-level",
"2",
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=300,
)
output = proc.stdout.decode(errors='ignore')
print(output)
assert "Generated text:" in output
assert "Sleep and wake up successfully!!" in output
assert proc.returncode == 0
@pytest.mark.skipif(
DEVICE_NAME != "Ascend910B",
reason="This test is only for Ascend910B devices.",
)
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {
"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1",
"HCCL_BUFFSIZE": "500"
})
def test_qwen3_external_launcher_with_matmul_allreduce(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
cmd = [
sys.executable,
str(script),
"--model",
model,
"--trust-remote-code",
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
print(output)
assert "Generated text:" in output
assert proc.returncode == 0

View File

@@ -0,0 +1,114 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
import os
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
def test_qwen3_moe_full_decode_only_tp2():
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
]
model = "Qwen/Qwen3-30B-A3B"
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
}) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_fullgraph_outputs_list = []
for output in vllm_fullgraph_outputs:
vllm_fullgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
outputs_1_lst=vllm_fullgraph_outputs_list,
name_0="vllm_eager_outputs",
name_1="vllm_fullgraph_outputs",
)
def test_qwen3_moe_full_graph_tp2():
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
]
model = "Qwen/Qwen3-30B-A3B"
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(model,
max_model_len=1024,
tensor_parallel_size=2,
compilation_config={
"cudagraph_mode": "FULL",
"cudagraph_capture_sizes": [4, 8, 24, 48, 60]
}) as runner:
vllm_fullgraph_outputs = runner.model.generate(prompts,
sampling_params)
with VllmRunner(
model,
max_model_len=1024,
cudagraph_capture_sizes=[4, 8, 24, 48, 60],
tensor_parallel_size=2,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_fullgraph_outputs_list = []
for output in vllm_fullgraph_outputs:
vllm_fullgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
outputs_1_lst=vllm_fullgraph_outputs_list,
name_0="vllm_eager_outputs",
name_1="vllm_fullgraph_outputs",
)

View File

@@ -0,0 +1,25 @@
import pytest
from modelscope import snapshot_download # type: ignore
from tests.e2e.conftest import VllmRunner
from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
MODEL_PATH, do_sample)
@pytest.mark.parametrize("distributed_executor_backend", ["mp"])
def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
with VllmRunner(
snapshot_download(MODEL_PATH),
enable_lora=True,
max_loras=4,
dtype="half",
max_model_len=1024,
max_num_seqs=16,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)
for i in range(len(EXPECTED_LORA_OUTPUT)):
assert output[i] == EXPECTED_LORA_OUTPUT[i]

View File

@@ -0,0 +1,214 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
"""Compare the short outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/test_offline_inference.py`.
"""
import os
from unittest.mock import patch
import pytest
from modelscope import snapshot_download # type: ignore
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
QWEN_DENSE_MODELS = [
"vllm-ascend/Qwen3-0.6B-W8A8",
]
QWEN_W4A8_MODELS = [
"vllm-ascend/Qwen3-1.7B-W4A8-V1",
]
DEEPSEEK_W4A8_MODELS = [
"vllm-ascend/DeepSeek-V3.1-W4A8-puring",
]
def test_deepseek_multistream_moe_tp2():
example_prompts = [
"Hello, my name is",
]
dtype = "half"
max_tokens = 5
with VllmRunner(
"vllm-ascend/DeepSeek-V3-Pruning",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
additional_config={
"enable_multistream_moe": True,
"refresh": True,
},
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@pytest.mark.parametrize("model", QWEN_W4A8_MODELS)
def test_qwen3_w4a8_dynamic_tp2(model):
prompts = [
"Hello, my name is",
]
max_tokens = 5
with VllmRunner(
snapshot_download(model),
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(prompts, max_tokens)
def test_qwen3_moe_sp_tp2() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
compilation_config={"pass_config": {
"enable_sp": True
}},
enable_expert_parallel=True,
enforce_eager=True) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
def test_deepseek_w4a8_accuracy_tp2(model):
prompts = [
"Hello, my name is", "The president of the United States is",
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs"
]
vllm_ds_w4a8_answers = [
'逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
]
sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
with VllmRunner(snapshot_download(model),
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
enable_expert_parallel=True) as vllm_model:
vllm_quant_outputs = vllm_model.model.generate(prompts,
sampling_params)
vllm_quant_outputs_list = []
for output in vllm_quant_outputs:
vllm_quant_outputs_list.append(
([output.outputs[0].index], output.outputs[0].text))
vllm_answer_list = [([0], answer) for answer in vllm_ds_w4a8_answers]
check_outputs_equal(outputs_0_lst=vllm_answer_list,
outputs_1_lst=vllm_quant_outputs_list,
name_0="vllm_answer_outputs",
name_1="vllm_quant_outputs")
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "1"})
def test_qwen3_moe_fc2_tp2() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
def test_deepseek_v2_lite_fc1_tp2() -> None:
example_prompts = [
"test" * 1001,
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"),
dtype="auto",
tensor_parallel_size=2,
distributed_executor_backend="mp",
enable_expert_parallel=True,
enforce_eager=True,
quantization="ascend") as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
def test_qwen3_dense_fc1_tp2(model):
example_prompts = [
"Hello, my name is",
]
max_tokens = 5
with VllmRunner(
snapshot_download(model),
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"})
def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
example_prompts = [
"Hello, my name is",
]
max_tokens = 5
with VllmRunner(
snapshot_download(model),
max_model_len=8192,
dtype="auto",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@@ -0,0 +1,74 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Run `pytest tests/multicard/test_offline_load_weight.py`.
"""
import os
import subprocess
import sys
from pathlib import Path
from unittest.mock import patch
import pytest
MODELS = ["Qwen/Qwen3-30B-A3B"]
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_NZ": "0"})
def test_qwen3_offline_load_and_sleepmode_tp2(model):
script = Path(
__file__
).parent.parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
env = os.environ.copy()
cmd = [
sys.executable,
str(script),
"--model",
model,
"--tp-size",
"2",
"--node-size",
"1",
"--node-rank",
"0",
"--proc-per-node",
"2",
"--trust-remote-code",
"--enable-sleep-mode",
"--temperature",
"0",
"--model-weight-gib",
"0.8",
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode(errors='ignore')
print(output)
assert "Generated text:" in output
assert "Sleep and wake up successfully!!" in output
assert proc.returncode == 0

View File

@@ -0,0 +1,48 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import pytest
from tests.e2e.conftest import VllmRunner
MODELS = [
"Qwen/Qwen3-0.6B",
"deepseek-ai/DeepSeek-V2-Lite-Chat",
]
TENSOR_PARALLELS = [1]
PIPELINE_PARALLELS = [2]
DIST_EXECUTOR_BACKEND = ["mp", "ray"]
prompts = [
"Hello, my name is",
"The future of AI is",
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS)
@pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND)
def test_models_pp2(model: str, tp_size: int, pp_size: int,
distributed_executor_backend: str) -> None:
with VllmRunner(model,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend=distributed_executor_backend,
gpu_memory_utilization=0.7) as vllm_model:
vllm_model.generate_greedy(prompts, 64)

View File

@@ -0,0 +1,85 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the with and without prefix caching."""
import pytest
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
MODELS = [
# for MHA
"Qwen/Qwen3-8B",
# for MLA
"deepseek-ai/DeepSeek-V2-Lite-Chat"
]
# A prompt containing a large markdown table. The table is randomly generated by GPT-4.
LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
| ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
| 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON |
| 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK |
| 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW |
| 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ |
| 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE |
| 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY |
| 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC |
| 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK |
| 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC|
| 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ |
| 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE |
| 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA |
| 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB |
| 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK |
| 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD |
| 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ |
| 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE |
| 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA |
| 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON |
| 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK |
| 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA |
| 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ|
| 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE |
| 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO |
| 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC |
| 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK |
| 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA |
| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
"""
INPUT_PROMPTS = [
LONG_PROMPT +
"Question: what is the age of John Doe? Your answer: The age of John Doe is ",
LONG_PROMPT +
"Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is "
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_models_prefix_cache_tp2(model: str, max_tokens: int) -> None:
with VllmRunner(model,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7) as vllm_model:
prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
with VllmRunner(model,
enable_prefix_caching=False,
max_model_len=2048,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
outputs_1_lst=prefix_cache_output,
name_0="vllm_output",
name_1="prefix_cache_output",
)

View File

@@ -0,0 +1,44 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
from modelscope import snapshot_download # type: ignore
from tests.e2e.conftest import VllmRunner
def test_qwen2_5_w8a8_external_quantized_tp2():
example_prompts = [
"The president of the United States is",
]
max_tokens = 5
with VllmRunner(
snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
max_model_len=4096,
gpu_memory_utilization=0.8,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
golden_results = [
'The president of the United States is the head of state and',
]
for i in range(len(vllm_output)):
assert golden_results[i] == vllm_output[i][1]
print(f"Generated text: {vllm_output[i][1]!r}")

View File

@@ -0,0 +1,126 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
import json
import os
from unittest.mock import patch
import openai
import pytest
from modelscope import snapshot_download # type: ignore
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer, VllmRunner
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
def test_qwen3_moe_distributed_mp_tp2_ep():
example_prompts = [
"Hello, my name is",
]
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
tensor_parallel_size=2,
enable_expert_parallel=True,
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
def test_qwen3_moe_w8a8_distributed_tp2():
example_prompts = [
"Hello, my name is",
]
max_tokens = 5
with VllmRunner(
snapshot_download("vllm-ascend/Qwen3-30B-A3B-W8A8"),
max_model_len=8192,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
def test_qwen3_moe_distributed_aiv_tp2():
os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
example_prompts = [
"Hello, my name is",
]
dtype = "auto"
max_tokens = 5
with VllmRunner(
"Qwen/Qwen3-30B-A3B",
dtype=dtype,
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@pytest.mark.asyncio
async def test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb():
model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
port = get_open_port()
server_args = [
"--max_model_len", "8192", "--tensor_parallel_size", "2",
"--enable_expert_parallel", "--quantization", "ascend", "--port",
str(port), "--enforce_eager"
]
env_dict = {"HCCL_BUFFSIZE": "1024"}
with RemoteOpenAIServer(model,
server_args,
server_port=port,
auto_port=False,
env_dict=env_dict) as server:
client = server.get_async_client()
batch = await client.completions.create(model=model,
prompt="What is deeplearning?",
max_tokens=300,
temperature=0,
top_p=1.0,
n=1)
gt_choices: list[openai.types.CompletionChoice] = batch.choices
# dynamic eplb test
# Since pytest runs as a daemon process, it conflicts with the dynamic EPLB
# manager during initialization in offline mode, so online (server) mode is
# used instead.
env_dict.update({"DYNAMIC_EPLB": "true"})
additional_config = {
"dynamic_eplb": True,
"num_iterations_eplb_update": 100,
"num_wait_worker_iterations": 20
}
server_args.extend(["--additional-config", json.dumps(additional_config)])
with RemoteOpenAIServer(model,
server_args,
server_port=port,
auto_port=False,
env_dict=env_dict) as server:
client = server.get_async_client()
batch = await client.completions.create(model=model,
prompt="What is deeplearning?",
max_tokens=300,
temperature=0,
top_p=1.0,
n=1)
eplb_choices: list[openai.types.CompletionChoice] = batch.choices
assert gt_choices[0].text == eplb_choices[
0].text, f"{gt_choices[0].text=} \n {eplb_choices[0].text=}"

View File

@@ -0,0 +1,93 @@
import os
import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
MODELS = [
"deepseek-ai/DeepSeek-V2-Lite",
]
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@pytest.mark.parametrize("model", MODELS)
def test_deepseek_v2_lite_enable_shared_expert_dp_tp2(model: str) -> None:
if 'HCCL_OP_EXPANSION_MODE' in os.environ:
del os.environ['HCCL_OP_EXPANSION_MODE']
prompts = [
"Hello, my name is", "The capital of the United States is",
"The capital of France is", "The future of AI is"
]
sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
os.environ["VLLM_ASCEND_ENABLE_FLASHCOMM1"] = "1"
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
tensor_parallel_size=2,
enable_expert_parallel=True,
additional_config={
"enable_shared_expert_dp": True,
},
) as runner:
shared_expert_dp_eager_outputs = runner.model.generate(
prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
tensor_parallel_size=2,
enable_expert_parallel=True,
compilation_config={
"cudagraph_capture_sizes": [1, 4, 8, 16],
"cudagraph_mode": "FULL_DECODE_ONLY",
},
additional_config={
"enable_shared_expert_dp": True,
},
) as runner:
shared_expert_dp_aclgraph_outputs = runner.model.generate(
prompts, sampling_params)
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
shared_expert_dp_eager_outputs_list = []
for output in shared_expert_dp_eager_outputs:
shared_expert_dp_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
shared_expert_dp_aclgraph_outputs_list = []
for output in shared_expert_dp_aclgraph_outputs:
shared_expert_dp_aclgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
outputs_1_lst=shared_expert_dp_eager_outputs_list,
name_0="vllm_eager_outputs",
name_1="shared_expert_dp_eager_outputs",
)
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
outputs_1_lst=shared_expert_dp_aclgraph_outputs_list,
name_0="vllm_eager_outputs",
name_1="shared_expert_dp_aclgraph_outputs",
)

View File

@@ -0,0 +1,85 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import asyncio
from typing import Any
import openai
import pytest
from vllm.utils.network_utils import get_open_port
from tests.e2e.conftest import RemoteOpenAIServer
MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
DATA_PARALLELS = [2]
prompts = [
"San Francisco is a",
]
api_keyword_args = {
"max_tokens": 10,
}
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
async def test_models_single_request_aclgraph_dp2(model: str,
dp_size: int) -> None:
port = get_open_port()
env_dict = {
"TASK_QUEUE_ENABLE": "1",
"HCCL_OP_EXPANSION_MODE": "AIV",
}
if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
server_args = [
"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
"--data-parallel-size",
str(dp_size), "--quantization", "ascend", "--max-model-len",
"1024", "--port",
str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
]
else:
server_args = [
"--no-enable-prefix-caching", "--tensor-parallel-size", "1",
"--data-parallel-size",
str(dp_size), "--port",
str(port), "--trust-remote-code", "--gpu-memory-utilization", "0.9"
]
request_keyword_args: dict[str, Any] = {
**api_keyword_args,
}
with RemoteOpenAIServer(model,
vllm_serve_args=server_args,
server_port=port,
env_dict=env_dict,
auto_port=False) as server:
client = server.get_async_client()
try:
batch = await asyncio.wait_for(client.completions.create(
model=model,
prompt=prompts,
**request_keyword_args,
),
timeout=10.0)
except asyncio.TimeoutError:
pytest.fail("Model did not return response within 10 seconds")
choices: list[openai.types.CompletionChoice] = batch.choices
assert choices[0].text, "empty response"