v0.10.1rc1

This commit is contained in:
2025-09-09 09:40:35 +08:00
parent d6f6ef41fe
commit 9149384e03
432 changed files with 84698 additions and 1 deletion

View File

@@ -0,0 +1,46 @@
import gc
import torch
from vllm_ascend.utils import enable_custom_op
enable_custom_op()
DEFAULT_ATOL = 1e-3
DEFAULT_RTOL = 1e-3
def bgmv_expand_cpu_impl(x: torch.Tensor, w: torch.Tensor,
indices: torch.Tensor, y: torch.Tensor,
slice_offset: int, slice_size: int) -> torch.Tensor:
# CPU reference: gather each token's LoRA-B weight via indices, apply it
# with a batched matmul, and accumulate the result into the target slice
# of y.
W = w[indices, :, :].transpose(-1, -2).to(torch.float32)
z = torch.bmm(x.unsqueeze(1).to(torch.float32), W).squeeze()
y[:, slice_offset:slice_offset + slice_size] += z
return y
@torch.inference_mode()
def test_bgmv_expand():
B = 1
x = torch.randn([B, 16], dtype=torch.float)
w = torch.randn([64, 128, 16], dtype=torch.float16)
indices = torch.zeros([B], dtype=torch.int64)
y = torch.randn([B, 128 * 3], dtype=torch.float16)
x_npu = x.npu()
w_npu = w.npu()
indices_npu = indices.npu()
y_npu = y.npu()
y_out = bgmv_expand_cpu_impl(x, w, indices, y, 0, 128)
y_out_npu = torch.ops._C.bgmv_expand(x_npu, w_npu, indices_npu, y_npu, 0,
128)
# Compare the results.
torch.testing.assert_close(y_out_npu.cpu(),
y_out,
atol=DEFAULT_ATOL,
rtol=DEFAULT_RTOL)
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()

View File

@@ -0,0 +1,45 @@
import gc
import torch
from vllm_ascend.utils import enable_custom_op
enable_custom_op()
DEFAULT_ATOL = 1e-3
DEFAULT_RTOL = 1e-3
def bgmv_shrink_cpu_impl(x: torch.Tensor, w: torch.Tensor,
indices: torch.Tensor, y: torch.Tensor,
scaling: float) -> torch.Tensor:
# CPU reference: gather each token's LoRA-A weight via indices, project
# x down to the LoRA rank, and accumulate the scaled result into y.
W = w[indices, :, :].transpose(-1, -2).to(torch.float32)
z = torch.bmm(x.unsqueeze(1).to(torch.float32), W).squeeze()
y[:, :] += z * scaling
return y
@torch.inference_mode()
def test_bgmv_shrink():
B = 1
x = torch.randn([B, 128], dtype=torch.float16)
w = torch.randn([64, 16, 128], dtype=torch.float16)
indices = torch.zeros([B], dtype=torch.int64)
y = torch.zeros([B, 16])
x_npu = x.npu()
w_npu = w.npu()
indices_npu = indices.npu()
y_npu = y.npu()
y = bgmv_shrink_cpu_impl(x, w, indices, y, 0.5)
torch.ops._C.bgmv_shrink(x_npu, w_npu, indices_npu, y_npu, 0.5)
# Compare the results.
torch.testing.assert_close(y_npu.cpu(),
y,
atol=DEFAULT_ATOL,
rtol=DEFAULT_RTOL)
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()
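# Illustrative note (not executed here): the shrink and expand kernels
# compose into the usual LoRA update y += ((x @ A^T) * scaling) @ B^T.
# A hypothetical sketch using the CPU references defined in these tests:
#   r = torch.zeros([B, 16])  # rank-16 intermediate
#   r = bgmv_shrink_cpu_impl(x, lora_a, indices, r, scaling=0.5)
#   y = bgmv_expand_cpu_impl(r, lora_b, indices, y, 0, 128)
# where lora_a / lora_b are the stacked per-adapter A and B matrices.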

View File

@@ -0,0 +1,284 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# SPDX-License-Identifier: Apache-2.0
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/kernels/test_moe.py
"""Tests for the MOE layers.
Run `pytest tests/ops/test_fused_moe.py`.
"""
import gc
from unittest.mock import MagicMock, patch
import pytest
import torch
import torch_npu
from vllm.model_executor.layers.activation import SiluAndMul
from vllm_ascend.ops.layers.experts_selector import select_experts
from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
TokenDispatcherWithAllGather
NUM_EXPERTS = [8, 64]
EP_SIZE = [1, 4]
TOP_KS = [2, 6]
DEVICE = ["npu"]
def apply_mlp(
hidden_states: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
group_list: torch.Tensor,
group_list_type: int = 1,
) -> torch.Tensor:
w1 = w1.transpose(1, 2)
hidden_states = torch_npu.npu_grouped_matmul(
x=[hidden_states],
weight=[w1],
split_item=2,
group_list_type=group_list_type,
group_type=0,
group_list=group_list,
)[0]
hidden_states = torch_npu.npu_swiglu(hidden_states)
w2 = w2.transpose(1, 2)
hidden_states = torch_npu.npu_grouped_matmul(
x=[hidden_states],
weight=[w2],
split_item=2,
group_list_type=group_list_type,
group_type=0,
group_list=group_list,
)[0]
return hidden_states
def torch_moe(a, w1, w2, topk_weights, topk_ids, topk, expert_map):
# Dense reference MoE: replicate each token topk times, run every copy
# through its selected expert, then take the weighted sum over the topk
# expert outputs.
B, D = a.shape
a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
topk_weights = topk_weights.view(-1)
topk_ids = topk_ids.view(-1)
if expert_map is not None:
topk_ids = expert_map[topk_ids]
for i in range(w1.shape[0]):
mask = topk_ids == i
if mask.sum():
out[mask] = SiluAndMul()(
a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
return (out.view(B, -1, w2.shape[1]) *
topk_weights.view(B, -1, 1).to(out.dtype)).sum(dim=1)
@pytest.mark.parametrize("m", [1, 33, 64, 222, 1024 * 128])
@pytest.mark.parametrize("n", [128, 1024, 2048])
@pytest.mark.parametrize("k", [128, 511, 1024])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("ep_size", EP_SIZE)
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("device", DEVICE)
def test_token_dispatcher_with_all_gather(
m: int,
n: int,
k: int,
e: int,
topk: int,
ep_size: int,
dtype: torch.dtype,
device: str,
):
a = torch.randn((m, k), device=device, dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 10
w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 10
score = torch.randn((m, e), device=device, dtype=dtype)
expert_map = None
local_e = e
w1_local = w1
w2_local = w2
if ep_size > 1:
local_e = e // ep_size
e_ids = torch.arange(local_e * 0,
local_e * (0 + 1),
device=device,
dtype=torch.int32)
expert_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
expert_map[e_ids] = torch.arange(local_e,
device=device,
dtype=torch.int32)
w1_local = w1[e_ids]
w2_local = w2[e_ids]
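# For example, with e = 8 and ep_size = 4, local_e = 2 and this rank
# (rank 0) owns global experts 0 and 1, giving
# expert_map = [0, 1, -1, -1, -1, -1, -1, -1]: global expert ids map to
# local slots, and -1 marks experts owned by other ranks.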
score = torch.softmax(score, dim=-1, dtype=dtype)
topk_weights, topk_ids = torch.topk(score, topk)
topk_ids = topk_ids.to(torch.int32)
row_idx = (torch.arange(
0,
m * topk,
device=device,
dtype=torch.int32,
).view(topk, -1).permute(1, 0).contiguous())
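# For example, with m = 2 and topk = 2: arange(0, 4).view(2, -1) gives
# [[0, 1], [2, 3]] and the permute yields row_idx = [[0, 2], [1, 3]],
# i.e. entry [t, j] is the flattened row index (t + j * m) of token t's
# j-th expert slot.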
dispatcher_kwargs = {
"num_experts": e,
"top_k": topk,
"num_local_experts": local_e,
}
dispatcher = TokenDispatcherWithAllGather(**dispatcher_kwargs)
apply_router_weight_on_input = False
dispatch_output = dispatcher.token_dispatch(
hidden_states=a,
topk_weights=topk_weights,
topk_ids=topk_ids,
row_idx=row_idx,
expert_map=expert_map,
apply_router_weight_on_input=apply_router_weight_on_input)
sorted_hidden_states = dispatch_output["hidden_states"]
group_list = dispatch_output["group_list"]
group_list_type = dispatch_output.get("group_list_type", 1)
expert_output = apply_mlp(hidden_states=sorted_hidden_states,
w1=w1_local,
w2=w2_local,
group_list=group_list,
group_list_type=group_list_type)
combined_output = dispatcher.token_combine(hidden_states=expert_output,
bias=None)
torch_output = torch_moe(a, w1, w2, topk_weights, topk_ids, topk,
expert_map)
torch.testing.assert_close(combined_output,
torch_output,
atol=4e-2,
rtol=1)
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()
@pytest.mark.parametrize("m", [1, 33, 64])
@pytest.mark.parametrize("n", [128, 1024, 2048])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("scoring_func", ["softmax", "sigmoid"])
@pytest.mark.parametrize("use_grouped_topk", [True, False])
@pytest.mark.parametrize("renormalize", [True, False])
@pytest.mark.parametrize("with_e_correction", [True, False])
@pytest.mark.parametrize("custom_routing", [True, False])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("device", DEVICE)
def test_select_experts(
m: int,
n: int,
e: int,
topk: int,
scoring_func: str,
use_grouped_topk: bool,
renormalize: bool,
with_e_correction: bool,
custom_routing: bool,
dtype: torch.dtype,
device: str,
):
topk_group = 4 if use_grouped_topk else None
num_expert_group = e // 4 if use_grouped_topk else None
hidden_states = torch.randn(m, n, device=device, dtype=dtype)
router_logits = torch.randn(m, e, device=device, dtype=dtype)
e_score_correction_bias = (torch.randn(e, device=device, dtype=dtype)
if with_e_correction else None)
custom_routing_function = None
if custom_routing:
custom_routing_function = MagicMock()
mock_weights = torch.randn(m, topk, device=device, dtype=dtype)
mock_ids = torch.randint(0,
e, (m, topk),
device=device,
dtype=torch.int32)
custom_routing_function.return_value = (mock_weights, mock_ids)
with patch("vllm_ascend.ops.layers.experts_selector._native_grouped_topk"
) as mock_native_grouped_topk:
mock_native_grouped_topk.side_effect = lambda x, num_groups, k: torch.randn_like(
x)
topk_weights, topk_ids, row_idx = select_experts(
hidden_states=hidden_states,
router_logits=router_logits,
top_k=topk,
use_grouped_topk=use_grouped_topk,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
e_score_correction_bias=e_score_correction_bias,
)
if use_grouped_topk:
mock_native_grouped_topk.assert_called_once()
else:
mock_native_grouped_topk.assert_not_called()
assert topk_weights.shape == (m, topk)
assert topk_ids.shape == (m, topk)
assert topk_ids.dtype == torch.int32
assert row_idx.shape == (m, topk)
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()
@pytest.mark.parametrize("device", DEVICE)
def test_select_experts_invalid_scoring_func(device: str):
with pytest.raises(ValueError,
match="Unsupported scoring function: invalid"):
select_experts(hidden_states=torch.randn(1, 128, device=device),
router_logits=torch.randn(1, 8, device=device),
top_k=2,
use_grouped_topk=False,
renormalize=False,
scoring_func="invalid")
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()
@pytest.mark.parametrize("device", DEVICE)
def test_select_experts_missing_group_params(device: str):
with pytest.raises(AssertionError):
select_experts(hidden_states=torch.randn(1, 128, device=device),
router_logits=torch.randn(1, 64, device=device),
top_k=2,
use_grouped_topk=True,
renormalize=False,
scoring_func="softmax")
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()

View File

@@ -0,0 +1,37 @@
import pytest
import torch
import torch_npu
@pytest.mark.parametrize(
'B',
[1, 16, 64, 128, 32768],
)
@pytest.mark.parametrize(
'D',
[8, 16, 32, 64, 128],
)
@pytest.mark.parametrize(
'top_k',
[1, 2, 4, 8],
)
@pytest.mark.parametrize(
"dtype, atol, rtol",
[
(torch.float16, 1e-3, 1e-3),
(torch.bfloat16, 1e-3, 1e-3),
],
)
def test_moe_gating_top_k_softmax(B: int, D: int, top_k: int, dtype, atol, rtol):
x = torch.rand((B, D), dtype=dtype).to("npu")
# finished = torch.randint(1, size=(B,), dtype=torch.bool).to("npu")
finished = None
y, expert_idx, row_idx = torch_npu.npu_moe_gating_top_k_softmax(x,
finished,
k=top_k)
topk_weights = x.softmax(dim=-1)
topk_weights, topk_ids = topk_weights.topk(top_k, dim=-1)
topk_ids = topk_ids.to(torch.int32)
# allclose returns a bool, so assert it; otherwise mismatches pass silently.
assert torch.allclose(y, topk_weights, atol=atol, rtol=rtol)
assert torch.allclose(expert_idx, topk_ids, atol=atol, rtol=rtol)

View File

@@ -0,0 +1,175 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
import gc
from types import SimpleNamespace
import pytest
import torch
from vllm.model_executor.layers.fused_moe.config import ( # isort: skip
FusedMoEConfig, FusedMoEParallelConfig)
from vllm_ascend.distributed.moe_comm_method import ( # isort: skip
AllGatherCommImpl, NativeAllGatherCommImpl)
@pytest.mark.parametrize("num_tokens", [16, 128])
@pytest.mark.parametrize("hidden_size", [64, 128])
@pytest.mark.parametrize("global_num_experts", [8, 16])
@pytest.mark.parametrize("num_local_experts", [4, 8])
@pytest.mark.parametrize("top_k_num", [2, 4])
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("ep_rank", [0, 1])
@pytest.mark.parametrize("apply_a8_quantization", [False])
def test_all_gather_comm_impl(
num_tokens,
hidden_size,
global_num_experts,
num_local_experts,
top_k_num,
dtype,
ep_rank,
apply_a8_quantization,
mocker,
):
"""
Tests the AllGatherCommImpl against the NativeAllGatherCommImpl.
This test compares the outputs of the NPU-optimized AllGatherCommImpl
with a native PyTorch implementation (NativeAllGatherCommImpl) to ensure
correctness across various configurations.
"""
if top_k_num > global_num_experts:
pytest.skip("top_k_num cannot be greater than global_num_experts")
if num_local_experts > global_num_experts:
pytest.skip(
"num_local_experts cannot be greater than global_num_experts")
device = torch.device("npu")
# mock get_tensor_model_parallel_rank to return ep_rank
mocker.patch(
"vllm.model_executor.layers.fused_moe.config.get_tensor_model_parallel_rank",
return_value=ep_rank,
)
# make moe config
parallel_config = SimpleNamespace(
enable_expert_parallel=num_local_experts < global_num_experts)
moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
tp_size_=max(2, global_num_experts // num_local_experts),
dp_size_=1,
vllm_parallel_config=parallel_config,
)
moe_config = FusedMoEConfig(
num_experts=global_num_experts,
experts_per_token=top_k_num,
hidden_dim=hidden_size,
num_local_experts=num_local_experts,
moe_parallel_config=moe_parallel_config,
in_dtype=dtype,
quant_config=None, # No quantization in this test
max_num_tokens=num_tokens,
)
# Instantiate implementations
native_impl = NativeAllGatherCommImpl(moe_config)
all_gather_impl = AllGatherCommImpl(moe_config)
# --- Input Data ---
hidden_states = torch.randn(num_tokens,
hidden_size,
device=device,
dtype=dtype)
topk_ids = torch.randint(0,
global_num_experts, (num_tokens, top_k_num),
device=device,
dtype=torch.int32)
topk_weights = torch.rand(num_tokens, top_k_num, device=device).to(dtype)
topk_weights = torch.nn.functional.softmax(topk_weights, dim=1)
num_experts = global_num_experts
expert_map = None
if num_local_experts < global_num_experts:
# Create a map where some experts are local and some are not
expert_map = torch.full((global_num_experts, ), -1, device=device)
expert_map[ep_rank * num_local_experts:(ep_rank + 1) *
num_local_experts] = torch.arange(num_local_experts,
device=device)
num_experts = num_local_experts
# --- Run Native Implementation (Golden Reference) ---
native_hidden_states_out = hidden_states.clone()
(
native_permuted_hidden,
native_expert_tokens,
_,
_,
) = native_impl.permute(hidden_states, topk_ids, topk_weights, expert_map,
num_experts, apply_a8_quantization)
# Simulate MLP output
native_mlp_output = torch.randn_like(native_permuted_hidden)
native_impl.unpermute(native_mlp_output, native_hidden_states_out)
# --- Run AllGather Implementation ---
all_gather_hidden_states_out = hidden_states.clone()
(
all_gather_permuted_hidden,
all_gather_expert_tokens,
_,
_,
) = all_gather_impl.permute(hidden_states, topk_ids, topk_weights,
expert_map, num_experts, apply_a8_quantization)
# Use the same simulated MLP output for a fair comparison
all_gather_mlp_output = native_mlp_output.clone()
all_gather_impl.unpermute(all_gather_mlp_output,
all_gather_hidden_states_out)
# --- Assertions ---
# Define tolerance based on dtype
atol = 1e-3 if dtype == torch.float16 else 1e-2
rtol = 1e-3 if dtype == torch.float16 else 1e-2
# 1. Compare expert_tokens from pre_process
assert torch.allclose(native_expert_tokens.to(
all_gather_expert_tokens.device),
all_gather_expert_tokens,
atol=atol,
rtol=rtol), "Expert tokens do not match."
# 2. Compare permuted_hidden_states from pre_process
num_valid_tokens = native_expert_tokens.sum()
assert torch.allclose(native_permuted_hidden[:num_valid_tokens].to(
all_gather_permuted_hidden.device),
all_gather_permuted_hidden[:num_valid_tokens],
atol=atol,
rtol=rtol), "Permuted hidden states do not match."
# 3. Compare final hidden_states from post_process
assert torch.allclose(native_hidden_states_out.to(
all_gather_hidden_states_out.device),
all_gather_hidden_states_out,
atol=atol,
rtol=rtol), "Final hidden states do not match."
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()

View File

@@ -0,0 +1,351 @@
# Copyright 2023 The vLLM team.
# Copyright (c) Huawei Technologies Co., Ltd. 2024-2025. All rights reserved.
# Adapted from
# https://github.com/vllm-project/vllm/blob/main/tests/kernels/test_rotary_embedding.py
import gc
from typing import Optional, Tuple, Union
import pytest
import torch
import torch.nn as nn
from vllm_ascend.utils import enable_custom_op
enable_custom_op()
# Only the Neox-style (is_neox_style=True) scenario is supported for now
IS_NEOX_STYLE = [True]
DTYPES = [torch.half]
HEAD_SIZES = [64, 96, 128, 256]
ROTARY_DIMS = [None, 32] # None means rotary dim == head size
NUM_HEADS = [17] # Arbitrary values for testing
BATCH_SIZES = [5] # Arbitrary values for testing
SEQ_LENS = [11, 4096] # Arbitrary values for testing
NUM_TOKENS = [10, 21]
SEEDS = [0]
DEVICES = [f"npu:{0}"]
# Default tolerances for comparing the custom op against the native path
DEFAULT_ATOL = 1e-3
DEFAULT_RTOL = 1e-3
def _apply_rotary_emb(
x: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
is_neox_style: bool,
) -> torch.Tensor:
"""
Args:
x: [num_tokens, num_heads, head_size]
cos: [num_tokens, head_size // 2]
sin: [num_tokens, head_size // 2]
is_neox_style: Whether to use the Neox-style or GPT-J-style rotary
positional embeddings.
"""
cos = cos.unsqueeze(-2).to(x.dtype)
sin = sin.unsqueeze(-2).to(x.dtype)
if is_neox_style:
x1, x2 = torch.chunk(x, 2, dim=-1)
else:
x1 = x[..., ::2]
x2 = x[..., 1::2]
o1 = x1 * cos - x2 * sin
o2 = x2 * cos + x1 * sin
if is_neox_style:
return torch.cat((o1, o2), dim=-1)
else:
return torch.stack((o1, o2), dim=-1).flatten(-2)
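# Concrete illustration for head_size = 4: Neox style chunks x into two
# contiguous halves and rotates the pairs (x0, x2) and (x1, x3), while
# GPT-J style takes interleaved elements and rotates (x0, x1) and
# (x2, x3); the outputs are reassembled with cat or stack accordingly.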
# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py
class RotaryEmbedding(nn.Module):
"""Original rotary positional embedding."""
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: int,
is_neox_style: bool,
dtype: torch.dtype,
) -> None:
super().__init__()
self.head_size = head_size
self.rotary_dim = rotary_dim
self.max_position_embeddings = max_position_embeddings
self.base = base
self.is_neox_style = is_neox_style
self.dtype = dtype
cache = self._compute_cos_sin_cache()
cache = cache.to(dtype)
self.cos_sin_cache: torch.Tensor
self.register_buffer("cos_sin_cache", cache, persistent=False)
def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
"""Compute the inverse frequency."""
# NOTE(woosuk): To exactly match the HF implementation, we need to
# use CPU to compute the cache and then move it to GPU. However, we
# create the cache on GPU for faster initialization. This may cause
# a slight numerical difference between the HF implementation and ours.
inv_freq = 1.0 / (base**(torch.arange(
0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim))
return inv_freq
def _compute_cos_sin_cache(self) -> torch.Tensor:
"""Compute the cos and sin cache."""
inv_freq = self._compute_inv_freq(self.base)
t = torch.arange(self.max_position_embeddings, dtype=torch.float)
freqs = torch.einsum("i,j -> ij", t, inv_freq)
cos = freqs.cos()
sin = freqs.sin()
cache = torch.cat((cos, sin), dim=-1)
return cache
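# The cache has shape [max_position_embeddings, rotary_dim]: for each
# position p, the first rotary_dim // 2 columns hold cos(p * inv_freq)
# and the remaining columns hold sin(p * inv_freq). forward_native splits
# them back apart with chunk(2, dim=-1).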
def forward_native(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
offsets: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""A PyTorch-native implementation of forward()."""
if offsets is not None:
positions = positions + offsets
positions = positions.flatten()
num_tokens = positions.shape[0]
cos_sin = self.cos_sin_cache.index_select(0, positions)
cos, sin = cos_sin.chunk(2, dim=-1)
query_shape = query.shape
query = query.view(num_tokens, -1, self.head_size)
query_rot = query[..., :self.rotary_dim]
query_pass = query[..., self.rotary_dim:]
query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
key_shape = key.shape
key = key.view(num_tokens, -1, self.head_size)
key_rot = key[..., :self.rotary_dim]
key_pass = key[..., self.rotary_dim:]
key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
return query, key
# test with leading dimension and merge seqlen and batch_size as num_tokens
@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("seq_len", SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@torch.inference_mode()
def test_rotary_embedding_quant_with_leading_dim(
is_neox_style: bool,
batch_size: int,
seq_len: int,
num_heads: int,
head_size: int,
rotary_dim: Optional[int],
dtype: torch.dtype,
seed: int,
device: str,
max_position: int = 8192,
base: int = 10000,
) -> None:
if rotary_dim is None:
rotary_dim = head_size
torch.set_default_device(device)
rope = RotaryEmbedding(head_size, rotary_dim, max_position, base,
is_neox_style, dtype)
rope = rope.to(dtype=dtype)
num_tokens = batch_size * seq_len
positions = torch.randint(0, max_position, (batch_size * seq_len, ))
qkv_tensor = torch.randn(num_tokens,
num_heads * head_size * 3,
dtype=dtype)
query, key, _ = qkv_tensor.split(
[num_heads * head_size, num_heads * head_size, num_heads * head_size],
dim=-1,
)
ref_query, ref_key = rope.forward_native(positions, query, key)
query, key = torch.ops._C.rotary_embedding(
positions,
query,
key,
rope.head_size,
rope.cos_sin_cache,
rope.is_neox_style,
)
# Compare the results.
torch.testing.assert_close(query.view(ref_query.size()),
ref_query,
atol=DEFAULT_ATOL,
rtol=DEFAULT_RTOL)
torch.testing.assert_close(key.view(ref_key.size()),
ref_key,
atol=DEFAULT_ATOL,
rtol=DEFAULT_RTOL)
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()
class ModelwithRotaryEmbedding(nn.Module):
def __init__(
self,
hidden_size: int,
num_heads: int,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: int,
is_neox_style: bool,
dtype: torch.dtype,
) -> None:
super().__init__()
self.qkv_proj = nn.Linear(hidden_size, num_heads * head_size * 3)
self.rope = RotaryEmbedding(
head_size=head_size,
rotary_dim=rotary_dim,
max_position_embeddings=max_position_embeddings,
base=base,
is_neox_style=is_neox_style,
dtype=dtype,
)
self.o_proj = nn.Linear(num_heads * head_size, hidden_size)
def forward(
self,
positions: torch.Tensor,
hidden_states: torch.Tensor,
offsets: Optional[torch.Tensor] = None,
) -> torch.Tensor:
# simulate a simplified attention-style layer (QKV projection + RoPE +
# output projection) to check that it can be seamlessly captured into an aclgraph
qkv = self.qkv_proj(hidden_states)
q, k, v = qkv.chunk(3, dim=-1)
query, key = torch.ops._C.rotary_embedding(
positions,
q,
k,
self.rope.head_size,
self.rope.cos_sin_cache,
self.rope.is_neox_style,
)
query = query.view(q.shape)
key = key.view(k.shape)
o = self.o_proj(query)
return o
# The first graph replay can show accuracy issues when pytest is run
# directly on the ops folder; warm up with an extra graph replay as a
# workaround.
ACL_GRAPH_FIRST_RUN = True
@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
@pytest.mark.parametrize("num_tokens", BATCH_SIZES)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@torch.inference_mode()
def test_capture_rotary_embedding_in_aclgraph(
is_neox_style: bool,
num_tokens: int,
num_heads: int,
head_size: int,
rotary_dim: int,
dtype: torch.dtype,
seed: int,
device: str,
max_position_embeddings: int = 8192,
base: int = 10000,
):
"""Test if the rotary embedding can be captured in aclgraph."""
torch.manual_seed(seed)
torch.set_default_device(device)
if rotary_dim is None:
rotary_dim = head_size
model = ModelwithRotaryEmbedding(
hidden_size=num_heads * head_size,
num_heads=num_heads,
head_size=head_size,
rotary_dim=rotary_dim,
max_position_embeddings=max_position_embeddings,
base=base,
is_neox_style=is_neox_style,
dtype=dtype,
)
def custom_op_checking_backend(gm: torch.fx.GraphModule, example_input):
# Validate if the rotary_embedding custom kernel is indeed inside the graph by
# string match
graph = str(gm.graph)
assert "_C.rotary_embedding" in graph
return gm
static_positions = torch.randint(0, max_position_embeddings,
(num_tokens, ))
static_hidden_states = torch.randn(num_tokens,
num_heads * head_size,
dtype=dtype,
device="npu")
compiled_model = torch.compile(model, backend=custom_op_checking_backend)
stream = torch.npu.Stream()
stream.wait_stream(torch.npu.current_stream())
with torch.npu.stream(stream):
# warmup the fx graph before capture
for i in range(3):
static_output = compiled_model(static_positions,
static_hidden_states,
offsets=None)
stream.wait_stream(torch.npu.current_stream())
aclgraph = torch.npu.NPUGraph()
with torch.npu.graph(aclgraph):
# Capture the model in aclgraph.
static_output = compiled_model(static_positions, static_hidden_states)
# Fill the static buffers with new random inputs and replay the graph.
random_filled_positions = torch.randint(0,
max_position_embeddings,
(num_tokens, ),
device="npu")
random_filled_hidden_states = torch.randn(num_tokens,
num_heads * head_size,
dtype=dtype,
device="npu")
static_positions.copy_(random_filled_positions)
static_hidden_states.copy_(random_filled_hidden_states)
aclgraph.replay()
global ACL_GRAPH_FIRST_RUN
if ACL_GRAPH_FIRST_RUN:
ACL_GRAPH_FIRST_RUN = False
return
output_reference = model(static_positions, static_hidden_states)
torch.testing.assert_close(static_output,
output_reference,
atol=DEFAULT_ATOL,
rtol=DEFAULT_RTOL)
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()

View File

@@ -0,0 +1,98 @@
import gc
from typing import Tuple
import pytest
import torch
import torch_npu # noqa: F401
import vllm_ascend.platform # noqa: F401
from vllm_ascend.utils import enable_custom_op
enable_custom_op()
# Test parameters
DTYPES = [torch.int32]
#SHAPES = [(100,), (5, 20), (3, 4, 5)] # Various tensor shapes
#SHAPES = [(3, 4, 8), (3, 4, 5)] # Various tensor shapes
SHAPES = [(3, 4, 3)]
DEVICES = [f"npu:{0}"]
SEEDS = [0]
def get_masked_input_and_mask_ref(
input_: torch.Tensor, org_vocab_start_index: int,
org_vocab_end_index: int, num_org_vocab_padding: int,
added_vocab_start_index: int,
added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
"""Reference implementation for verification"""
org_vocab_mask = (input_ >= org_vocab_start_index) & (
input_ < org_vocab_end_index)
added_vocab_mask = (input_ >= added_vocab_start_index) & (
input_ < added_vocab_end_index)
added_offset = added_vocab_start_index - (
org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding
valid_offset = (org_vocab_start_index *
org_vocab_mask) + (added_offset * added_vocab_mask)
vocab_mask = org_vocab_mask | added_vocab_mask
masked_input = vocab_mask * (input_ - valid_offset)
return masked_input, ~vocab_mask
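# Worked example with the parameters used below: org vocab [100, 200),
# padding 0, added vocab [300, 400), so added_offset = 300 - 100 - 0 = 200.
#   input 150 -> in org vocab:   masked_input = 150 - 100 = 50, mask False
#   input 350 -> in added vocab: masked_input = 350 - 200 = 150, mask False
#   input 250 -> in neither:     masked_input = 0, mask True (invalid)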
@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_get_masked_input_and_mask(
shape: Tuple[int, ...],
dtype: torch.dtype,
device: str,
seed: int,
) -> None:
# Set random seed
torch.manual_seed(seed)
torch.set_default_device(device)
# Generate random input tensor
input_tensor = torch.randint(0, 1000, shape, dtype=dtype)
# Test parameters
test_case = {
"org_start": 100,
"org_end": 200,
"padding": 0,
"added_start": 300,
"added_end": 400,
}
# Get reference result
ref_masked_input, ref_mask = get_masked_input_and_mask_ref(
input_tensor, test_case["org_start"], test_case["org_end"],
test_case["padding"], test_case["added_start"], test_case["added_end"])
# Get custom op result
print("input_tensor:", input_tensor)
custom_masked_input, custom_mask = torch.ops._C.get_masked_input_and_mask(
input_tensor, test_case["org_start"], test_case["org_end"],
test_case["padding"], test_case["added_start"], test_case["added_end"])
ref_masked_input = ref_masked_input.to(dtype)
print("custom_masked_input:", custom_masked_input)
print("ref_masked_input:", ref_masked_input)
print("custom_mask:", custom_mask)
print("ref_mask:", ref_mask)
# Compare results
torch.testing.assert_close(
custom_masked_input,
ref_masked_input,
rtol=1e-5,
atol=1e-5,
msg=f"Masked input mismatch for case: {test_case}")
torch.testing.assert_close(custom_mask,
ref_mask,
rtol=1e-5,
atol=1e-5,
msg=f"Mask mismatch for case: {test_case}")
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()

View File

@@ -0,0 +1,76 @@
from __future__ import annotations
import os
import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@pytest.fixture
def sampling_config():
return SamplingParams(temperature=0, max_tokens=256, ignore_eos=False)
@pytest.fixture
def model_name():
return "wemaster/deepseek_mtp_main_random_bf16"
def test_mtp_correctness(
sampling_config: SamplingParams,
model_name: str,
):
example_prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
'''
Verify that the outputs of the original LLM and the speculative LLM
match when using MTP speculative decoding.
'''
with VllmRunner(model_name,
tensor_parallel_size=1,
gpu_memory_utilization=0.7,
max_model_len=256,
enforce_eager=True) as ref_llm:
ref_outputs = ref_llm.generate(example_prompts, sampling_config)
with VllmRunner(
model_name,
tensor_parallel_size=1,
max_num_seqs=256,
gpu_memory_utilization=0.7,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
enforce_eager=True,
max_model_len=2000,
additional_config={"ascend_scheduler_config": {
"enabled": False
}}) as spec_llm:
spec_outputs = spec_llm.generate(example_prompts, sampling_config)
matches = 0
misses = 0
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
ref_token_ids = ref_output[0][0]
spec_token_ids = spec_output[0][0]
if ref_token_ids == spec_token_ids[:len(ref_token_ids)]:
matches += 1
else:
misses += 1
print(f"ref_output: {ref_output[1][0]}")
print(f"spec_output: {spec_output[1][0]}")
# Heuristic: expect at least 66% of the prompts to match exactly
# Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(0.66 * len(ref_outputs))

View File

@@ -0,0 +1,85 @@
from __future__ import annotations
import os
import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@pytest.fixture
def sampling_config():
return SamplingParams(temperature=0, max_tokens=256, ignore_eos=False)
@pytest.fixture
def model_name():
return "wemaster/deepseek_mtp_main_random_bf16"
def test_mtp_torchair_correctness(
sampling_config: SamplingParams,
model_name: str,
):
example_prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
'''
Verify that the outputs of the original LLM and the speculative LLM
match when using MTP speculative decoding with torchair graph mode.
'''
with VllmRunner(model_name,
tensor_parallel_size=1,
gpu_memory_utilization=0.7,
max_model_len=256,
enforce_eager=False,
additional_config={
"torchair_graph_config": {
"enabled": True,
"use_cached_graph": False,
"graph_batch_sizes": [1, 2, 4],
},
}) as ref_llm:
ref_outputs = ref_llm.generate(example_prompts, sampling_config)
with VllmRunner(model_name,
tensor_parallel_size=1,
max_num_seqs=256,
gpu_memory_utilization=0.7,
distributed_executor_backend="mp",
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
},
enforce_eager=False,
max_model_len=2000,
additional_config={
"torchair_graph_config": {
"enabled": True,
"use_cached_graph": False,
"graph_batch_sizes": [1, 2, 4],
}
}) as spec_llm:
spec_outputs = spec_llm.generate(example_prompts, sampling_config)
matches = 0
misses = 0
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
ref_token_ids = ref_output[0][0]
spec_token_ids = spec_output[0][0]
if ref_token_ids == spec_token_ids[:len(ref_token_ids)]:
matches += 1
else:
misses += 1
print(f"ref_output: {ref_output[1][0]}")
print(f"spec_output: {spec_output[1][0]}")
# Heuristic: expect at least 66% of the prompts to match exactly
# Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(0.66 * len(ref_outputs))

View File

@@ -0,0 +1,152 @@
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import random
from typing import Any
import pytest
from vllm import LLM, SamplingParams
from tests.e2e.conftest import VllmRunner
@pytest.fixture
def test_prompts():
prompt_types = ["repeat", "sentence"]
num_prompts = 10
prompts = []
random.seed(0)
random_prompt_type_choices = random.choices(prompt_types, k=num_prompts)
# Generate a mixed batch of prompts, some of which can be easily
# predicted by n-gram matching and some which likely cannot.
for kind in random_prompt_type_choices:
word_choices = ["test", "temp", "hello", "where"]
word = random.choice(word_choices)
if kind == "repeat":
prompt = f"""
please repeat the word '{word}' 10 times.
give no other output than the word at least ten times in a row,
in lowercase with spaces between each word and without quotes.
"""
elif kind == "sentence":
prompt = f"""
please give a ten-word sentence that
uses the word {word} at least once.
give no other output than that simple sentence without quotes.
"""
else:
raise ValueError(f"Unknown prompt type: {kind}")
prompts.append([{"role": "user", "content": prompt}])
return prompts
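# n-gram speculative decoding drafts tokens by matching the tail of the
# generated text against earlier n-grams in the context, so the "repeat"
# prompts above are easy for it while the free-form "sentence" prompts
# generally are not.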
@pytest.fixture
def sampling_config():
return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False)
@pytest.fixture
def model_name():
return "LLM-Research/Meta-Llama-3.1-8B-Instruct"
def eagle_model_name():
return "vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B"
def eagle3_model_name():
return "vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B"
def test_ngram_correctness(
test_prompts: list[list[dict[str, Any]]],
sampling_config: SamplingParams,
model_name: str,
):
'''
Verify that the outputs of the original LLM and the speculative LLM
match when using ngram speculative decoding.
'''
pytest.skip("Not currently supported.")
ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm
with VllmRunner(model_name,
speculative_config={
"method": "ngram",
"prompt_lookup_max": 5,
"prompt_lookup_min": 3,
"num_speculative_tokens": 3,
},
max_model_len=1024,
enforce_eager=True) as runner:
spec_outputs = runner.model.chat(test_prompts, sampling_config)
matches = 0
misses = 0
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
if ref_output.outputs[0].text == spec_output.outputs[0].text:
matches += 1
else:
misses += 1
print(f"ref_output: {ref_output.outputs[0].text}")
print(f"spec_output: {spec_output.outputs[0].text}")
# Heuristic: expect at least 70% of the prompts to match exactly
# Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(0.7 * len(ref_outputs))
@pytest.mark.skipif(True, reason="oom in CI, fix me")
@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
def test_eagle_correctness(
test_prompts: list[list[dict[str, Any]]],
sampling_config: SamplingParams,
model_name: str,
use_eagle3: bool,
):
'''
Verify that the outputs of the original LLM and the speculative LLM
match when using EAGLE speculative decoding.
'''
if not use_eagle3:
pytest.skip("Not currently supported.")
ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm
spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
with VllmRunner(
model_name,
trust_remote_code=True,
enable_chunked_prefill=True,
max_num_seqs=1,
max_num_batched_tokens=2048,
gpu_memory_utilization=0.6,
speculative_config={
"method": "eagle3" if use_eagle3 else "eagle",
"model": spec_model_name,
"num_speculative_tokens": 2,
"max_model_len": 128,
},
max_model_len=128,
enforce_eager=True,
) as runner:
spec_outputs = runner.model.chat(test_prompts, sampling_config)
matches = 0
misses = 0
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
if ref_output.outputs[0].text == spec_output.outputs[0].text:
matches += 1
else:
misses += 1
print(f"ref_output: {ref_output.outputs[0].text}")
print(f"spec_output: {spec_output.outputs[0].text}")
# Heuristic: expect at least 66% of the prompts to match exactly
# Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(0.66 * len(ref_outputs))

View File

@@ -0,0 +1,75 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Compare the outputs of vLLM with and without aclgraph.
Run `pytest tests/compile/test_aclgraph.py`.
"""
import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
MODELS = [
"Qwen/Qwen3-0.6B",
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_models_with_aclgraph(
model: str,
max_tokens: int,
) -> None:
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
]
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=False,
) as runner:
vllm_aclgraph_outputs = runner.model.generate(prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_aclgraph_outputs_list = []
for output in vllm_aclgraph_outputs:
vllm_aclgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
outputs_1_lst=vllm_aclgraph_outputs_list,
name_0="vllm_eager_outputs",
name_1="vllm_aclgraph_outputs",
)

View File

@@ -0,0 +1,88 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
MODEL = "Qwen/Qwen3-0.6B"
def test_concurrent_partial_prefill():
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
max_num_seqs=3,
max_num_batched_tokens=2048,
enforce_eager=True,
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
3)
assert len(outputs) == 3
for output in outputs:
assert len(output.outputs) == 1
def test_prefix_cache_stats_is_recorded():
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
max_num_seqs=3,
max_num_batched_tokens=2048,
enforce_eager=True,
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
# A 129-token prompt ensures the first 128 tokens are cached in full blocks
input_tokens = {"prompt_token_ids": [101] * 129}
_ = vllm_model.model.generate([input_tokens])
outputs = vllm_model.model.generate([input_tokens])
assert outputs[0].num_cached_tokens == 128
@pytest.mark.parametrize("max_tokens",
[4]) # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_chunked_prefill_with_ascend_scheduler(
max_tokens: int, chunked_prefill_token_size: int) -> None:
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
]
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
'enable_chunked_prefill': True,
},
},
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
chunked_prefill_output = vllm_model.generate_greedy(
example_prompts, max_tokens)
with VllmRunner(MODEL,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
},
},
max_model_len=2048,
gpu_memory_utilization=0.7) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
outputs_1_lst=chunked_prefill_output,
name_0="vllm_output",
name_1="chunked_prefill_output",
)

View File

@@ -0,0 +1,96 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import gc
import torch
from vllm import SamplingParams
from vllm.utils import GiB_bytes
from tests.e2e.conftest import VllmRunner
from tests.e2e.utils import fork_new_process_for_each_test
from vllm_ascend.device_allocator.camem import CaMemAllocator
@fork_new_process_for_each_test
def test_basic_camem():
# some tensors from default memory pool
shape = (1024, 1024)
x = torch.empty(shape, device='npu:0')
x.zero_()
# some tensors from custom memory pool
allocator = CaMemAllocator.get_instance()
with allocator.use_memory_pool():
# custom memory pool
y = torch.empty(shape, device='npu:0')
y.zero_()
y += 1
z = torch.empty(shape, device='npu:0')
z.zero_()
z += 2
# they can be used together
output = x + y + z
assert torch.allclose(output, torch.ones_like(output) * 3)
free_bytes = torch.npu.mem_get_info()[0]
allocator.sleep()
free_bytes_after_sleep = torch.npu.mem_get_info()[0]
assert free_bytes_after_sleep > free_bytes
allocator.wake_up()
# they can be used together
output = x + y + z
assert torch.allclose(output, torch.ones_like(output) * 3)
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()
@fork_new_process_for_each_test
def test_end_to_end():
free, total = torch.npu.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
with VllmRunner("Qwen/Qwen3-0.6B",
enforce_eager=True,
enable_sleep_mode=True) as runner:
output = runner.model.generate(prompt, sampling_params)
# The benefit of `llm.sleep(level=2)` is mainly CPU memory usage, which
# is difficult to measure in this test; therefore, we only test sleep
# level 1 here.
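# (In vLLM's sleep mode, level 1 offloads model weights to CPU memory and
# drops the KV cache, while level 2 also discards the weights.)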
runner.model.sleep(level=1)
free_gpu_bytes_after_sleep, total = torch.npu.mem_get_info()
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage should be less than the model weights
# (0.6B model, roughly 1 GiB of weights)
assert used_bytes < 1 * GiB_bytes
runner.model.wake_up()
output2 = runner.model.generate(prompt, sampling_params)
# cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text

View File

@@ -0,0 +1,81 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Compare the outputs of vLLM with and without aclgraph.
Run `pytest tests/compile/test_aclgraph.py`.
"""
import gc
import pytest
import torch
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [1])
def test_models(
model: str,
max_tokens: int,
) -> None:
prompts = ["The president of the United States is"]
sampling_params = SamplingParams(
max_tokens=max_tokens,
temperature=0.0,
)
with VllmRunner(model, long_prefill_token_threshold=20,
enforce_eager=True) as vllm_model:
output1 = vllm_model.generate(prompts, sampling_params)
with VllmRunner(model,
enforce_eager=True,
additional_config={
'ascend_scheduler_config': {
'enabled': True
},
}) as vllm_model:
output2 = vllm_model.generate(prompts, sampling_params)
# Extract the generated token IDs for comparison
token_ids1 = output1[0][0][0]
token_ids2 = output2[0][0][0]
print(f"Token IDs 1: {token_ids1}")
print(f"Token IDs 2: {token_ids2}")
# Convert token IDs to tensors and calculate cosine similarity
# Take the length of a shorter sequence to ensure consistent dimensions
min_len = min(len(token_ids1), len(token_ids2))
tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32)
tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32)
# Calculate similarity using torch.cosine_similarity
similarity = torch.cosine_similarity(tensor1, tensor2, dim=0)
print(f"Token IDs cosine similarity: {similarity.item()}")
assert similarity > 0.95
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()

View File

@@ -0,0 +1,49 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
from modelscope import snapshot_download # type: ignore[import-untyped]
from tests.e2e.conftest import HfRunner, VllmRunner
from tests.e2e.utils import check_embeddings_close
def test_embed_models_correctness():
queries = ['What is the capital of China?', 'Explain gravity']
model_name = snapshot_download("Qwen/Qwen3-Embedding-0.6B")
with VllmRunner(
model_name,
task="embed",
enforce_eager=True,
) as vllm_runner:
vllm_outputs = vllm_runner.encode(queries)
with HfRunner(
model_name,
dtype="float32",
is_sentence_transformer=True,
) as hf_runner:
hf_outputs = hf_runner.encode(queries)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)

View File

@@ -0,0 +1,150 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/entrypoints/llm/test_guided_generate.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
import os
import jsonschema
import pytest
import regex as re
from vllm.outputs import RequestOutput
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from tests.e2e.conftest import VllmRunner
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
MODEL_NAME = "Qwen/Qwen3-0.6B"
GuidedDecodingBackend = ["xgrammar", "guidance", "outlines"]
@pytest.fixture(scope="module")
def sample_regex():
return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
@pytest.fixture(scope="module")
def sample_json_schema():
return {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
},
"work_history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "number"
},
"position": {
"type": "string"
}
},
"required": ["company", "position"]
}
}
},
"required": ["name", "age", "skills", "work_history"]
}
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
def test_guided_json_completion(guided_decoding_backend: str,
sample_json_schema):
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=500,
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
with VllmRunner(
MODEL_NAME,
seed=0,
guided_decoding_backend=guided_decoding_backend,
) as vllm_model:
prompts = [
f"Give an example JSON for an employee profile "
f"that fits this schema: {sample_json_schema}"
] * 2
inputs = vllm_model.get_inputs(prompts)
outputs = vllm_model.model.generate(inputs,
sampling_params=sampling_params)
assert outputs is not None
for output in outputs:
assert output is not None
assert isinstance(output, RequestOutput)
prompt = output.prompt
generated_text = output.outputs[0].text
assert generated_text is not None
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
output_json = json.loads(generated_text)
jsonschema.validate(instance=output_json,
schema=sample_json_schema)
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
def test_guided_regex(guided_decoding_backend: str, sample_regex):
if guided_decoding_backend == "outlines":
pytest.skip("Outlines doesn't support regex-based guided decoding.")
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
guided_decoding=GuidedDecodingParams(regex=sample_regex))
with VllmRunner(
MODEL_NAME,
seed=0,
guided_decoding_backend=guided_decoding_backend,
) as vllm_model:
prompts = [
f"Give an example IPv4 address with this regex: {sample_regex}"
] * 2
inputs = vllm_model.get_inputs(prompts)
outputs = vllm_model.model.generate(inputs,
sampling_params=sampling_params)
assert outputs is not None
for output in outputs:
assert output is not None
assert isinstance(output, RequestOutput)
prompt = output.prompt
generated_text = output.outputs[0].text
print(generated_text)
assert generated_text is not None
assert re.fullmatch(".*", generated_text) is not None
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

View File

@@ -0,0 +1,62 @@
# SPDX-License-Identifier: Apache-2.0
import vllm
from modelscope import snapshot_download # type: ignore
from vllm.lora.request import LoRARequest
from tests.e2e.conftest import VllmRunner
MODEL_PATH = "vllm-ascend/ilama-3.2-1B"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
EXPECTED_LORA_OUTPUT = [
"SELECT count(*) FROM singer",
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", # noqa: E501
"SELECT DISTINCT Country FROM singer WHERE Age > 20",
]
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
PROMPT_TEMPLATE.format(
query=
"What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
),
PROMPT_TEMPLATE.format(
query=
"What are all distinct countries where singers above age 20 are from?" # noqa: E501
),
]
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
# Print the outputs.
generated_texts: list[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text.strip()
generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
return generated_texts
def test_ilama_lora(ilama_lora_files):
with VllmRunner(snapshot_download(MODEL_PATH),
enable_lora=True,
dtype="half",
max_loras=4,
max_model_len=1024,
max_num_seqs=16,
enforce_eager=True) as vllm_model:
output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
output2 = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)
for i in range(len(EXPECTED_LORA_OUTPUT)):
assert output2[i] == EXPECTED_LORA_OUTPUT[i]

View File

@@ -0,0 +1,71 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import gc
import os
import time
from unittest.mock import patch
import torch
import vllm # noqa: F401
from vllm_ascend.utils import ProfileExecuteDuration
@patch.dict(os.environ, {"VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE": "1"})
def test_execute_duration_enabled_discrepancy():
a = torch.randn(10000, 10000).npu()
b = torch.randn(10000, 10000).npu()
# warmup
torch.matmul(a, b)
torch.npu.synchronize()
cpu_start = time.perf_counter()
with ProfileExecuteDuration().capture_async("forward"):
torch.matmul(a, b)
torch.npu.synchronize()
cpu_duration = (time.perf_counter() - cpu_start) * 1000
npu_durations = ProfileExecuteDuration().pop_captured_sync()
assert npu_durations and 'forward' in npu_durations
assert not ProfileExecuteDuration._observations
# The CPU-side and NPU-side measurements should agree within roughly 50%
diff = abs(cpu_duration - npu_durations['forward']) / max(
cpu_duration, npu_durations['forward'])
assert diff <= 0.5, (
f"CPU={cpu_duration:.2f}ms, NPU={npu_durations['forward']:.2f}ms")
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()
def test_execute_duration_disabled():
a = torch.randn(100, 100).npu()
b = torch.randn(100, 100).npu()
with ProfileExecuteDuration().capture_async("forward"):
torch.matmul(a, b)
torch.npu.synchronize()
npu_durations = ProfileExecuteDuration().pop_captured_sync()
assert not npu_durations
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()

View File

@@ -0,0 +1,35 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from modelscope import snapshot_download # type: ignore[import-untyped]
from tests.e2e.conftest import VllmRunner
def test_quant_W8A8():
max_tokens = 5
example_prompts = [
"vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
]
with VllmRunner(
snapshot_download("vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"),
max_model_len=8192,
enforce_eager=True,
gpu_memory_utilization=0.7,
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@@ -0,0 +1,49 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/entrypoints/llm/test_guided_generate.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
def test_models_topk() -> None:
example_prompts = [
"Hello, my name is",
]
sampling_params = SamplingParams(max_tokens=5,
temperature=0.0,
top_k=50,
top_p=0.9)
with VllmRunner("Qwen/Qwen3-0.6B",
max_model_len=8192,
gpu_memory_utilization=0.7) as runner:
runner.generate(example_prompts, sampling_params)
def test_models_prompt_logprobs() -> None:
example_prompts = [
"Hello, my name is",
]
with VllmRunner("Qwen/Qwen3-0.6B",
max_model_len=8192,
gpu_memory_utilization=0.7) as runner:
runner.generate_greedy_logprobs(example_prompts,
max_tokens=5,
num_logprobs=1)

View File

@@ -0,0 +1,89 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
"""Compare the short outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/test_offline_inference.py`.
"""
import os
import pytest
from vllm import SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from tests.e2e.conftest import VllmRunner
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
@pytest.mark.skip(reason="fix me")
def test_multimodal_vl(prompt_template):
image = ImageAsset("cherry_blossom") \
.pil_image.convert("RGB")
img_questions = [
"What is the content of this image?",
"Describe the content of this image in detail.",
"What's in the image?",
"Where is this image taken?",
]
images = [image] * len(img_questions)
prompts = prompt_template(img_questions)
with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct",
max_model_len=4096,
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
enforce_eager=True) as vllm_model:
vllm_model.generate_greedy(prompts=prompts,
images=images,
max_tokens=64)
def test_multimodal_audio():
audio_prompt = "".join([
f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
for idx in range(2)
])
question = "What sport and what nursery rhyme are referenced?"
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n"
f"{audio_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
mm_data = {
"audio": [
asset.audio_and_sample_rate for asset in
[AudioAsset("mary_had_lamb"),
AudioAsset("winning_call")]
]
}
inputs = {"prompt": prompt, "multi_modal_data": mm_data}
sampling_params = SamplingParams(temperature=0.2,
max_tokens=10,
stop_token_ids=None)
with VllmRunner("Qwen/Qwen2-Audio-7B-Instruct",
max_model_len=4096,
max_num_seqs=5,
dtype="bfloat16",
limit_mm_per_prompt={"audio": 2},
gpu_memory_utilization=0.9) as runner:
runner.generate(inputs, sampling_params=sampling_params)