init v0.11.0rc0

2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions

View File

@@ -33,8 +33,8 @@ def test_bgmv_expand():
y_npu = y.npu()
y_out = bgmv_expand_cpu_impl(x, w, indices, y, 0, 128)
y_out_npu = torch.ops._C.bgmv_expand(x_npu, w_npu, indices_npu, y_npu, 0,
128)
y_out_npu = torch.ops._C_ascend.bgmv_expand(x_npu, w_npu, indices_npu,
y_npu, 0, 128)
# Compare the results.
torch.testing.assert_close(y_out_npu.cpu(),

View File

@@ -33,7 +33,7 @@ def test_bgmv_shrink():
y_npu = y.npu()
y = bgmv_shrink_cpu_impl(x, w, indices, y, 0.5)
torch.ops._C.bgmv_shrink(x_npu, w_npu, indices_npu, y_npu, 0.5)
torch.ops._C_ascend.bgmv_shrink(x_npu, w_npu, indices_npu, y_npu, 0.5)
# Compare the results.
torch.testing.assert_close(y_npu.cpu(),
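
The two hunks above (like the rotary_embedding and get_masked_input_and_mask hunks later in this commit) move the test call sites from the generic torch.ops._C namespace to the Ascend-specific torch.ops._C_ascend namespace. A minimal sketch of how a test could resolve an op from either namespace during the transition; the _resolve_custom_op helper is hypothetical and not part of this commit, and it assumes the extension that registers the op has already been imported:

import torch

def _resolve_custom_op(name: str):
    # Hypothetical helper (illustration only, not part of this commit):
    # prefer the renamed _C_ascend namespace and fall back to the legacy _C
    # namespace so the same test runs against pre- and post-rename builds.
    for namespace in (torch.ops._C_ascend, torch.ops._C):
        try:
            return getattr(namespace, name)
        except (AttributeError, RuntimeError):
            continue
    raise RuntimeError(f"custom op '{name}' is not registered")

# e.g. bgmv_expand = _resolve_custom_op("bgmv_expand")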

View File

@@ -28,12 +28,12 @@ import torch
import torch_npu
from vllm.model_executor.layers.activation import SiluAndMul
from vllm_ascend.ops.layers.experts_selector import select_experts
from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
TokenDispatcherWithAllGather
from vllm_ascend.ops.moe.experts_selector import select_experts
from vllm_ascend.ops.moe.moe_mlp import unified_apply_mlp
from vllm_ascend.ops.moe.token_dispatcher import TokenDispatcherWithAllGather
NUM_EXPERTS = [8, 64]
EP_SIZE = [1, 4]
EP_SIZE = [1]
TOP_KS = [2, 6]
DEVICE = ["npu"]
@@ -115,19 +115,6 @@ def test_token_dispatcher_with_all_gather(
w1_local = w1
w2_local = w2
if ep_size > 1:
local_e = e // ep_size
e_ids = torch.arange(local_e * 0,
local_e * (0 + 1),
device=device,
dtype=torch.int32)
expert_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
expert_map[e_ids] = torch.arange(local_e,
device=device,
dtype=torch.int32)
w1_local = w1[e_ids]
w2_local = w2[e_ids]
score = torch.softmax(score, dim=-1, dtype=dtype)
topk_weights, topk_ids = torch.topk(score, topk)
topk_ids = topk_ids.to(torch.int32)
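
Per the hunk header above (-115,19 +115,6), this change also drops the expert-parallel branch from the test: EP_SIZE is narrowed to [1], so no expert_map is built. For reference, a sketch of the general expert-map construction that the removed block specialized to rank 0; build_expert_map and ep_rank are illustrative names, not part of the commit:

import torch

def build_expert_map(e: int, ep_size: int, ep_rank: int, device: str):
    # Illustration only: experts owned by this EP rank get local ids
    # 0..local_e-1, all other experts are marked -1 (the removed block used
    # the same scheme with ep_rank fixed to 0).
    local_e = e // ep_size
    e_ids = torch.arange(ep_rank * local_e, (ep_rank + 1) * local_e,
                         device=device, dtype=torch.int32)
    expert_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
    expert_map[e_ids] = torch.arange(local_e, device=device, dtype=torch.int32)
    return expert_map
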
@@ -179,6 +166,87 @@ def test_token_dispatcher_with_all_gather(
torch.npu.reset_peak_memory_stats()
@pytest.mark.parametrize("m", [1, 33, 64])
@pytest.mark.parametrize("n", [128, 1024, 2048])
@pytest.mark.parametrize("k", [128, 511, 1024])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("ep_size", EP_SIZE)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("device", DEVICE)
def test_token_dispatcher_with_all_gather_quant(
m: int,
n: int,
k: int,
e: int,
topk: int,
ep_size: int,
dtype: torch.dtype,
device: str,
):
context_mock = MagicMock()
context_mock.fused_moe_state = 0
with patch("vllm_ascend.ops.moe.moe_mlp.get_forward_context",
return_value=context_mock):
a = torch.randn((m, k), device=device, dtype=dtype) / 10
w1 = torch.randn((e, k, 2 * n), device=device, dtype=torch.int8)
w1_scale = torch.empty((e, 2 * n), device=device, dtype=dtype)
w2 = torch.randn((e, n, k), device=device, dtype=torch.int8)
w2_scale = torch.empty((e, k), device=device, dtype=dtype)
score = torch.randn((m, e), device=device, dtype=dtype)
expert_map = None
local_e = e
score = torch.softmax(score, dim=-1, dtype=dtype)
topk_weights, topk_ids = torch.topk(score, topk)
topk_ids = topk_ids.to(torch.int32)
row_idx = (torch.arange(
0,
m * topk,
device=device,
dtype=torch.int32,
).view(topk, -1).permute(1, 0).contiguous())
dispatcher_kwargs = {
"num_experts": e,
"top_k": topk,
"num_local_experts": local_e,
}
dispatcher = TokenDispatcherWithAllGather(**dispatcher_kwargs)
apply_router_weight_on_input = False
dispatch_output = dispatcher.token_dispatch(
hidden_states=a,
topk_weights=topk_weights,
topk_ids=topk_ids,
row_idx=row_idx,
expert_map=expert_map,
apply_router_weight_on_input=apply_router_weight_on_input,
with_quant=True)
sorted_hidden_states = dispatch_output["hidden_states"]
group_list = dispatch_output["group_list"]
group_list_type = dispatch_output.get("group_list_type", 1)
dynamic_scale = dispatch_output["dynamic_scale"]
expert_output = unified_apply_mlp(hidden_states=sorted_hidden_states,
w1=w1,
w1_scale=w1_scale,
w2=w2,
w2_scale=w2_scale,
group_list=group_list,
group_list_type=group_list_type,
dynamic_scale=dynamic_scale,
with_quant=True)
combined_output = dispatcher.token_combine(hidden_states=expert_output,
bias=None)
assert combined_output.shape == (m, k)
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()
@pytest.mark.parametrize("m", [1, 33, 64])
@pytest.mark.parametrize("n", [128, 1024, 2048])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@@ -222,7 +290,7 @@ def test_select_experts(
dtype=torch.int32)
custom_routing_function.return_value = (mock_weights, mock_ids)
with patch("vllm_ascend.ops.layers.experts_selector._native_grouped_topk"
with patch("vllm_ascend.ops.moe.experts_selector._native_grouped_topk"
) as mock_native_grouped_topk:
mock_native_grouped_topk.side_effect = lambda x, num_groups, k: torch.randn_like(
x)
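
The hunks in this file track the MoE package reorganization: select_experts now comes from vllm_ascend.ops.moe.experts_selector (previously vllm_ascend.ops.layers.experts_selector), TokenDispatcherWithAllGather from vllm_ascend.ops.moe.token_dispatcher (previously vllm_ascend.ops.moe_dispatcher.token_dispatcher), and unified_apply_mlp from vllm_ascend.ops.moe.moe_mlp. A minimal sketch of a compatibility import an out-of-tree test could use while both layouts exist; the try/except fallback is an assumption, not something this commit provides:

# Illustration only, not part of this commit: prefer the reorganized
# vllm_ascend.ops.moe package and fall back to the pre-0.11 module layout.
try:
    from vllm_ascend.ops.moe.experts_selector import select_experts
    from vllm_ascend.ops.moe.token_dispatcher import \
        TokenDispatcherWithAllGather
except ImportError:
    from vllm_ascend.ops.layers.experts_selector import select_experts
    from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
        TokenDispatcherWithAllGather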

View File

@@ -1,175 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
import gc
from types import SimpleNamespace
import pytest
import torch
from vllm.model_executor.layers.fused_moe.config import ( # isort: skip
FusedMoEConfig, FusedMoEParallelConfig)
from vllm_ascend.distributed.moe_comm_method import ( # isort: skip
AllGatherCommImpl, NativeAllGatherCommImpl)
@pytest.mark.parametrize("num_tokens", [16, 128])
@pytest.mark.parametrize("hidden_size", [64, 128])
@pytest.mark.parametrize("global_num_experts", [8, 16])
@pytest.mark.parametrize("num_local_experts", [4, 8])
@pytest.mark.parametrize("top_k_num", [2, 4])
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("ep_rank", [0, 1])
@pytest.mark.parametrize("apply_a8_quantization", [False])
def test_all_gather_comm_impl(
num_tokens,
hidden_size,
global_num_experts,
num_local_experts,
top_k_num,
dtype,
ep_rank,
apply_a8_quantization,
mocker,
):
"""
Tests the AllGatherCommImpl against the NativeAllGatherCommImpl.
This test compares the outputs of the NPU-optimized AllGatherCommImpl
with a native PyTorch implementation (NativeAllGatherCommImpl) to ensure
correctness across various configurations.
"""
if top_k_num > global_num_experts:
pytest.skip("top_k_num cannot be greater than global_num_experts")
if num_local_experts > global_num_experts:
pytest.skip(
"num_local_experts cannot be greater than global_num_experts")
device = torch.device("npu")
# mock get_tensor_model_parallel_rank to return ep_rank
mocker.patch(
"vllm.model_executor.layers.fused_moe.config.get_tensor_model_parallel_rank",
return_value=ep_rank,
)
# make moe config
parallel_config = SimpleNamespace(
enable_expert_parallel=num_local_experts < global_num_experts)
moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
tp_size_=max(2, global_num_experts // num_local_experts),
dp_size_=1,
vllm_parallel_config=parallel_config,
)
moe_config = FusedMoEConfig(
num_experts=global_num_experts,
experts_per_token=top_k_num,
hidden_dim=hidden_size,
num_local_experts=num_local_experts,
moe_parallel_config=moe_parallel_config,
in_dtype=dtype,
quant_config=None, # No quantization in this test
max_num_tokens=num_tokens,
)
# Instantiate implementations
native_impl = NativeAllGatherCommImpl(moe_config)
all_gather_impl = AllGatherCommImpl(moe_config)
# --- Input Data ---
hidden_states = torch.randn(num_tokens,
hidden_size,
device=device,
dtype=dtype)
topk_ids = torch.randint(0,
global_num_experts, (num_tokens, top_k_num),
device=device,
dtype=torch.int32)
topk_weights = torch.rand(num_tokens, top_k_num, device=device).to(dtype)
topk_weights = torch.nn.functional.softmax(topk_weights, dim=1)
num_experts = global_num_experts
expert_map = None
if num_local_experts < global_num_experts:
# Create a map where some experts are local and some are not
expert_map = torch.full((global_num_experts, ), -1, device=device)
expert_map[ep_rank * num_local_experts:(ep_rank + 1) *
num_local_experts] = torch.arange(num_local_experts,
device=device)
num_experts = num_local_experts
# --- Run Native Implementation (Golden Reference) ---
native_hidden_states_out = hidden_states.clone()
(
native_permuted_hidden,
native_expert_tokens,
_,
_,
) = native_impl.permute(hidden_states, topk_ids, topk_weights, expert_map,
num_experts, apply_a8_quantization)
# Simulate MLP output
native_mlp_output = torch.randn_like(native_permuted_hidden)
native_impl.unpermute(native_mlp_output, native_hidden_states_out)
# --- Run AllGather Implementation ---
all_gather_hidden_states_out = hidden_states.clone()
(
all_gather_permuted_hidden,
all_gather_expert_tokens,
_,
_,
) = all_gather_impl.permute(hidden_states, topk_ids, topk_weights,
expert_map, num_experts, apply_a8_quantization)
# Use the same simulated MLP output for a fair comparison
all_gather_mlp_output = native_mlp_output.clone()
all_gather_impl.unpermute(all_gather_mlp_output,
all_gather_hidden_states_out)
# --- Assertions ---
# Define tolerance based on dtype
atol = 1e-3 if dtype == torch.float16 else 1e-2
rtol = 1e-3 if dtype == torch.float16 else 1e-2
# 1. Compare expert_tokens from pre_process
assert torch.allclose(native_expert_tokens.to(
all_gather_expert_tokens.device),
all_gather_expert_tokens,
atol=atol,
rtol=rtol), "Expert tokens do not match."
# 2. Compare permuted_hidden_states from pre_process
num_valid_tokens = native_expert_tokens.sum()
assert torch.allclose(native_permuted_hidden[:num_valid_tokens].to(
all_gather_permuted_hidden.device),
all_gather_permuted_hidden[:num_valid_tokens],
atol=atol,
rtol=rtol), "Permuted hidden states do not match."
# 3. Compare final hidden_states from post_process
assert torch.allclose(native_hidden_states_out.to(
all_gather_hidden_states_out.device),
all_gather_hidden_states_out,
atol=atol,
rtol=rtol), "Final hidden states do not match."
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()

View File

@@ -182,7 +182,7 @@ def test_rotary_embedding_quant_with_leading_dim(
)
ref_query, ref_key = rope.forward_native(positions, query, key)
query, key = torch.ops._C.rotary_embedding(
query, key = torch.ops._C_ascend.rotary_embedding(
positions,
query,
key,
@@ -239,7 +239,7 @@ class ModelwithRotaryEmbedding(nn.Module):
# we simulated a simple attention layer to test if it can be seamlessly captured into aclgraph
qkv = self.qkv_proj(hidden_states)
q, k, v = qkv.chunk(3, dim=-1)
query, key = torch.ops._C.rotary_embedding(
query, key = torch.ops._C_ascend.rotary_embedding(
positions,
q,
k,
@@ -299,7 +299,7 @@ def test_capture_rotary_embedding_in_aclgraph(
# Validate if the rotary_embedding custom kernel is indeed inside the graph by
# string match
graph = str(gm.graph)
assert "_C.rotary_embedding" in graph
assert "_C_ascend.rotary_embedding" in graph
return gm
static_positions = torch.randint(0, max_position_embeddings,
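
Besides the call-site renames, the last hunk above keeps the graph-capture check in sync: the aclgraph test stringifies the captured FX graph and asserts that the _C_ascend.rotary_embedding call survived compilation. A standalone sketch of that pattern; assert_op_in_graph is a hypothetical helper name:

import torch.fx

def assert_op_in_graph(gm: torch.fx.GraphModule, op_name: str) -> None:
    # Illustration only: verify a custom op such as "_C_ascend.rotary_embedding"
    # was captured into the traced graph instead of being decomposed or
    # replaced during compilation.
    graph_str = str(gm.graph)
    assert op_name in graph_str, f"{op_name} not found in the captured graph"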

View File

@@ -72,7 +72,7 @@ def test_get_masked_input_and_mask(
# Get custom op result
print("input_tensor:", input_tensor)
custom_masked_input, custom_mask = torch.ops._C.get_masked_input_and_mask(
custom_masked_input, custom_mask = torch.ops._C_ascend.get_masked_input_and_mask(
input_tensor, test_case["org_start"], test_case["org_end"],
test_case["padding"], test_case["added_start"], test_case["added_end"])