init v0.11.0rc0

2025-10-14 10:38:28 +08:00
parent 67afd0ea78
commit 66dc16f966
278 changed files with 28130 additions and 11708 deletions

View File

@@ -33,8 +33,8 @@ def test_bgmv_expand():
y_npu = y.npu()
y_out = bgmv_expand_cpu_impl(x, w, indices, y, 0, 128)
y_out_npu = torch.ops._C.bgmv_expand(x_npu, w_npu, indices_npu, y_npu, 0,
128)
y_out_npu = torch.ops._C_ascend.bgmv_expand(x_npu, w_npu, indices_npu,
y_npu, 0, 128)
# Compare the results.
torch.testing.assert_close(y_out_npu.cpu(),

View File

@@ -33,7 +33,7 @@ def test_bgmv_shrink():
y_npu = y.npu()
y = bgmv_shrink_cpu_impl(x, w, indices, y, 0.5)
torch.ops._C.bgmv_shrink(x_npu, w_npu, indices_npu, y_npu, 0.5)
torch.ops._C_ascend.bgmv_shrink(x_npu, w_npu, indices_npu, y_npu, 0.5)
# Compare the results.
torch.testing.assert_close(y_npu.cpu(),
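
The two hunks above (like the rotary_embedding and get_masked_input_and_mask hunks later in this commit) move the test call sites from the generic torch.ops._C namespace to the Ascend-specific torch.ops._C_ascend namespace. A minimal sketch of how a test could resolve an op from either namespace during the transition; the _resolve_custom_op helper is hypothetical and not part of this commit, and it assumes the extension that registers the op has already been imported:

import torch

def _resolve_custom_op(name: str):
    # Hypothetical helper (illustration only, not part of this commit):
    # prefer the renamed _C_ascend namespace and fall back to the legacy _C
    # namespace so the same test runs against pre- and post-rename builds.
    for namespace in (torch.ops._C_ascend, torch.ops._C):
        try:
            return getattr(namespace, name)
        except (AttributeError, RuntimeError):
            continue
    raise RuntimeError(f"custom op '{name}' is not registered")

# e.g. bgmv_expand = _resolve_custom_op("bgmv_expand")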

View File

@@ -28,12 +28,12 @@ import torch
import torch_npu
from vllm.model_executor.layers.activation import SiluAndMul
from vllm_ascend.ops.layers.experts_selector import select_experts
from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
TokenDispatcherWithAllGather
from vllm_ascend.ops.moe.experts_selector import select_experts
from vllm_ascend.ops.moe.moe_mlp import unified_apply_mlp
from vllm_ascend.ops.moe.token_dispatcher import TokenDispatcherWithAllGather
NUM_EXPERTS = [8, 64]
EP_SIZE = [1, 4]
EP_SIZE = [1]
TOP_KS = [2, 6]
DEVICE = ["npu"]
@@ -115,19 +115,6 @@ def test_token_dispatcher_with_all_gather(
w1_local = w1
w2_local = w2
if ep_size > 1:
local_e = e // ep_size
e_ids = torch.arange(local_e * 0,
local_e * (0 + 1),
device=device,
dtype=torch.int32)
expert_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
expert_map[e_ids] = torch.arange(local_e,
device=device,
dtype=torch.int32)
w1_local = w1[e_ids]
w2_local = w2[e_ids]
score = torch.softmax(score, dim=-1, dtype=dtype)
topk_weights, topk_ids = torch.topk(score, topk)
topk_ids = topk_ids.to(torch.int32)
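
Per the hunk header above (-115,19 +115,6), this change also drops the expert-parallel branch from the test: EP_SIZE is narrowed to [1], so no expert_map is built. For reference, a sketch of the general expert-map construction that the removed block specialized to rank 0; build_expert_map and ep_rank are illustrative names, not part of the commit:

import torch

def build_expert_map(e: int, ep_size: int, ep_rank: int, device: str):
    # Illustration only: experts owned by this EP rank get local ids
    # 0..local_e-1, all other experts are marked -1 (the removed block used
    # the same scheme with ep_rank fixed to 0).
    local_e = e // ep_size
    e_ids = torch.arange(ep_rank * local_e, (ep_rank + 1) * local_e,
                         device=device, dtype=torch.int32)
    expert_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
    expert_map[e_ids] = torch.arange(local_e, device=device, dtype=torch.int32)
    return expert_map
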
@@ -179,6 +166,87 @@ def test_token_dispatcher_with_all_gather(
torch.npu.reset_peak_memory_stats()
@pytest.mark.parametrize("m", [1, 33, 64])
@pytest.mark.parametrize("n", [128, 1024, 2048])
@pytest.mark.parametrize("k", [128, 511, 1024])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("ep_size", EP_SIZE)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("device", DEVICE)
def test_token_dispatcher_with_all_gather_quant(
m: int,
n: int,
k: int,
e: int,
topk: int,
ep_size: int,
dtype: torch.dtype,
device: str,
):
context_mock = MagicMock()
context_mock.fused_moe_state = 0
with patch("vllm_ascend.ops.moe.moe_mlp.get_forward_context",
return_value=context_mock):
a = torch.randn((m, k), device=device, dtype=dtype) / 10
w1 = torch.randn((e, k, 2 * n), device=device, dtype=torch.int8)
w1_scale = torch.empty((e, 2 * n), device=device, dtype=dtype)
w2 = torch.randn((e, n, k), device=device, dtype=torch.int8)
w2_scale = torch.empty((e, k), device=device, dtype=dtype)
score = torch.randn((m, e), device=device, dtype=dtype)
expert_map = None
local_e = e
score = torch.softmax(score, dim=-1, dtype=dtype)
topk_weights, topk_ids = torch.topk(score, topk)
topk_ids = topk_ids.to(torch.int32)
row_idx = (torch.arange(
0,
m * topk,
device=device,
dtype=torch.int32,
).view(topk, -1).permute(1, 0).contiguous())
dispatcher_kwargs = {
"num_experts": e,
"top_k": topk,
"num_local_experts": local_e,
}
dispatcher = TokenDispatcherWithAllGather(**dispatcher_kwargs)
apply_router_weight_on_input = False
dispatch_output = dispatcher.token_dispatch(
hidden_states=a,
topk_weights=topk_weights,
topk_ids=topk_ids,
row_idx=row_idx,
expert_map=expert_map,
apply_router_weight_on_input=apply_router_weight_on_input,
with_quant=True)
sorted_hidden_states = dispatch_output["hidden_states"]
group_list = dispatch_output["group_list"]
group_list_type = dispatch_output.get("group_list_type", 1)
dynamic_scale = dispatch_output["dynamic_scale"]
expert_output = unified_apply_mlp(hidden_states=sorted_hidden_states,
w1=w1,
w1_scale=w1_scale,
w2=w2,
w2_scale=w2_scale,
group_list=group_list,
group_list_type=group_list_type,
dynamic_scale=dynamic_scale,
with_quant=True)
combined_output = dispatcher.token_combine(hidden_states=expert_output,
bias=None)
assert combined_output.shape == (m, k)
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()
@pytest.mark.parametrize("m", [1, 33, 64])
@pytest.mark.parametrize("n", [128, 1024, 2048])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@@ -222,7 +290,7 @@ def test_select_experts(
dtype=torch.int32)
custom_routing_function.return_value = (mock_weights, mock_ids)
with patch("vllm_ascend.ops.layers.experts_selector._native_grouped_topk"
with patch("vllm_ascend.ops.moe.experts_selector._native_grouped_topk"
) as mock_native_grouped_topk:
mock_native_grouped_topk.side_effect = lambda x, num_groups, k: torch.randn_like(
x)
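
The hunks in this file track the MoE package reorganization: select_experts now comes from vllm_ascend.ops.moe.experts_selector (previously vllm_ascend.ops.layers.experts_selector), TokenDispatcherWithAllGather from vllm_ascend.ops.moe.token_dispatcher (previously vllm_ascend.ops.moe_dispatcher.token_dispatcher), and unified_apply_mlp from vllm_ascend.ops.moe.moe_mlp. A minimal sketch of a compatibility import an out-of-tree test could use while both layouts exist; the try/except fallback is an assumption, not something this commit provides:

# Illustration only, not part of this commit: prefer the reorganized
# vllm_ascend.ops.moe package and fall back to the pre-0.11 module layout.
try:
    from vllm_ascend.ops.moe.experts_selector import select_experts
    from vllm_ascend.ops.moe.token_dispatcher import \
        TokenDispatcherWithAllGather
except ImportError:
    from vllm_ascend.ops.layers.experts_selector import select_experts
    from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
        TokenDispatcherWithAllGather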

View File

@@ -1,175 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
import gc
from types import SimpleNamespace
import pytest
import torch
from vllm.model_executor.layers.fused_moe.config import ( # isort: skip
FusedMoEConfig, FusedMoEParallelConfig)
from vllm_ascend.distributed.moe_comm_method import ( # isort: skip
AllGatherCommImpl, NativeAllGatherCommImpl)
@pytest.mark.parametrize("num_tokens", [16, 128])
@pytest.mark.parametrize("hidden_size", [64, 128])
@pytest.mark.parametrize("global_num_experts", [8, 16])
@pytest.mark.parametrize("num_local_experts", [4, 8])
@pytest.mark.parametrize("top_k_num", [2, 4])
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("ep_rank", [0, 1])
@pytest.mark.parametrize("apply_a8_quantization", [False])
def test_all_gather_comm_impl(
num_tokens,
hidden_size,
global_num_experts,
num_local_experts,
top_k_num,
dtype,
ep_rank,
apply_a8_quantization,
mocker,
):
"""
Tests the AllGatherCommImpl against the NativeAllGatherCommImpl.
This test compares the outputs of the NPU-optimized AllGatherCommImpl
with a native PyTorch implementation (NativeAllGatherCommImpl) to ensure
correctness across various configurations.
"""
if top_k_num > global_num_experts:
pytest.skip("top_k_num cannot be greater than global_num_experts")
if num_local_experts > global_num_experts:
pytest.skip(
"num_local_experts cannot be greater than global_num_experts")
device = torch.device("npu")
# mock get_tensor_model_parallel_rank to return ep_rank
mocker.patch(
"vllm.model_executor.layers.fused_moe.config.get_tensor_model_parallel_rank",
return_value=ep_rank,
)
# make moe config
parallel_config = SimpleNamespace(
enable_expert_parallel=num_local_experts < global_num_experts)
moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
tp_size_=max(2, global_num_experts // num_local_experts),
dp_size_=1,
vllm_parallel_config=parallel_config,
)
moe_config = FusedMoEConfig(
num_experts=global_num_experts,
experts_per_token=top_k_num,
hidden_dim=hidden_size,
num_local_experts=num_local_experts,
moe_parallel_config=moe_parallel_config,
in_dtype=dtype,
quant_config=None, # No quantization in this test
max_num_tokens=num_tokens,
)
# Instantiate implementations
native_impl = NativeAllGatherCommImpl(moe_config)
all_gather_impl = AllGatherCommImpl(moe_config)
# --- Input Data ---
hidden_states = torch.randn(num_tokens,
hidden_size,
device=device,
dtype=dtype)
topk_ids = torch.randint(0,
global_num_experts, (num_tokens, top_k_num),
device=device,
dtype=torch.int32)
topk_weights = torch.rand(num_tokens, top_k_num, device=device).to(dtype)
topk_weights = torch.nn.functional.softmax(topk_weights, dim=1)
num_experts = global_num_experts
expert_map = None
if num_local_experts < global_num_experts:
# Create a map where some experts are local and some are not
expert_map = torch.full((global_num_experts, ), -1, device=device)
expert_map[ep_rank * num_local_experts:(ep_rank + 1) *
num_local_experts] = torch.arange(num_local_experts,
device=device)
num_experts = num_local_experts
# --- Run Native Implementation (Golden Reference) ---
native_hidden_states_out = hidden_states.clone()
(
native_permuted_hidden,
native_expert_tokens,
_,
_,
) = native_impl.permute(hidden_states, topk_ids, topk_weights, expert_map,
num_experts, apply_a8_quantization)
# Simulate MLP output
native_mlp_output = torch.randn_like(native_permuted_hidden)
native_impl.unpermute(native_mlp_output, native_hidden_states_out)
# --- Run AllGather Implementation ---
all_gather_hidden_states_out = hidden_states.clone()
(
all_gather_permuted_hidden,
all_gather_expert_tokens,
_,
_,
) = all_gather_impl.permute(hidden_states, topk_ids, topk_weights,
expert_map, num_experts, apply_a8_quantization)
# Use the same simulated MLP output for a fair comparison
all_gather_mlp_output = native_mlp_output.clone()
all_gather_impl.unpermute(all_gather_mlp_output,
all_gather_hidden_states_out)
# --- Assertions ---
# Define tolerance based on dtype
atol = 1e-3 if dtype == torch.float16 else 1e-2
rtol = 1e-3 if dtype == torch.float16 else 1e-2
# 1. Compare expert_tokens from pre_process
assert torch.allclose(native_expert_tokens.to(
all_gather_expert_tokens.device),
all_gather_expert_tokens,
atol=atol,
rtol=rtol), "Expert tokens do not match."
# 2. Compare permuted_hidden_states from pre_process
num_valid_tokens = native_expert_tokens.sum()
assert torch.allclose(native_permuted_hidden[:num_valid_tokens].to(
all_gather_permuted_hidden.device),
all_gather_permuted_hidden[:num_valid_tokens],
atol=atol,
rtol=rtol), "Permuted hidden states do not match."
# 3. Compare final hidden_states from post_process
assert torch.allclose(native_hidden_states_out.to(
all_gather_hidden_states_out.device),
all_gather_hidden_states_out,
atol=atol,
rtol=rtol), "Final hidden states do not match."
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()

View File

@@ -182,7 +182,7 @@ def test_rotary_embedding_quant_with_leading_dim(
)
ref_query, ref_key = rope.forward_native(positions, query, key)
query, key = torch.ops._C.rotary_embedding(
query, key = torch.ops._C_ascend.rotary_embedding(
positions,
query,
key,
@@ -239,7 +239,7 @@ class ModelwithRotaryEmbedding(nn.Module):
# we simulated a simple attention layer to test if it can be seamlessly captured into aclgraph
qkv = self.qkv_proj(hidden_states)
q, k, v = qkv.chunk(3, dim=-1)
query, key = torch.ops._C.rotary_embedding(
query, key = torch.ops._C_ascend.rotary_embedding(
positions,
q,
k,
@@ -299,7 +299,7 @@ def test_capture_rotary_embedding_in_aclgraph(
# Validate if the rotary_embedding custom kernel is indeed inside the graph by
# string match
graph = str(gm.graph)
assert "_C.rotary_embedding" in graph
assert "_C_ascend.rotary_embedding" in graph
return gm
static_positions = torch.randint(0, max_position_embeddings,
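
Besides the call-site renames, the last hunk above keeps the graph-capture check in sync: the aclgraph test stringifies the captured FX graph and asserts that the _C_ascend.rotary_embedding call survived compilation. A standalone sketch of that pattern; assert_op_in_graph is a hypothetical helper name:

import torch.fx

def assert_op_in_graph(gm: torch.fx.GraphModule, op_name: str) -> None:
    # Illustration only: verify a custom op such as "_C_ascend.rotary_embedding"
    # was captured into the traced graph instead of being decomposed or
    # replaced during compilation.
    graph_str = str(gm.graph)
    assert op_name in graph_str, f"{op_name} not found in the captured graph"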

View File

@@ -72,7 +72,7 @@ def test_get_masked_input_and_mask(
# Get custom op result
print("input_tensor:", input_tensor)
custom_masked_input, custom_mask = torch.ops._C.get_masked_input_and_mask(
custom_masked_input, custom_mask = torch.ops._C_ascend.get_masked_input_and_mask(
input_tensor, test_case["org_start"], test_case["org_end"],
test_case["padding"], test_case["added_start"], test_case["added_end"])