#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from dataclasses import dataclass, field
from functools import wraps
from typing import Callable, Optional

import torch
import torch.nn.functional as F
from vllm.config import get_current_vllm_config
from vllm.distributed import (get_dp_group, get_ep_group, get_tp_group,
                              tensor_model_parallel_all_reduce)
from vllm.forward_context import get_forward_context
from vllm.logger import logger
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.layer import (
    FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map)
from vllm.model_executor.layers.fused_moe.shared_fused_moe import \
    SharedFusedMoE

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.ascend_forward_context import MoECommType
from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
from vllm_ascend.flash_common3_context import (get_flash_common3_context,
                                               set_flash_common3_context)
from vllm_ascend.ops.fused_moe.experts_selector import (select_experts,
                                                        zero_experts_compute)
from vllm_ascend.ops.fused_moe.moe_comm_method import (AllGatherCommImpl,
                                                       FusedExpertsResult,
                                                       setup_moe_comm_method)
from vllm_ascend.ops.fused_moe.prepare_finalize import QuantType
from vllm_ascend.utils import (AscendDeviceType, enable_sp,
                               get_ascend_device_type, maybe_trans_nz,
                               npu_stream_switch, shared_expert_dp_enabled,
                               shared_experts_calculation_stream,
                               vllm_version_is)
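

# Lightweight containers for the routed-expert output and the NPU events
# recorded before dispatch/combine (see forward_impl's return_with_event flag);
# callers can wait on these events to synchronize or overlap other work.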
@dataclass
class FusedMoEResult:
    routed_out: torch.Tensor
    before_dispatch_evt: torch.npu.Event | None = None
    before_combine_evt: torch.npu.Event | None = None


@dataclass
class FusedMoEEvents:
    before_routed_experts: torch.npu.Event
    before_dispatch: torch.npu.Event | None = field(default=None)
    before_combine: torch.npu.Event | None = field(default=None)


class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):

    def __init__(self, moe: FusedMoEConfig = None):
        super().__init__(moe=moe)
        self.dynamic_eplb = get_ascend_config().eplb_config.dynamic_eplb

    def process_weights_after_loading(self, layer):
        super(UnquantizedFusedMoEMethod,
              self).process_weights_after_loading(layer)
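
        # The expert weights are transposed once here (swapping their last two
        # dims) and kept contiguous; maybe_trans_nz then converts them to NZ
        # layout when the current device benefits from it.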
        w13_data = self._maybe_pad_weight(layer.w13_weight.data).transpose(
            1, 2).contiguous()
        layer.w13_weight = torch.nn.Parameter(w13_data, requires_grad=False)

        w2_data = self._maybe_pad_weight(layer.w2_weight.data).transpose(
            1, 2).contiguous()
        layer.w2_weight = torch.nn.Parameter(w2_data, requires_grad=False)

        layer.w13_weight.data = maybe_trans_nz(layer.w13_weight.data)
        layer.w2_weight.data = maybe_trans_nz(layer.w2_weight.data)

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              use_grouped_topk: bool,
              top_k: int,
              router_logits: torch.Tensor,
              renormalize: bool,
              topk_group: Optional[int] = None,
              num_expert_group: Optional[int] = None,
              custom_routing_function: Optional[Callable] = None,
              scoring_func: str = "softmax",
              routed_scaling_factor: float = 1.0,
              e_score_correction_bias: Optional[torch.Tensor] = None,
              global_num_experts: int = -1,
              expert_map: Optional[torch.Tensor] = None,
              apply_router_weight_on_input: bool = False,
              enable_force_load_balance: bool = False,
              log2phy: torch.Tensor = None,
              **kwargs) -> torch.Tensor:
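        # Some models define "zero experts" whose contribution is computed
        # outside the routed-expert path; pick up those attributes if the layer
        # provides them.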
        zero_expert_num = getattr(layer, "zero_expert_num", 0)
        zero_expert_type = getattr(layer, "zero_expert_type", None)
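        # Route the tokens: select_experts returns each token's top-k expert ids
        # and routing weights under the configured scoring/grouping scheme.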
        topk_weights, topk_ids = select_experts(
            hidden_states=x,
            router_logits=router_logits,
            top_k=top_k,
            use_grouped_topk=use_grouped_topk,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            routed_scaling_factor=routed_scaling_factor,
            e_score_correction_bias=e_score_correction_bias,
            global_num_experts=global_num_experts)

        if zero_expert_num > 0 and zero_expert_type is not None:
            topk_ids, topk_weights, zero_expert_result = zero_experts_compute(
                expert_indices=topk_ids,
                expert_scales=topk_weights,
                num_experts=global_num_experts,
                zero_expert_type=zero_expert_type,
                hidden_states=x,
            )

        topk_weights = topk_weights.to(x.dtype)
        # This is a naive implementation of expert load balancing that avoids
        # accumulating too many tokens on a single rank; it is only activated
        # during profile runs.
        if enable_force_load_balance:
            random_matrix = torch.rand(topk_ids.size(0),
                                       global_num_experts,
                                       device=topk_ids.device)
            topk_ids = torch.argsort(
                random_matrix, dim=1)[:, :topk_ids.size(1)].to(topk_ids.dtype)
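
        # Dispatch tokens to their experts, run the expert MLPs and combine the
        # results using whichever communication backend the forward context
        # selected for this step.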
        moe_comm_method = get_forward_context().moe_comm_method
        final_hidden_states = moe_comm_method.fused_experts(
            hidden_states=x,
            w1=layer.w13_weight,
            w2=layer.w2_weight,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            expert_map=expert_map,
            apply_router_weight_on_input=apply_router_weight_on_input,
            dynamic_eplb=self.dynamic_eplb,
            log2phy=log2phy,
            mc2_mask=kwargs.get("mc2_mask", None))
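        # Add back the contribution of the zero experts computed above.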
        if zero_expert_num > 0 and zero_expert_type is not None:
            final_hidden_states += zero_expert_result
        return final_hidden_states


class AscendFusedMoE(FusedMoE):
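    # moe_counter hands each AscendFusedMoE layer a stable instance id (used
    # when initializing its EPLB placement); gate_stream is a single NPU side
    # stream shared across layers for multistream gate/shared-expert overlap.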
    moe_counter = -1
    gate_stream: Optional[torch.npu.Stream] = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        num_experts = kwargs["num_experts"]
        intermediate_size = kwargs["intermediate_size"]

        AscendFusedMoE.moe_counter += 1
        self.moe_instance_id = AscendFusedMoE.moe_counter

        self._expert_map = None
        self.log2phy = None

        if self.quant_config is None:
            self.quant_method = AscendUnquantizedFusedMoEMethod(
                self.moe_config)
        else:
            self.quant_method = self.quant_config.get_quant_method(
                self, self.layer_name)

        assert self.quant_method is not None

        self.moe_config.tp_group = get_tp_group()
        self.moe_config.dp_group = get_dp_group()
        self.moe_config.ep_group = get_ep_group()
        self.moe_config.mc2_group = get_mc2_group()
        self.moe_config.supports_eplb = self.quant_method.supports_eplb
        ascend_config = get_ascend_config()
        # flashcommon3: side stream used to overlap the gate/shared-experts
        # computation with the routed experts.
        self.multistream_overlap_gate = ascend_config.multistream_overlap_gate
        if self.multistream_overlap_gate and AscendFusedMoE.gate_stream is None:
            AscendFusedMoE.gate_stream = torch.npu.Stream()
        if self.custom_routing_function is None and self.e_score_correction_bias is not None:
            vllm_config = get_current_vllm_config()
            self.e_score_correction_bias.data = self.e_score_correction_bias.data.to(
                dtype=vllm_config.model_config.dtype)

        # Initialize the EPLB placement for this MoE layer: init_eplb_config
        # returns the global expert map, this rank's local expert map, the
        # logical-to-physical expert map and the number of redundant experts.
        eplb_config = ascend_config.eplb_config
        self.global_expert_map, self._expert_map, self.log2phy, self.global_redundant_expert_num = init_eplb_config(
            eplb_config, self.moe_instance_id, self.moe_config)
        self.global_num_experts = num_experts + self.global_redundant_expert_num
        self.dynamic_eplb = eplb_config.dynamic_eplb and (self.log2phy
                                                          is not None)
        self.local_num_experts = (torch.sum(
            self._expert_map != -1).item() if self._expert_map is not None else
                                  self.global_num_experts)
        if self._expert_map is not None:
            logger.info_once(
                "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
                " number of experts: %s/%s. Experts local to global index map:"
                " %s.", self.ep_rank, self.ep_size, self.local_num_experts,
                self.global_num_experts,
                get_compressed_expert_map(self._expert_map))
        if self.dynamic_eplb:
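            # Per-expert load counters maintained while dynamic EPLB is enabled
            # (reset via clear_moe_load).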
            self.moe_load = torch.zeros(self.local_num_experts,
                                        dtype=torch.int64).npu()

        self.moe_config.num_experts = self.global_num_experts
        self.moe_config.num_local_experts = self.local_num_experts
        self.moe_config.global_redundant_expert_num = self.global_redundant_expert_num
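
        # Arguments handed to the quantization method when it creates the
        # per-expert weights for this layer.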
        moe_quant_params = {
            "num_experts": self.local_num_experts,
            "hidden_size": self.hidden_size,
            "intermediate_size_per_partition":
            self.intermediate_size_per_partition,
            "params_dtype": self.params_dtype,
            "weight_loader": self.weight_loader,
        }
        # need full intermediate size pre-sharding for WNA16 act order
        if (self.quant_method.__class__.__name__
                in ("GPTQMarlinMoEMethod", "CompressedTensorsWNA16MoEMethod")):
            moe_quant_params["intermediate_size_full"] = intermediate_size
        self.quant_method.create_weights(layer=self, **moe_quant_params)

        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
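
        # Register the MoE communication implementations for this config; the
        # concrete method used each step is exposed later through
        # get_forward_context().moe_comm_method.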
        setup_moe_comm_method(self.moe_config)
        self.quant_type = self._get_quant_type()
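
    # Map the quantization scheme attached by the quant config onto the
    # prepare/finalize QuantType enum; anything unrecognized falls back to
    # QuantType.NONE.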
    def _get_quant_type(self) -> QuantType:
        quant_method = self.quant_method
        if not hasattr(quant_method,
                       "quant_method") or quant_method.quant_method is None:
            return QuantType.NONE

        method = quant_method.quant_method
if hasattr(method, "quant_type"):
|
|
|
|
|
from vllm_ascend.quantization.methods.base import \
|
|
|
|
|
QuantType as SchemeQuantType
|
|
|
|
|
scheme_quant_type = method.quant_type
|
|
|
|
|
if scheme_quant_type == SchemeQuantType.W8A8:
|
|
|
|
|
return QuantType.W8A8
|
|
|
|
|
elif scheme_quant_type == SchemeQuantType.W4A8:
|
|
|
|
|
return QuantType.W4A8
|
|
|
|
|
|
|
|
|
|
return QuantType.NONE
|
2025-08-26 19:05:23 +08:00
|
|
|
|
2025-09-17 10:36:43 +08:00
|
|
|
def update_expert_map(self, new_expert_map):
|
2025-12-15 19:54:23 +08:00
|
|
|
self._expert_map = new_expert_map
|
2025-09-17 10:36:43 +08:00
|
|
|
|
|
|
|
|
def get_log2phy_map(self):
|
2025-11-29 15:18:29 +08:00
|
|
|
return self.log2phy
|
2025-09-17 10:36:43 +08:00
|
|
|
|
|
|
|
|
def clear_moe_load(self):
|
|
|
|
|
if self.moe_load is not None:
|
|
|
|
|
self.moe_load.zero_()
|
|
|
|
|
|
2025-09-09 18:19:56 +08:00
|
|
|
def maybe_all_reduce_tensor_model_parallel(
|
|
|
|
|
self, final_hidden_states: torch.Tensor):
|
|
|
|
|
"""NOTE(Yizhou): This is to override the parent class method. In `mc2commimpl`,
|
|
|
|
|
and `alltoallcommimpl`, we do not need to all-reduce the final outputs since
|
|
|
|
|
the outputs are already aggregated across tensor parallel ranks in the
|
|
|
|
|
`finalize` function. In `allgathercommimpl`, we still need to all-reduce the
|
|
|
|
|
outputs since each rank only has partial outputs.
|
|
|
|
|
"""
|
2025-09-28 21:31:55 +08:00
|
|
|
return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(
|
|
|
|
|
final_hidden_states)
|
2025-09-09 18:19:56 +08:00
|
|
|
|
2026-01-17 11:53:22 +08:00
|
|
|
def forward_impl( # type: ignore[override]
|
|
|
|
|
self,
|
|
|
|
|
hidden_states: torch.Tensor,
|
|
|
|
|
router_logits: torch.Tensor,
|
|
|
|
|
return_with_event: bool = False) -> torch.Tensor | FusedMoEResult:
|
2025-08-26 19:05:23 +08:00
|
|
|
assert self.quant_method is not None
|
|
|
|
|
|
2025-10-09 14:12:46 +08:00
|
|
|
forward_context = get_forward_context()
|
2025-10-10 09:00:07 +08:00
|
|
|
|
|
|
|
|
# Load balancing for token distribution among experts in dummy_run
|
|
|
|
|
# TODO: The community only considers load balancing when DP > 1.
|
|
|
|
|
# This approach may overlook some extreme scenarios.
|
2025-10-09 14:12:46 +08:00
|
|
|
enable_force_load_balance = forward_context.in_profile_run
|
|
|
|
|
|
2025-08-26 19:05:23 +08:00
|
|
|
forward_context = get_forward_context()
|
2025-12-14 09:34:13 +08:00
|
|
|
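# Run the shared experts (and gate) on a side stream so their computation
# overlaps with expert selection and dispatch on the default stream.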
if self.multistream_overlap_gate:
|
|
|
|
|
assert AscendFusedMoE.gate_stream is not None
|
|
|
|
|
fc3_context = get_flash_common3_context()
|
|
|
|
|
assert fc3_context is not None
|
|
|
|
|
AscendFusedMoE.gate_stream.wait_stream(torch.npu.current_stream())
|
|
|
|
|
with npu_stream_switch(AscendFusedMoE.gate_stream,
|
|
|
|
|
enabled=self.multistream_overlap_gate):
|
|
|
|
|
# share_expert
|
|
|
|
|
assert fc3_context.shared_experts is not None
|
|
|
|
|
shared_out = fc3_context.shared_experts(hidden_states)
|
|
|
|
|
# NOTE: This is exactly the opposite of `maybe_all_reduce_tensor_model_parallel`
|
|
|
|
|
moe_comm_type = forward_context.moe_comm_type
|
2025-12-21 15:23:59 +08:00
|
|
|
if moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2, MoECommType.FUSED_MC2} \
|
2025-12-14 09:34:13 +08:00
|
|
|
and not shared_expert_dp_enabled():
|
|
|
|
|
shared_out = tensor_model_parallel_all_reduce(shared_out)
|
|
|
|
|
set_flash_common3_context(shared_out=shared_out)
|
|
|
|
|
|
|
|
|
|
topk_weights, topk_ids = select_experts(
|
|
|
|
|
hidden_states=hidden_states,
|
|
|
|
|
router_logits=router_logits,
|
|
|
|
|
top_k=self.top_k,
|
|
|
|
|
use_grouped_topk=self.use_grouped_topk,
|
|
|
|
|
renormalize=self.renormalize,
|
|
|
|
|
topk_group=self.topk_group,
|
|
|
|
|
num_expert_group=self.num_expert_group,
|
|
|
|
|
custom_routing_function=self.custom_routing_function,
|
|
|
|
|
scoring_func=self.scoring_func,
|
|
|
|
|
routed_scaling_factor=self.routed_scaling_factor,
|
|
|
|
|
e_score_correction_bias=self.e_score_correction_bias,
|
|
|
|
|
global_num_experts=self.global_num_experts)
|
|
|
|
|
|
|
|
|
|
if isinstance(forward_context.moe_comm_method,
|
|
|
|
|
AllGatherCommImpl):
|
|
|
|
|
topk_weights = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(
|
|
|
|
|
topk_weights, True, True)
|
|
|
|
|
topk_ids = torch.ops.vllm.maybe_all_gather_and_maybe_unpad(
|
|
|
|
|
topk_ids, True, True)
|
|
|
|
|
|
|
|
|
|
set_flash_common3_context(topk_weights=topk_weights,
|
|
|
|
|
topk_ids=topk_ids)
|
|
|
|
|
|
2025-10-22 11:41:30 +08:00
|
|
|
hidden_states, router_logits, mc2_mask, context_metadata = forward_context.moe_comm_method.prepare(
|
2025-09-24 11:29:59 +08:00
|
|
|
hidden_states=hidden_states,
|
|
|
|
|
router_logits=router_logits,
|
2025-10-09 14:12:46 +08:00
|
|
|
replace_allreduce=forward_context.sp_enabled,
|
2025-11-13 11:02:31 +08:00
|
|
|
enable_shared_expert_dp=self.enable_shared_expert_dp,
|
|
|
|
|
quant_type=self.quant_type)
|
2025-08-26 19:05:23 +08:00
|
|
|
|
2025-12-14 09:34:13 +08:00
|
|
|
# Make sure the default stream waits for the gate stream to finish.
|
|
|
|
|
if self.multistream_overlap_gate:
|
|
|
|
|
torch.npu.current_stream().wait_stream(AscendFusedMoE.gate_stream)
|
|
|
|
|
|
2025-11-04 16:49:58 +08:00
|
|
|
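# prepare() may return a (hidden_states, per-token quantization scale) tuple
# on dynamically quantized paths; unpack the scale in that case.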
if isinstance(hidden_states, tuple):
|
|
|
|
|
hidden_states, pertoken_scale = hidden_states
|
|
|
|
|
else:
|
|
|
|
|
pertoken_scale = None
|
|
|
|
|
|
2025-08-26 19:05:23 +08:00
|
|
|
# Matrix multiply.
|
2025-12-31 14:24:37 +08:00
|
|
|
fused_experts_results: FusedExpertsResult = self.quant_method.apply(
|
2025-08-26 19:05:23 +08:00
|
|
|
layer=self,
|
|
|
|
|
x=hidden_states,
|
|
|
|
|
router_logits=router_logits,
|
2025-11-04 16:49:58 +08:00
|
|
|
pertoken_scale=pertoken_scale,
|
2025-08-26 19:05:23 +08:00
|
|
|
top_k=self.top_k,
|
|
|
|
|
renormalize=self.renormalize,
|
|
|
|
|
use_grouped_topk=self.use_grouped_topk,
|
|
|
|
|
global_num_experts=self.global_num_experts,
|
[EPLB][refactor] Modification of the initialization logic for expert_map and log2phy(depend on pr5285) (#5311)
### What this PR does / why we need it?
Unify the loading logic for expert_map and log2phy.
1. The map generated when enabling the redundancy expert is incorrect.
The community generation map function only accepts the number of global
experts. When we pass in the number of logical experts plus redundant
experts, the local expert ID of the last card will index to an expert ID
that does not exist. Now we ensure that the index points to a real
existing expert ID, and each expert can be accessed. Moreover, when
redundant experts are not enabled, the output of our function remains
consistent with the community's function.
2. The map we generate is based on the length of the physical expert,
but in reality, we only need to use the length of the logical expert.
Later on, we will need to pad it accordingly, so we can simply generate
a map with the length of the logical [expert.]
3. Unify the initialization logic across different scenarios and
simplify the code for fused_moe.
**Before refactoring**
- map path is not None:
expert map: get_rank_placement_map from _'expert_load_balancer.py'_,
maintains the map for all ranks and all layers.
log2phy: get_rank_log2phy_map from _'expert_load_balancer.py'_,
maintains the map for all ranks and all layers.
- map path is None:
expert map: determine_expert_map from '_vllm.laye_r', The function does
not support the redundant experts of vllm-ascend.
log2phy: determine_default_log2phy_map from _'eplb_utils.py'_. The
function does not support the redundant experts of vllm-ascend.
**Refactoring**
eplb_utils.py
init_eplb_config
generate placement
generate expert map
generate log2phy
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Expert Mapping Test Generation:
ep size: 16, num of experts: 256, num of redundant experts: 16
+++++++++++++++++++++++++++++++++++++++++
Expert Mapping (Non-1 indicates the expert responsible for this rank)
for Rank 15:
vllm map:
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0 1 2 3 4 5 6 7 8
9 10 11 12 13 14 15 16]
+++++++++++++++++++++++++++++++++++++++++
Improved map:
[16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
Expert Mapping Test Generation:
ep size: 16, num of experts: 256, num of redundant experts: 0
+++++++++++++++++++++++++++++++++++++++++
Expert Mapping (Non-1 indicates the expert responsible for this rank)
for Rank 15:
vllm map:
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+++++++++++++++++++++++++++++++++++++++
Improved map:
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
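
For context, the sketch below shows the general shape of such a per-rank expert map: -1 marks experts this rank does not host, and the remaining entries are local expert IDs. It assumes contiguous placement without redundant experts and is only an illustration, not the `eplb_utils.py` implementation.

```python
import torch


def build_expert_map(ep_rank: int, ep_size: int,
                     num_logical_experts: int) -> torch.Tensor:
    # Global expert ID -> local expert ID on this rank, or -1 if the expert
    # is not hosted here (contiguous placement, no redundant experts).
    per_rank = num_logical_experts // ep_size
    expert_map = torch.full((num_logical_experts, ), -1, dtype=torch.int32)
    start = ep_rank * per_rank
    expert_map[start:start + per_rank] = torch.arange(per_rank,
                                                      dtype=torch.int32)
    return expert_map


# e.g. rank 15 of 16 with 256 experts hosts global experts 240..255 as
# local experts 0..15; every other entry stays -1.
print(build_expert_map(ep_rank=15, ep_size=16, num_logical_experts=256))
```
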
dsr1 baseline:
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| gsm8k-lite | 7cd45e | accuracy | gen | 100.00 |
dsr1 eplb:
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| gsm8k-lite | 7cd45e | accuracy | gen | 100.00 |
- vLLM version: release/v0.13.0
- vLLM main:
https://github.com/vllm-project/vllm/commit/5fbfa8d9ef15948599631baeb91e8220b2ee9bcc
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>
2025-12-29 09:26:14 +08:00
|
|
|
expert_map=self._expert_map,
|
2025-08-26 19:05:23 +08:00
|
|
|
topk_group=self.topk_group,
|
|
|
|
|
num_expert_group=self.num_expert_group,
|
|
|
|
|
custom_routing_function=self.custom_routing_function,
|
|
|
|
|
scoring_func=self.scoring_func,
|
2025-12-31 17:06:55 +08:00
|
|
|
routed_scaling_factor=self.routed_scaling_factor,
|
2025-08-26 19:05:23 +08:00
|
|
|
e_score_correction_bias=self.e_score_correction_bias,
|
|
|
|
|
activation=self.activation,
|
|
|
|
|
apply_router_weight_on_input=self.apply_router_weight_on_input,
|
2025-10-09 14:12:46 +08:00
|
|
|
enable_force_load_balance=enable_force_load_balance,
|
|
|
|
|
log2phy=self.log2phy,
|
2025-10-22 11:41:30 +08:00
|
|
|
global_redundant_expert_num=self.global_redundant_expert_num,
|
|
|
|
|
mc2_mask=mc2_mask)
|
2025-10-09 14:12:46 +08:00
|
|
|
|
2025-12-31 14:24:37 +08:00
|
|
|
if self.dynamic_eplb:
|
|
|
|
|
expert_tokens = fused_experts_results.expert_tokens
|
|
|
|
|
group_list_type = fused_experts_results.group_list_type
|
|
|
|
|
assert expert_tokens is not None and group_list_type is not None, \
|
|
|
|
|
"expert_tokens and group_list_type should not be None when dynamic_eplb is enabled."
|
2026-01-26 17:18:46 +08:00
|
|
|
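# group_list_type == 1 means expert_tokens already holds per-expert token
# counts; otherwise it is a cumulative sum, so first differences recover the
# per-expert load.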
local_load = expert_tokens if group_list_type == 1 else \
|
2026-01-07 11:26:47 +08:00
|
|
|
torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])
|
2026-01-26 17:18:46 +08:00
|
|
|
self.moe_load.add_(local_load)
|
2025-12-31 14:24:37 +08:00
|
|
|
routed_out = forward_context.moe_comm_method.finalize(
|
|
|
|
|
hidden_states=fused_experts_results.routed_out,
|
2025-10-22 11:41:30 +08:00
|
|
|
reduce_results=self.reduce_results,
|
|
|
|
|
context_metadata=context_metadata)
|
2025-08-26 19:05:23 +08:00
|
|
|
|
2026-01-17 11:53:22 +08:00
|
|
|
if return_with_event:
|
|
|
|
|
return FusedMoEResult(
|
|
|
|
|
routed_out=routed_out,
|
|
|
|
|
before_dispatch_evt=fused_experts_results.before_dispatch_evt,
|
|
|
|
|
before_combine_evt=fused_experts_results.before_combine_evt)
|
|
|
|
|
else:
|
|
|
|
|
# The vLLM FusedMoE forward_impl does not return events.
|
|
|
|
|
return routed_out
|
2025-08-26 19:05:23 +08:00
|
|
|
|
|
|
|
|
|
2025-09-19 19:05:01 +08:00
|
|
|
class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
|
2025-09-09 18:19:56 +08:00
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
|
shared_experts: torch.nn.Module,
|
2025-10-25 15:36:32 +08:00
|
|
|
gate: Optional[torch.nn.Module] = None,
|
2025-09-09 18:19:56 +08:00
|
|
|
use_overlapped: bool = True,
|
2026-02-05 19:31:17 +08:00
|
|
|
routed_input_transform: Optional[torch.nn.Module] = None,
|
2025-09-09 18:19:56 +08:00
|
|
|
**kwargs,
|
|
|
|
|
):
|
2025-09-19 19:05:01 +08:00
|
|
|
AscendFusedMoE.__init__(self, **kwargs)
|
2025-10-25 15:36:32 +08:00
|
|
|
|
2026-02-05 19:31:17 +08:00
|
|
|
if not vllm_version_is("0.15.0"):
|
|
|
|
|
self._routed_input_transform = routed_input_transform
|
2025-09-09 18:19:56 +08:00
|
|
|
self._shared_experts = shared_experts
|
|
|
|
|
self.use_overlapped = use_overlapped
|
2025-09-19 11:06:45 +08:00
|
|
|
self.shared_expert_stream = None
|
|
|
|
|
ascend_config = get_ascend_config()
|
2026-01-29 08:47:20 +08:00
|
|
|
self.multistream_overlap_shared_expert = \
|
|
|
|
|
ascend_config.multistream_overlap_shared_expert and \
|
|
|
|
|
self._shared_experts is not None
|
|
|
|
|
self.multistream_overlap_gate = \
|
|
|
|
|
ascend_config.multistream_overlap_gate and \
|
|
|
|
|
self._shared_experts is not None
|
2025-10-15 19:36:32 +08:00
|
|
|
if enable_sp():
|
|
|
|
|
logger.info_once(
|
|
|
|
|
"Sequence parallelism is enabled, shared experts are replicated for best performance."
|
|
|
|
|
)
|
2025-09-09 18:19:56 +08:00
|
|
|
|
2025-10-25 15:36:32 +08:00
|
|
|
self._gate = gate
|
|
|
|
|
|
2026-01-29 08:47:20 +08:00
|
|
|
if self.multistream_overlap_shared_expert:
|
|
|
|
|
# Wrap the quant_method's process_weights_after_loading to validate that
|
|
|
|
|
# splitting shared expert computation (gate_up projection + activation,
|
|
|
|
|
# then down projection) yields identical results to integrated
|
|
|
|
|
# computation after weight loading.
|
|
|
|
|
original_process_weights = self.quant_method.process_weights_after_loading
|
2026-01-17 11:53:22 +08:00
|
|
|
|
2026-01-29 08:47:20 +08:00
|
|
|
@wraps(original_process_weights)
|
|
|
|
|
def wrapped_process_weights(*args, **kwargs):
|
|
|
|
|
result = original_process_weights(*args, **kwargs)
|
|
|
|
|
self._validate_shared_expert_consistency()
|
|
|
|
|
return result
|
2026-01-17 11:53:22 +08:00
|
|
|
|
2026-01-29 08:47:20 +08:00
|
|
|
self.quant_method.process_weights_after_loading = wrapped_process_weights # type: ignore
|
2026-01-17 11:53:22 +08:00
|
|
|
|
|
|
|
|
def _shared_experts_part1(self, hidden_states: torch.Tensor):
|
|
|
|
|
shared_gate_up, _ = self._shared_experts.gate_up_proj(
|
|
|
|
|
hidden_states) # type: ignore
|
|
|
|
|
return shared_gate_up
|
|
|
|
|
|
|
|
|
|
def _shared_experts_part2(self, hidden_states: torch.Tensor,
|
|
|
|
|
shared_gate_up: torch.Tensor):
|
|
|
|
|
shared_act = self._shared_experts.act_fn(
|
|
|
|
|
shared_gate_up) # type: ignore
|
|
|
|
|
shared_out, _ = self._shared_experts.down_proj(
|
|
|
|
|
shared_act) # type: ignore
|
|
|
|
|
|
|
|
|
|
# Qwen3-Next specific gating mechanism
|
|
|
|
|
if hasattr(self._shared_experts, "expert_gate") and \
|
|
|
|
|
self._shared_experts.expert_gate is not None:
|
2026-01-23 09:45:08 +08:00
|
|
|
gate_out, _ = self._shared_experts.expert_gate(hidden_states) # type: ignore
|
2026-01-17 11:53:22 +08:00
|
|
|
shared_out = F.sigmoid(gate_out) * shared_out
|
|
|
|
|
return shared_out
|
|
|
|
|
|
|
|
|
|
def _validate_shared_expert_consistency(self):
|
|
|
|
|
"""Validate that split shared expert computation matches integrated
|
|
|
|
|
computation."""
|
|
|
|
|
test_input = torch.rand(
|
|
|
|
|
10, self.hidden_size, device='npu', dtype=self.moe_config.in_dtype
|
|
|
|
|
) * 2 - 1 # Random input for testing, scaled to [-1, 1)
|
|
|
|
|
|
|
|
|
|
integrated_out = self._shared_experts(test_input)
|
|
|
|
|
part1_out = self._shared_experts_part1(test_input)
|
|
|
|
|
split_out = self._shared_experts_part2(test_input, part1_out)
|
|
|
|
|
|
|
|
|
|
if not torch.allclose(integrated_out, split_out):
|
|
|
|
|
diff = (integrated_out - split_out).abs()
|
|
|
|
|
logger.error(
|
|
|
|
|
"SharedFusedMoE shared experts split computation does not "
|
|
|
|
|
"match the integrated computation.")
|
|
|
|
|
logger.error(f"Max absolute difference: {diff.max().item()}")
|
|
|
|
|
logger.error("Integrated output - sum: %s, norm: %s",
|
|
|
|
|
integrated_out.sum().item(),
|
|
|
|
|
integrated_out.norm().item())
|
|
|
|
|
logger.error("Split output - sum: %s, norm: %s",
|
|
|
|
|
split_out.sum().item(),
|
|
|
|
|
split_out.norm().item())
|
|
|
|
|
raise ValueError(
|
|
|
|
|
"SharedFusedMoE shared experts split computation does not "
|
|
|
|
|
"match the integrated computation.")
|
|
|
|
|
logger.info_once(
|
|
|
|
|
"SharedFusedMoE shared experts split computation matches the "
|
|
|
|
|
"integrated computation.")
|
|
|
|
|
|
2025-10-25 15:36:32 +08:00
|
|
|
@property
|
|
|
|
|
def gate(self) -> Optional[torch.nn.Module]:
|
|
|
|
|
return self._gate if self.use_overlapped else None
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def is_internal_router(self) -> bool:
|
|
|
|
|
return False
|
|
|
|
|
|
2025-11-26 11:48:58 +08:00
|
|
|
@property
|
|
|
|
|
def use_dp_chunking(self) -> bool:
|
|
|
|
|
"""This func routes to the chunked forward path using the FlashInfer Cutlass kernel
|
|
|
|
|
only when data parallelism (DP) is enabled, so this simply returns False in vllm-ascend.
|
|
|
|
|
"""
|
|
|
|
|
return False
|
|
|
|
|
|
2025-09-09 18:19:56 +08:00
|
|
|
def forward(
|
|
|
|
|
self,
|
|
|
|
|
hidden_states: torch.Tensor,
|
|
|
|
|
router_logits: torch.Tensor,
|
|
|
|
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
2026-02-02 15:57:55 +08:00
|
|
|
if self._shared_experts is None:
|
|
|
|
|
fused_out = AscendFusedMoE.forward(
|
|
|
|
|
self,
|
|
|
|
|
hidden_states=hidden_states,
|
|
|
|
|
router_logits=router_logits,
|
|
|
|
|
)
|
|
|
|
|
shared_out = None
|
|
|
|
|
return shared_out, fused_out
|
2025-09-28 21:31:55 +08:00
|
|
|
shared_out, fused_out = AscendFusedMoE.forward(
|
|
|
|
|
self,
|
|
|
|
|
hidden_states=hidden_states,
|
|
|
|
|
router_logits=router_logits,
|
|
|
|
|
)
|
|
|
|
|
return shared_out, fused_out
|
|
|
|
|
|
2026-01-17 11:53:22 +08:00
|
|
|
def _forward_shared_experts(self, hidden_states: torch.Tensor,
|
|
|
|
|
fused_moe_evts: FusedMoEEvents):
|
2026-01-29 08:47:20 +08:00
|
|
|
if self._shared_experts is None:
|
|
|
|
|
return None
|
2026-01-17 11:53:22 +08:00
|
|
|
|
|
|
|
|
def maybe_wait_event(evt: torch.npu.Event | None):
|
|
|
|
|
if evt is not None:
|
|
|
|
|
torch.npu.current_stream().wait_event(evt)
|
|
|
|
|
|
|
|
|
|
with npu_stream_switch(shared_experts_calculation_stream(),
|
|
|
|
|
enabled=self.multistream_overlap_shared_expert):
|
|
|
|
|
# Ensure the shared experts wait for hidden_states to be ready.
|
|
|
|
|
torch.npu.current_stream().wait_event(
|
|
|
|
|
fused_moe_evts.before_routed_experts)
|
|
|
|
|
# Execute the gate projection and activation concurrently with the
|
|
|
|
|
# dispatch communication.
|
|
|
|
|
maybe_wait_event(fused_moe_evts.before_dispatch)
|
|
|
|
|
part1_out = self._shared_experts_part1(hidden_states)
|
|
|
|
|
# Execute the down projection concurrently with the combine
|
|
|
|
|
# communication.
|
|
|
|
|
maybe_wait_event(fused_moe_evts.before_combine)
|
|
|
|
|
shared_out = self._shared_experts_part2(hidden_states, part1_out)
|
|
|
|
|
|
|
|
|
|
# Make sure the default stream waits for the shared experts stream to
|
|
|
|
|
# finish.
|
|
|
|
|
if self.multistream_overlap_shared_expert:
|
|
|
|
|
torch.npu.current_stream().wait_stream(
|
|
|
|
|
shared_experts_calculation_stream())
|
|
|
|
|
|
|
|
|
|
# NOTE: This is exactly the opposite of
|
|
|
|
|
# `maybe_all_reduce_tensor_model_parallel`
|
|
|
|
|
forward_context = get_forward_context()
|
|
|
|
|
moe_comm_type = forward_context.moe_comm_type
|
|
|
|
|
if moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2, MoECommType.FUSED_MC2} \
|
|
|
|
|
and not shared_expert_dp_enabled():
|
|
|
|
|
shared_out = tensor_model_parallel_all_reduce(shared_out)
|
|
|
|
|
return shared_out
|
|
|
|
|
|
|
|
|
|
def forward_impl( # type: ignore[override]
|
|
|
|
|
self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
|
|
|
|
|
if self.multistream_overlap_gate:
|
2025-12-14 09:34:13 +08:00
|
|
|
set_flash_common3_context(shared_experts=self._shared_experts)
|
2025-09-19 11:06:45 +08:00
|
|
|
|
2026-01-17 11:53:22 +08:00
|
|
|
before_routed_experts = torch.npu.current_stream().record_event()
|
|
|
|
|
fused_moe_results = AscendFusedMoE.forward_impl(
|
2025-09-19 19:05:01 +08:00
|
|
|
self,
|
2025-09-09 18:19:56 +08:00
|
|
|
hidden_states=hidden_states,
|
|
|
|
|
router_logits=router_logits,
|
2026-01-17 11:53:22 +08:00
|
|
|
return_with_event=True,
|
2025-09-09 18:19:56 +08:00
|
|
|
)
|
2026-01-17 11:53:22 +08:00
|
|
|
routed_out = fused_moe_results.routed_out
|
2025-12-14 09:34:13 +08:00
|
|
|
|
2026-02-02 15:57:55 +08:00
|
|
|
if self._shared_experts is None:
|
|
|
|
|
return routed_out
|
|
|
|
|
|
2026-01-17 11:53:22 +08:00
|
|
|
if self.multistream_overlap_gate:
|
2025-12-14 09:34:13 +08:00
|
|
|
fc3_context = get_flash_common3_context()
|
|
|
|
|
assert fc3_context is not None
|
|
|
|
|
shared_out = fc3_context.shared_out
|
2026-01-17 11:53:22 +08:00
|
|
|
else:
|
|
|
|
|
shared_out = self._forward_shared_experts(
|
|
|
|
|
hidden_states,
|
|
|
|
|
FusedMoEEvents(
|
|
|
|
|
before_routed_experts=before_routed_experts,
|
|
|
|
|
before_dispatch=fused_moe_results.before_dispatch_evt,
|
|
|
|
|
before_combine=fused_moe_results.before_combine_evt,
|
|
|
|
|
))
|
2025-12-14 09:34:13 +08:00
|
|
|
|
2025-12-31 14:24:37 +08:00
|
|
|
return shared_out, routed_out
|