[Bugfix] [MoE] fix error in deepseek when using allgather (#3824)
### What this PR does / why we need it? After refactoring vllm_ascend/models and FusedMoE, we are unable to pass `gate` from deepseekv2.py to `AscendFusedMoE.forward`, which will result in error when running deepseek v3/r1 with allgather. Hence, this pr removes `gate` related computations from FusedMoE module in eager/aclgraph mode. ### Does this PR introduce _any_ user-facing change? `rm_router_logits` is deprecated in eager/aclgraph. ### How was this patch tested? e2e & ut - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.1 Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
This commit is contained in:
@@ -48,12 +48,12 @@ from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map,
|
||||
from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
|
||||
from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
|
||||
from vllm_ascend.torchair.ops.sequence_parallel import MetadataForPadding
|
||||
from vllm_ascend.torchair.utils import (npu_stream_switch, npu_wait_tensor,
|
||||
from vllm_ascend.torchair.utils import (get_all_reduce_merge_state,
|
||||
get_rm_router_logits_state,
|
||||
npu_stream_switch, npu_wait_tensor,
|
||||
super_kernel)
|
||||
from vllm_ascend.utils import (AscendSocVersion, dispose_tensor,
|
||||
get_all_reduce_merge_state,
|
||||
get_ascend_soc_version,
|
||||
get_rm_router_logits_state, is_310p,
|
||||
get_ascend_soc_version, is_310p,
|
||||
is_hierarchical_communication_enabled,
|
||||
vllm_version_is)
|
||||
|
||||
|
||||
@@ -15,6 +15,8 @@ try:
|
||||
except ImportError:
|
||||
from torchair.ops import NpuStreamSwitch as _npu_stream_switch
|
||||
from torchair.ops import npu_wait_tensor as _npu_wait_tensor
|
||||
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz
|
||||
|
||||
KV_CACHE_BYTES_CACHE_PATH_NAME = ".kv_cache_bytes"
|
||||
@@ -238,3 +240,33 @@ def torchair_ops_patch():
|
||||
|
||||
def super_kernel(prefix: str, option: str, enabled: bool = True):
|
||||
return _super_kernel(prefix, option) if enabled else nullcontext()
|
||||
|
||||
|
||||
# TODO(ttanzhiqiang): rm_router_logits
|
||||
# dp>1 will trigger
|
||||
# In theory, this solution is only applicable to AllGather and AllGatherEP, because in the dp scenario, the previous operation was gate + two communications, and now it is changed to one communication + gate operation, which can save some communication time. In theory, all moe AllGather and AllGatherEP solutions can follow this logic, but now other moe models (qwen3-235b) dp solutions are not adjusted, so use the switch to control it to prevent code errors.
|
||||
def get_rm_router_logits_state(ep_size: int, dp_size: int,
|
||||
is_deepseek_v3_r1: bool):
|
||||
# the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep
|
||||
# only supports deepseek v3/r1
|
||||
if dp_size > 1:
|
||||
if (envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1
|
||||
and is_deepseek_v3_r1):
|
||||
return True
|
||||
elif ep_size == 1 and is_deepseek_v3_r1:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
# TODO(ttanzhiqiang): all_reduce merge
|
||||
# When all_reduce_merge is in progress, shared_experts does not do all_reduce in mlp, but waits until shared_experts+router_experts are completed before doing all_reduce
|
||||
# Currently, all_reduce_merge is enabled by default in the AllGather, AllGatherEP and NaiveMulticast scenarios of the deepseek model.
|
||||
def get_all_reduce_merge_state(ep_size: int, is_deepseek_v3_r1: bool):
|
||||
# the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep
|
||||
# only supports deepseek v3/r1
|
||||
if (envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1
|
||||
and is_deepseek_v3_r1):
|
||||
return True
|
||||
elif ep_size == 1 and is_deepseek_v3_r1:
|
||||
return True
|
||||
return False
|
||||
|
||||
Reference in New Issue
Block a user