add llama4

2026-02-11 17:25:38 +08:00
parent 7b4f7d74c3
commit dd221f3084
2 changed files with 21 additions and 0 deletions
--- a/vllm-v0.6.2/vllm_mlu/vllm_mlu/model_executor/layers/sparse_moe_mlp.py
+++ b/vllm-v0.6.2/vllm_mlu/vllm_mlu/model_executor/layers/sparse_moe_mlp.py
@@ -73,6 +73,24 @@ class SparseMoeMlp(nn.Module):
        self.expert_group = expert_group
        self.topk_group = topk_group
        if get_device_major_capability() == 3:
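+            # WARNING: MLU370 (capability == 3) does not support the fused_moe operator, so it is force-disabled.
+            #
+            # Background: the original forward_experts_nofused used torch.unique, torch.tensor([0], ...),
+            # data-dependent branches and other operations that are incompatible with graph capture, so
+            # every MoE model going through SparseMoeMlp on MLU370 had to run with --enforce-eager.
+            # forward_experts_nofused has now been switched to a dense mode (every expert processes all
+            # tokens, masked by the routing weights), which fixes graph-capture compatibility; no MoE model needs --enforce-eager any more.
+            #
+            # Performance cost: dense mode does O(num_experts * num_tokens) work, versus
+            # O(topk * num_tokens) for sparse routing, i.e. num_experts/topk times more. Prefill becomes
+            # noticeably slower for models with many experts; the impact on decode (few tokens) is negligible.
+            # Known affected models: Mixtral (8), Qwen2-MoE (60), HunYuan (16), Llama4 (16), etc.
+            # DeepSeek V2/V3 are unaffected (they have a separate MLU MoE hijack implementation).
+            #
+            # TODO: MLU370 already has the complete MoE operator chain (moe_gen_idx, moe_expand_input,
+            # group_gemm, moe_active, moe_combine_result), the same operators that forward_group_experts
+            # uses. The is_use_fused_moe flag should later be split so that MLU370 takes the
+            # forward_group_experts path and avoids the dense-mode performance overhead.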
            self.is_use_fused_moe = False

        if params_dtype is None: