support deepseek quant & mix-parallel with graphmode (#585)

### What this PR does / why we need it?
1. Support DeepSeek with W8A8 quantization.
2. Support DeepSeek with mixed parallelism (multi-DP, EP + TP).
3. Support DeepSeek with graph mode.
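
For reference, a minimal sketch of how the new decode-time MC2 path is selected, based on the `VLLM_ENABLE_MC2` check in `AscendW8A8DynamicFusedMoEMethod.apply` below (the helper name here is illustrative only):

```python
import os

def choose_moe_kernel(is_prefill: bool) -> str:
    # The MC2 all-to-all path (npu_moe_distribute_dispatch/combine) is only
    # used for decode steps and only when VLLM_ENABLE_MC2=1 is set.
    if os.environ.get("VLLM_ENABLE_MC2", "0") == "1" and not is_prefill:
        return "fused_experts_with_mc2"
    # Otherwise fall back to the default grouped-matmul expert path.
    return "fused_experts"
```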
---------

Signed-off-by: wen-jie666 <wenjie39@huawei.com>
Signed-off-by: Yizhou Liu <liuyizhou5@h-partners.com>
Signed-off-by: libaokui <libaokui@huawei.com>
Signed-off-by: linfeng-yuan <1102311262@qq.com>
Co-authored-by: wen-jie666 <wenjie39@huawei.com>
Commit 5c6d05a59e by zzzzwwjj, committed via GitHub on 2025-04-23 16:23:25 +08:00 (parent e74331a1ed)
13 changed files with 520 additions and 221 deletions


@@ -15,14 +15,183 @@
# limitations under the License.
#
import os
from typing import Any, Callable, Dict, Optional
import torch
import torch_npu
from vllm_ascend.distributed.parallel_state import get_ep_group
from vllm_ascend.ops.fused_moe import select_experts
def apply_mlp(x: torch.Tensor,
w1: torch.Tensor,
w1_scale: torch.Tensor,
w2: torch.Tensor,
w2_scale: torch.Tensor,
group_list: torch.Tensor,
dynamic_scale: torch.Tensor = None,
group_list_type: int = 1) -> torch.Tensor:
"""
apply MLP: gate_up_proj -> swiglu -> down_proj
Args:
x: input hidden states with shape (num_tokens, hidden_size).
w1: expert weights1 with shape
(num_experts, hidden_size, intermediate_size * 2)
w1_scale: weights1 scale with shape (num_experts, intermediate_size * 2)
w2: expert weights2 with shape
(num_experts, intermediate_size, hidden_size)
w2_scale: weights2 scale with shape (num_experts, hidden_size)
group_list: number of tokens for each expert, follow cumsum mode, and
with shape (num_experts).
transpose_weight:
w1: (num_experts, intermediate_size * 2, hidden_size) ->
(num_experts, hidden_size, intermediate_size * 2)
w2: (num_experts, hidden_size, intermediate_size) ->
(num_experts, intermediate_size, hidden_size)
Returns:
hidden_states: output hidden states after MLP.
"""
if dynamic_scale is None:
h, pertoken_scale = torch_npu.npu_dynamic_quant(x)
else:
h = x
pertoken_scale = dynamic_scale
output_dtype = torch.bfloat16 if w1_scale.dtype == torch.bfloat16 else \
torch.float16
# gmm1: gate_up_proj
gate_up_out_list = torch_npu.npu_grouped_matmul(
x=[h],
weight=[w1],
scale=[w1_scale],
per_token_scale=[pertoken_scale],
split_item=3,
group_list_type=group_list_type,
group_type=0,
group_list=group_list,
output_dtype=output_dtype)
gate_up_out = gate_up_out_list[0]
# swiglu
swiglu_out = torch_npu.npu_swiglu(gate_up_out)
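# Re-quantize the swiglu output per token so that down_proj can also run as an
# int8 grouped matmul with per-token scales.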
swiglu_out, swiglu_out_scale = torch_npu.npu_dynamic_quant(swiglu_out)
# down_proj
down_out_list = torch_npu.npu_grouped_matmul(
x=[swiglu_out],
weight=[w2],
scale=[w2_scale],
per_token_scale=[swiglu_out_scale],
split_item=3,
group_list_type=group_list_type,
group_type=0,
group_list=group_list,
output_dtype=output_dtype)
return down_out_list[0]
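# Illustration of the two group_list encodings accepted by apply_mlp (values
# are made up): if three experts receive [3, 0, 2] tokens,
#   group_list_type == 1 (counts): group_list = [3, 0, 2]
#   group_list_type == 0 (cumsum): group_list = [3, 3, 5]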
def fused_experts_with_mc2(
hidden_states: torch.Tensor,
w1: torch.Tensor,
w2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
top_k: int,
expert_map: torch.Tensor = None,
moe_all_to_all_group_name: str = "",
) -> torch.Tensor:
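# MC2 fused-experts flow: (1) npu_moe_distribute_dispatch performs the
# expert-parallel all-to-all (with per-token quantization), (2) apply_mlp runs
# the grouped gate_up -> swiglu -> down projections on the local experts, and
# (3) npu_moe_distribute_combine sends the weighted results back to their
# source ranks.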
global_bs = 0
moe_expert_num = len(expert_map)
# hidden_states = hidden_states.bfloat16()
kwargs = {
"x": hidden_states,
"expert_ids": topk_ids,
"expert_shard_type": 0,
"shared_expert_rank_num": 0,
"moe_expert_num": moe_expert_num,
"global_bs": global_bs,
}
rank = torch.distributed.get_rank()
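# quant_mode = 2 enables dynamic per-token quantization inside the dispatch op
# (it then returns dynamic_scale); 0 disables quantization, which is why
# dynamic_scale is cleared below in that case.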
quant_mode = 2
ep_group = get_ep_group().device_group
local_rank = torch.distributed.get_rank(group=ep_group)
all_to_all_group_size = torch.distributed.get_world_size(ep_group)
world_size = torch.distributed.get_world_size()
tp_size = world_size // all_to_all_group_size
tp_rank = rank % tp_size
stage1_kwargs = {
"scales": None,
"quant_mode": quant_mode,
"group_ep": moe_all_to_all_group_name,
"ep_world_size": all_to_all_group_size,
"ep_rank_id": local_rank,
# "group_tp": self.moe_rs_group_name,
"group_tp": moe_all_to_all_group_name,
"tp_world_size": tp_size,
"tp_rank_id": tp_rank,
}
kwargs.update(stage1_kwargs)
output = torch_npu.npu_moe_distribute_dispatch(**kwargs)
# comm_stream.wait_stream(torch.npu.current_stream())
expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[
0:5]
if quant_mode == 0:
dynamic_scale = None
down_out_list = apply_mlp(expand_x,
w1,
w1_scale,
w2,
w2_scale,
expert_token_nums,
dynamic_scale=dynamic_scale)
# moeCombine
kwargs = {
"expand_x": down_out_list,
"expert_ids": topk_ids,
"expand_idx": expand_idx,
"expert_scales": topk_weights.to(torch.float32),
"expert_shard_type": 0,
"shared_expert_rank_num": 0,
"moe_expert_num": moe_expert_num,
"global_bs": 0,
}
tp_recv_counts = torch.empty(1,
dtype=torch.int32,
device=hidden_states.device)
stage3_kwargs = {
"ep_send_counts": ep_recv_counts,
"group_ep": moe_all_to_all_group_name,
"ep_world_size": all_to_all_group_size,
"ep_rank_id": local_rank,
"tp_send_counts": tp_recv_counts,
# "group_tp": self.moe_rs_group_name,
"group_tp": moe_all_to_all_group_name,
"tp_world_size": tp_size,
"tp_rank_id": tp_rank,
}
kwargs.update(stage3_kwargs)
hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs)
return hidden_states
def fused_experts(hidden_states: torch.Tensor,
w1: torch.Tensor,
w1_scale: torch.Tensor,
@@ -75,11 +244,10 @@ def fused_experts(hidden_states: torch.Tensor,
dtype=torch.int64)
ones = torch.ones_like(filtered_experts, dtype=torch.int64)
token_counts.scatter_add_(0, filtered_experts.to(torch.int64), ones)
token_counts = token_counts[:num_experts]
expert_tokens = torch.cumsum(token_counts, dim=0, dtype=torch.int64)
expert_tokens = token_counts[:num_experts]
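# token_counts already holds per-expert token counts here, so the count-style
# group_list (group_list_type = 1 below) is passed to apply_mlp directly.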
# Rearrange hidden_states
sorted_hidden_states = hidden_states[sorted_token_indices]
group_list_type = 1
else:
row_idx_len = num_tokens * top_k
row_idx = torch.arange(0,
@@ -97,46 +265,15 @@ def fused_experts(hidden_states: torch.Tensor,
expert_tokens = torch_npu.npu_moe_compute_expert_tokens(
expanded_expert_idx, num_experts)
expert_tokens = expert_tokens.to(torch.int64)
group_list_type = 0
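# npu_moe_compute_expert_tokens returns cumulative token counts, hence the
# cumsum-style group_list (group_list_type = 0) on this branch.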
quant_x, x_dynamic_scale = torch_npu.npu_dynamic_quant(
sorted_hidden_states)
del sorted_hidden_states
output_dtype = torch.bfloat16 if w1_scale.dtype == torch.bfloat16 else torch.float16
gate_up_out_list = torch_npu.npu_grouped_matmul(
x=[quant_x],
weight=[w1],
scale=[w1_scale],
per_token_scale=[x_dynamic_scale],
split_item=2,
group_list_type=0,
group_type=0,
group_list=expert_tokens,
output_dtype=output_dtype)
del quant_x
gate_up_out_list = gate_up_out_list[0] if len(
gate_up_out_list) == 1 else torch.cat(gate_up_out_list, dim=0)
gate_up_out_list = torch_npu.npu_swiglu(gate_up_out_list)
quant_gate_up_out_list, gate_up_out_dynamic_scale = torch_npu.npu_dynamic_quant(
gate_up_out_list)
del gate_up_out_list
down_out_list = torch_npu.npu_grouped_matmul(
x=[quant_gate_up_out_list],
weight=[w2],
scale=[w2_scale],
per_token_scale=[gate_up_out_dynamic_scale],
split_item=2,
group_list_type=0,
group_type=0,
group_list=expert_tokens,
output_dtype=output_dtype)
del quant_gate_up_out_list
down_out_list = down_out_list[0] if len(down_out_list) == 1 else torch.cat(
down_out_list, dim=0)
down_out_list = apply_mlp(sorted_hidden_states,
w1,
w1_scale,
w2,
w2_scale,
expert_tokens,
group_list_type=group_list_type)
if expert_map is not None:
weighted_down_out = down_out_list * sorted_weights.unsqueeze(1)
@@ -144,12 +281,18 @@ def fused_experts(hidden_states: torch.Tensor,
final_hidden_states = torch.zeros(*original_shape,
device=hidden_states.device,
dtype=dtype)
final_hidden_states.index_add_(0, sorted_token_indices,
weighted_down_out)
# TODO: This should not happen! Look into it!
# fill nan with 0.0
final_hidden_states[torch.isnan(final_hidden_states)] = 0.0
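# Tokens whose experts are not in the local expert_map were sorted to the
# tail; zero out those padding rows so they do not contribute to
# final_hidden_states in the index_add_ below.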
num_valid_tokens = mask.sum()
valid_token_mask = torch.arange(
0, sorted_token_indices.shape[0],
device=device).unsqueeze(1) < num_valid_tokens
valid_output = torch.where(
valid_token_mask, weighted_down_out,
torch.zeros_like(weighted_down_out)).to(dtype)
final_hidden_states.index_add_(0, sorted_token_indices, valid_output)
else:
# TODO: Reorder device memory 2 times here, replace the current
# implementation here when suitable operators become available.
final_hidden_states = torch_npu.npu_moe_finalize_routing(
down_out_list,
skip1=None,
@@ -157,7 +300,8 @@ def fused_experts(hidden_states: torch.Tensor,
bias=None,
scales=topk_weights,
expanded_src_to_dst_row=expanded_row_idx,
export_for_source_row=topk_ids)
export_for_source_row=topk_ids,
)
del down_out_list
if len(original_shape) == 3:
final_hidden_states = final_hidden_states.view(original_shape)
@@ -230,6 +374,18 @@ class AscendW8A8DynamicFusedMoEMethod:
def __init__(self):
self.transpose_weight = True
ep_group = get_ep_group()
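# Resolve the HCCL communicator name of the expert-parallel group; it is
# passed as group_ep/group_tp to npu_moe_distribute_dispatch/combine in
# fused_experts_with_mc2. Fall back to "" when the attribute is unavailable.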
try:
device_group = ep_group.device_group
# TODO: Try local_rank = ep_group.rank_in_group
local_rank = torch.distributed.get_rank(group=device_group)
backend = device_group._get_backend(torch.device("npu"))
self.moe_all_to_all_group_name = backend.get_hccl_comm_name(
local_rank)
except AttributeError:
self.moe_all_to_all_group_name = ""
@staticmethod
def get_weight(num_experts: int, intermediate_size_per_partition: int,
hidden_sizes: int,
@@ -272,48 +428,78 @@ class AscendW8A8DynamicFusedMoEMethod:
dtype=params_dtype)
return param_dict
@staticmethod
def apply(
self,
layer: torch.nn.Module,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
renormalize: bool,
use_grouped_topk: bool = False,
topk_group: Optional[int] = None,
num_expert_group: Optional[int] = None,
global_num_experts: int = -1,
expert_map: Optional[torch.Tensor] = None,
topk_group: Optional[int] = None,
num_expert_group: Optional[int] = None,
custom_routing_function: Optional[Callable] = None,
scoring_func: str = "softmax",
e_score_correction_bias: Optional[torch.Tensor] = None,
is_prefill: bool = True,
**kwargs,
) -> torch.Tensor:
assert router_logits.shape[
1] == global_num_experts, "Number of global experts mismatch"
topk_weights, topk_ids = select_experts(
hidden_states=x,
router_logits=router_logits,
top_k=top_k,
use_grouped_topk=use_grouped_topk,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
e_score_correction_bias=e_score_correction_bias,
)
# NOTE: npu_moe_gating_top_k currently only supports the `group_count=256` pattern
if global_num_experts == 256:
topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
router_logits,
k=top_k, # currently set to 8
bias=e_score_correction_bias,
k_group=topk_group, # fixed: 4
group_count=num_expert_group, # fixed: 8
group_select_mode=1, # 0: max score in group; 1: sum of top-2 scores (fixed)
renorm=0, # 0: softmax then topk (fixed); 1: topk then softmax
norm_type=1, # 0: softmax; 1: sigmoid (fixed)
# out_flag=False, # TODO: new API; whether to output the third result
# y2_flag=False, # old API; whether to output the third result
routed_scaling_factor=1,
eps=float(1e-20))
else:
topk_weights, topk_ids = select_experts(
hidden_states=x,
router_logits=router_logits,
top_k=top_k,
use_grouped_topk=use_grouped_topk,
renormalize=renormalize,
topk_group=topk_group,
num_expert_group=num_expert_group,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
e_score_correction_bias=e_score_correction_bias,
)
return fused_experts(hidden_states=x,
w1=layer.w13_weight,
w1_scale=layer.w13_weight_scale,
w2=layer.w2_weight,
w2_scale=layer.w2_weight_scale,
topk_weights=topk_weights,
topk_ids=topk_ids,
top_k=top_k,
expert_map=expert_map)
if os.environ.get("VLLM_ENABLE_MC2", '0') == "1" and not is_prefill:
return fused_experts_with_mc2(
hidden_states=x,
w1=layer.w13_weight,
w2=layer.w2_weight,
w1_scale=layer.w13_weight_scale,
w2_scale=layer.w2_weight_scale,
topk_weights=topk_weights,
topk_ids=topk_ids,
top_k=top_k,
expert_map=expert_map,
moe_all_to_all_group_name=self.moe_all_to_all_group_name)
else:
return fused_experts(hidden_states=x,
w1=layer.w13_weight,
w1_scale=layer.w13_weight_scale,
w2=layer.w2_weight,
w2_scale=layer.w2_weight_scale,
topk_weights=topk_weights,
topk_ids=topk_ids,
top_k=top_k,
expert_map=expert_map)
def process_weights_after_loading(self, layer):
if self.transpose_weight: