[EPLB][Ops] Integrate grouped_matmul_swiglu_quant_weight_nz_tensor_list operator into dynamic EPLB (#4216)
### What this PR does / why we need it?
Integrates the `grouped_matmul_swiglu_quant_weight_nz_tensor_list` operator into dynamic EPLB to support list-type parameters. This PR also modifies the model-loading logic in the dynamic-EPLB scenario.

The operator is based on this PR: https://github.com/vllm-project/vllm-ascend/pull/3804

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
```
vllm serve /home/weight/DeepSeek-V3.1_w8a8mix_mtp \
  --max_num_seqs 8 \
  --max-model-len 8192 \
  --max-num-batched-tokens 16384 \
  --tensor-parallel-size 8 \
  --data-parallel-size 2 \
  --enable-expert-parallel \
  --served-model-name ds_r1 \
  --enable-auto-tool-choice \
  --tool-call-parser hermes \
  --no-enable-prefix-caching \
  --port 8999 \
  --quantization "ascend" \
  --gpu-memory-utilization 0.85 \
  --trust-remote-code \
  --compilation_config '{"cudagraph_capture_sizes":[1,2,4,8,16,32]}' \
  --additional-config='{"dynamic_eplb":true, "num_iterations_eplb_update":100, "num_wait_worker_iterations":100}'
```
Input & output: 2k / 2k

This PR:
<img width="1318" height="695" alt="fusion" src="https://github.com/user-attachments/assets/f8657813-0c02-42f4-8396-d99e730f48cd" />

Baseline:
<img width="1323" height="690" alt="baseline" src="https://github.com/user-attachments/assets/e1323a78-af26-4523-820c-e20e5642a38e" />

- vLLM version: v0.11.2
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.2

---------

Signed-off-by: 白永斌 <baiyongbin3@h-partners.com>
Signed-off-by: 欧派果奶我还要 <845473182@qq.com>
Co-authored-by: 白永斌 <baiyongbin3@h-partners.com>
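For reviewers: the heart of the change is which parameter layout the EPLB adaptor indexes. Below is a minimal sketch of the two layouts, with made-up sizes (real shapes come from the model config); it only illustrates why the indexing differs, it is not code from this PR.

```python
import torch

# Made-up sizes for illustration only.
num_experts, hidden, inter = 4, 16, 32

# Stacked layout (classic w13_weight): one parameter with the expert
# dimension first; a single expert is a slice of .data.
w13_weight = torch.nn.Parameter(
    torch.empty(num_experts, 2 * inter, hidden), requires_grad=False)
expert_0 = w13_weight.data[0]

# List layout (w13_weight_list): one tensor per expert, as the
# grouped_matmul_swiglu_quant_weight_nz_tensor_list operator consumes
# them; a single expert is plain list indexing, no .data slice.
w13_weight_list = [torch.empty(2 * inter, hidden) for _ in range(num_experts)]
expert_0 = w13_weight_list[0]
```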
```diff
@@ -44,11 +44,22 @@ class VllmEplbAdaptor(EplbAdaptor):
         self.init_redundancy_expert = get_ascend_config(
         ).init_redundancy_expert
 
+        for i in range(self.num_dense_layers,
+                       self.model.config.num_hidden_layers):
+            self.param_dict["model.layers." + str(i) + ".mlp.experts." + "w13_weight_list"] = \
+                self.model.model.layers[i].mlp.experts.w13_weight_list
+            self.param_dict["model.layers." + str(i) + ".mlp.experts." + "w2_weight_list"] = \
+                self.model.model.layers[i].mlp.experts.w2_weight_list
+            self.param_dict["model.layers." + str(i) + ".mlp.experts." + "w13_weight_scale_fp32_list"] = \
+                self.model.model.layers[i].mlp.experts.w13_weight_scale_fp32_list
+            self.param_dict["model.layers." + str(i) + ".mlp.experts." + "w2_weight_scale_list"] = \
+                self.model.model.layers[i].mlp.experts.w2_weight_scale_list
         # TODO: init self.expert_weight_names depending on different model types, only deepseek v3 w8a8 and qwen3-moe is supported here
         if self.model.quant_config is not None:
             self.expert_weight_names = [
-                "w13_weight", "w2_weight", "w13_weight_scale",
-                "w13_weight_offset", "w2_weight_scale", "w2_weight_offset"
+                "w13_weight_list", "w2_weight_list",
+                "w13_weight_scale_fp32_list", "w13_weight_offset",
+                "w2_weight_scale_list", "w2_weight_offset"
             ]
         else:
             self.expert_weight_names = ["w13_weight", "w2_weight"]
```
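The hunk above registers the list-type parameters under the same dotted-key scheme the adaptor already uses for stacked weights. A self-contained sketch of that key scheme, with dummy layer counts (the real values come from the model config):

```python
# Dummy layer counts for illustration.
num_dense_layers, num_hidden_layers = 3, 6

param_dict = {}
for i in range(num_dense_layers, num_hidden_layers):
    for name in ("w13_weight_list", "w2_weight_list",
                 "w13_weight_scale_fp32_list", "w2_weight_scale_list"):
        # Same key format as the diff: "model.layers.<i>.mlp.experts.<name>"
        param_dict["model.layers." + str(i) + ".mlp.experts." + name] = []

print(sorted(param_dict))  # e.g. model.layers.3.mlp.experts.w13_weight_list, ...
```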
```diff
@@ -84,9 +95,14 @@ class VllmEplbAdaptor(EplbAdaptor):
             for name in self.expert_weight_names:
                 complete_name = "model.layers." + str(
                     self.num_dense_layers) + ".mlp.experts." + name
-                expert_tensor = self.param_dict[complete_name].data[0]
-                if name in ["w13_weight", "w2_weight"]:
+                if name in [
+                        "w13_weight_list", "w2_weight_list",
+                        "w13_weight_scale_fp32_list", "w2_weight_scale_list"
+                ]:
+                    expert_tensor = self.param_dict[complete_name][0]
                     expert_tensor = expert_tensor.clone()
+                else:
+                    expert_tensor = self.param_dict[complete_name][0].data[0]
                 buffer_tensor = torch.empty_like(expert_tensor)
                 self.buffer_tensor_list[buffer_id].append(buffer_tensor)
 
```
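The buffer-allocation hunk treats the two storage forms differently: list-type entries hold one plain tensor per expert, while the remaining quantized parameters stay wrapped (a one-element list around a `Parameter`, per the indexing above) and are sliced via `.data`. A standalone sketch of both branches, with fabricated names and shapes:

```python
import torch

LIST_TYPE_NAMES = {"w13_weight_list", "w2_weight_list",
                   "w13_weight_scale_fp32_list", "w2_weight_scale_list"}

# Fake param_dict mirroring the two storage forms; shapes are fabricated.
param_dict = {
    "model.layers.3.mlp.experts.w13_weight_list":
        [torch.empty(64, 16) for _ in range(4)],
    "model.layers.3.mlp.experts.w13_weight_offset":
        [torch.nn.Parameter(torch.empty(4, 64, 1), requires_grad=False)],
}

buffer_tensors = []
for name in ("w13_weight_list", "w13_weight_offset"):
    complete_name = "model.layers.3.mlp.experts." + name
    if name in LIST_TYPE_NAMES:
        expert_tensor = param_dict[complete_name][0].clone()  # expert 0's tensor
    else:
        expert_tensor = param_dict[complete_name][0].data[0]  # expert 0's slice
    # Staging buffer with the same shape/dtype as one expert's tensor.
    buffer_tensors.append(torch.empty_like(expert_tensor))
```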
```diff
@@ -97,12 +113,23 @@ class VllmEplbAdaptor(EplbAdaptor):
         layer_idx = self.num_dense_layers + moe_layer_id
         self.expert_param_per_layer[layer_idx] = list()
         for local_expert_id in range(num_local_expert):
-            self.expert_param_per_layer[layer_idx].append([
-                self.param_dict["model.layers." + str(layer_idx) +
-                                ".mlp.experts." +
-                                name].data[local_expert_id]
-                for name in self.expert_weight_names
-            ])
+            per_expert_param = list()
+            for name in self.expert_weight_names:
+                if name in [
+                        "w13_weight_list", "w2_weight_list",
+                        "w13_weight_scale_fp32_list",
+                        "w2_weight_scale_list"
+                ]:
+                    per_expert_param.append(
+                        self.param_dict["model.layers." + str(layer_idx) +
+                                        ".mlp.experts." +
+                                        name][local_expert_id])
+                else:
+                    per_expert_param.append(
+                        self.param_dict["model.layers." + str(layer_idx) +
+                                        ".mlp.experts." +
+                                        name][0].data[local_expert_id])
+            self.expert_param_per_layer[layer_idx].append(per_expert_param)
 
     def get_rank_expert_workload(self) -> torch.Tensor:
         self.moe_load = self.model.get_all_moe_loads()
```
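Once built, `expert_param_per_layer[layer][expert]` is the group of tensors that make up one local expert, so a rebalance can move an expert by copying the group tensor-by-tensor. A hedged sketch of that consumption (the `copy_` transfer is an assumption for illustration; the actual transfer step is not part of this diff):

```python
import torch

# Two toy experts in layer 3, each a [weight, scale] group.
expert_param_per_layer = {
    3: [[torch.zeros(2, 2), torch.zeros(2)],   # expert 0
        [torch.ones(2, 2), torch.ones(2)]]     # expert 1
}

src = expert_param_per_layer[3][1]
dst = expert_param_per_layer[3][0]
for dst_t, src_t in zip(dst, src):
    dst_t.copy_(src_t)  # expert 0 now holds expert 1's tensors
```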