2025-09-17 10:36:43 +08:00
|
|
|
#
|
|
|
|
|
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
# This file is a part of the vllm-ascend project.
|
|
|
|
|
#
|
|
|
|
|
# TODO: Remove this adaptor once https://github.com/vllm-project/vllm/issues/22246 is merged in vllm.
|
|
|
|
|
import json
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
import torch
|
|
|
|
|
import torch.distributed as dist
|
|
|
|
|
from vllm.logger import logger
|
|
|
|
|
|
2026-03-09 11:26:57 +08:00
|
|
|
import vllm_ascend.envs as envs_ascend
|
|
|
|
|
from vllm_ascend.quantization.methods.base import QuantType
|
|
|
|
|
|
2025-09-17 10:36:43 +08:00
|
|
|
|
[EPLB] Avoiding eplb's dependency on a specified model (#6528)
### What this PR does / why we need it?
1. Currently, eplb registers different attributes for different models,
but these attributes are never actually used. This PR removes them
entirely.
2. Add some logging for eplb.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
#### Deepseek v3.1 chat
Of course! Here is a comprehensive explanation of deep learning, broken
down for clarity.\n\n### The Simple Analogy: A Child Learning to
Recognize a Cat\n\nImagine teaching a child what a cat is. You don't
give them a rulebook with instructions like \"has pointy ears, whiskers,
and a tail.\" Instead, you show them many pictures, saying \"this is a
cat\" or \"this is not a cat.\" The child's brain gradually learns to
identify the complex patterns—the combination of shapes, colors, and
textures—that define \"cat-ness.\"\n\n**Deep learning is essentially
this, but for computers.** It's a method for teaching computers to learn
from examples and recognize patterns directly from data (like images,
sound, or text) without being explicitly programmed with rigid
rules.\n\n---\n\n### The Technical Definition\n\n**Deep Learning is a
subfield of machine learning, which itself is a subfield of artificial
intelligence (AI).** It uses artificial **neural networks** with many
layers (\"deep\" networks) to model and understand complex patterns in
data.\n\nHere are the key concepts in that definition:\n\n1.
**Artificial Intelligence (AI):** The broad science of making machines
smart and capable of performing tasks that typically require human
intelligence.\n2. **Machine Learning (ML):** A subset of AI that gives
computers the ability to learn from data *without* being explicitly
programmed for every single rule.\n3. **Deep Learning (DL):** A
specific, powerful
- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
2026-02-10 15:58:44 +08:00
|
|
|
class VllmEplbAdaptor:
    """Adaptor exposing a vLLM model's MoE expert state to EPLB.

    On construction the adaptor collects, for every MoE layer, references to
    the per-expert weight tensors, allocates staging buffers shaped like the
    weights of a single expert, and fetches each MoE layer's log2phy map from
    the model.  The ``do_update_*`` methods then overwrite those tensors in
    place with ``copy_``.
    """

    def __init__(self, model, **args):
        """Collect expert parameters, staging buffers and maps from ``model``.

        Args:
            model: Model object.  The code reads ``model.config``,
                ``model.quant_config``, ``model.model.layers[i].mlp.experts``
                and calls ``model.get_log2phy_map(layer_idx)``.
            **args: Forwarded unchanged to ``super().__init__``.
        """
        super().__init__(**args)
        self.model = model
        self.rank_id = dist.get_rank()
        self.world_size = dist.get_world_size()

        # Layers [0, num_dense_layers) are treated as dense; every remaining
        # layer up to num_hidden_layers is treated as a MoE layer.
        self.num_dense_layers = getattr(self.model.config, "first_k_dense_replace", 0)
        self.num_moe_layers = self.model.config.num_hidden_layers - self.num_dense_layers

        # copy of expert map on CPU to avoid device synchronize frequently
        self.expert_map_per_layer_cpu = {}

        self.num_local_experts = self.model.model.layers[-1].mlp.experts.local_num_experts
        self.expert_param_per_layer = {}
        self.init_expert_param_per_layer()

        # One staging buffer set per local expert.
        num_buffer_tensor = self.num_local_experts
        self.buffer_tensor_list: list[list[Any]] = [[] for _ in range(num_buffer_tensor)]
        self.init_buffer_tensor(num_buffer_tensor)

        self.log2phy_map_per_layer = {}
        for layer_idx in range(self.num_moe_layers):
            self.log2phy_map_per_layer[self.num_dense_layers + layer_idx] = self.model.get_log2phy_map(
                self.num_dense_layers + layer_idx
            )

    def init_buffer_tensor(self, num_buffer_tensor):
        """Fill ``self.buffer_tensor_list`` with uninitialised staging tensors.

        Each of the first ``num_buffer_tensor`` entries receives one
        ``torch.empty_like`` tensor per name in ``self.expert_weight_names``,
        modelled on expert 0 of the first MoE layer.

        Args:
            num_buffer_tensor: Number of buffer sets to populate.
        """
        # The template tensors do not depend on the buffer index, so look them
        # up once instead of once per buffer.
        prefix = f"model.layers.{self.num_dense_layers}.mlp.experts."
        templates = [self.param_dict[prefix + name][0] for name in self.expert_weight_names]
        for buffer_id in range(num_buffer_tensor):
            for expert_tensor in templates:
                self.buffer_tensor_list[buffer_id].append(torch.empty_like(expert_tensor))

    def init_expert_param_per_layer(self):
        """Index the expert weight tensors of every MoE layer.

        Populates:
            * ``self.expert_weight_names`` - attribute names read from each
              layer's ``mlp.experts`` object,
            * ``self.param_dict`` - ``"model.layers.<i>.mlp.experts.<name>"``
              mapped to that attribute,
            * ``self.expert_param_per_layer[layer][local_expert]`` - list of
              that expert's entries, ordered like ``expert_weight_names``.

        Raises:
            ValueError: If the model has a quant config and the first MoE
                layer's ``quant_type`` is not ``QuantType.W8A8``.
        """
        self.param_dict = {}
        if self.model.quant_config is not None:
            quant_type = self.model.model.layers[self.num_dense_layers].mlp.experts.quant_type
            if quant_type == QuantType.W8A8:
                self.expert_weight_names = [
                    "w13_weight_list",
                    "w2_weight_list",
                    "w13_weight_scale_fp32_list",
                    "w2_weight_scale_list",
                ]
                if envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 1:
                    self.expert_weight_names.append("fused_w1_scale_list")
                    self.expert_weight_names.append("fused_w2_scale_list")
            else:
                raise ValueError(f"EPLB not support {quant_type}")
        else:
            self.expert_weight_names = ["w13_weight", "w2_weight"]

        for layer_idx in range(self.num_dense_layers, self.model.config.num_hidden_layers):
            self.expert_param_per_layer[layer_idx] = []
            for name in self.expert_weight_names:
                param_key = f"model.layers.{layer_idx}.mlp.experts.{name}"
                param_value = getattr(self.model.model.layers[layer_idx].mlp.experts, name)
                self.param_dict[param_key] = param_value
            for local_expert_id in range(self.num_local_experts):
                per_expert_param = [
                    self.param_dict[f"model.layers.{layer_idx}.mlp.experts.{name}"][local_expert_id]
                    for name in self.expert_weight_names
                ]
                self.expert_param_per_layer[layer_idx].append(per_expert_param)

    def get_rank_expert_workload(self) -> torch.Tensor:
        """Return ``self.model.get_all_moe_loads()`` and cache it on ``self.moe_load``."""
        self.moe_load = self.model.get_all_moe_loads()
        return self.moe_load

    def _export_tensor_to_file(self, expert_maps, expert_map_record_path: str):
        """Dump the expert placement described by ``expert_maps`` as JSON.

        Only rank 0 writes; every other rank returns without doing anything.

        Args:
            expert_maps: Tensor iterated as ``[layer][device][position]`` whose
                values are local expert slots.  ``max() + 1`` is taken as the
                number of local slots per device.
                NOTE(review): ``position`` is presumably the global expert
                id - confirm against the producer of this tensor.
            expert_map_record_path: Destination file path.

        Raises:
            ValueError: If some device row does not contain every local slot
                in ``range(max + 1)``.
        """
        if self.rank_id != 0:
            return

        # Convert once to a plain int rather than feeding a 0-dim tensor to range().
        num_local_experts = int(expert_maps.max()) + 1

        layer_list: list[dict[str, Any]] = []
        for layer_idx, layer_data in enumerate(expert_maps.tolist()):
            device_list = []
            for device_idx, experts in enumerate(layer_data):
                # Invert the row in a single pass; setdefault keeps the first
                # position of each slot, matching list.index semantics.
                first_position: dict[Any, int] = {}
                for position, local_slot in enumerate(experts):
                    first_position.setdefault(local_slot, position)
                try:
                    placement = [first_position[slot] for slot in range(num_local_experts)]
                except KeyError as err:
                    raise ValueError(
                        f"layer {layer_idx}, device {device_idx}: local expert slot {err.args[0]} is not in expert map"
                    ) from err
                device_list.append({"device_id": device_idx, "device_expert": placement})
            layer_list.append(
                {
                    "layer_id": layer_idx,
                    "device_count": len(layer_data),
                    "device_list": device_list,
                }
            )

        record: dict[str, Any] = {"moe_layer_count": len(layer_list), "layer_list": layer_list}
        with open(expert_map_record_path, "w", encoding="utf-8") as f:
            json.dump(record, f, indent=4)

    def do_update_expert_map(self, layer_id, updated_expert_map):
        """Copy ``updated_expert_map`` in place into the CPU expert map of ``layer_id``.

        The entry must already exist; in this class it is created by
        ``get_global_expert_map``.
        """
        self.expert_map_per_layer_cpu[layer_id].copy_(updated_expert_map)

    def do_update_expert_weight(self, layer_id, local_expert_to_replace, buffer_tensor_id):
        """Overwrite one local expert's tensors with the contents of a staging buffer.

        Args:
            layer_id: Key into ``self.expert_param_per_layer``.
            local_expert_to_replace: Index of the local expert to overwrite.
            buffer_tensor_id: Index into ``self.buffer_tensor_list``.
        """
        for expert_tensor, buffer_tensor in zip(
            self.expert_param_per_layer[layer_id][local_expert_to_replace], self.buffer_tensor_list[buffer_tensor_id]
        ):
            expert_tensor.copy_(buffer_tensor)
            # Lazy %-formatting: the message is only built when debug logging is enabled.
            logger.debug("Expert tensor shape is :%s", expert_tensor.shape)

    def do_update_log2phy_map(self, layer_id, updated_log2phy_map):
        """Copy ``updated_log2phy_map`` in place into the layer's log2phy map.

        Does nothing when the stored map for ``layer_id`` is ``None``.
        """
        if self.log2phy_map_per_layer[layer_id] is not None:
            self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map)

    def get_global_expert_map(self):
        """Stack the ``global_expert_map`` of every MoE layer into one tensor.

        As a side effect, stores this rank's row of each layer's map (moved to
        CPU) in ``self.expert_map_per_layer_cpu`` under the layer's absolute
        index.

        Returns:
            ``torch.stack`` of the per-layer CPU maps, in layer order.
        """
        all_layer_global_expert_map = []
        for layer_id in range(self.num_moe_layers):
            map_cpu = self.model.model.layers[self.num_dense_layers + layer_id].mlp.experts.global_expert_map.cpu()
            all_layer_global_expert_map.append(map_cpu)
            self.expert_map_per_layer_cpu[self.num_dense_layers + layer_id] = map_cpu[self.rank_id]

        return torch.stack(all_layer_global_expert_map)
|