diff --git a/tests/ut/eplb/core/test_eplb_utils.py b/tests/ut/eplb/core/test_eplb_utils.py
index 51133d80..e2a3cc85 100644
--- a/tests/ut/eplb/core/test_eplb_utils.py
+++ b/tests/ut/eplb/core/test_eplb_utils.py
@@ -45,6 +45,7 @@ class TestAscendConfig(unittest.TestCase):
         self.vllm_config = vllm_config
         self.moe_config = moe_config
         self.mock_npu = patch("torch.Tensor.npu", new=lambda self: self).start()
+        os.environ["DYNAMIC_EPLB"] = "true"
 
     def test_init_eplb_config_with_eplb(self):
         eplb_config = init_ascend_config(self.vllm_config).eplb_config
@@ -71,6 +72,6 @@ class TestAscendConfig(unittest.TestCase):
         eplb_config = init_ascend_config(self.vllm_config).eplb_config
         _, expert_map, log2phy, redundant_experts = init_eplb_config(eplb_config, 0, self.moe_config)
         gt_expert_map = torch.tensor([-1, -1, -1, -1, 0, 1, 2, 3])
-        print(expert_map, log2phy, redundant_experts)
+        self.assertIsNone(log2phy)
         self.assertTrue(torch.equal(expert_map, gt_expert_map))
         self.assertEqual(redundant_experts, 0)
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
index 5521b252..31cf69c5 100644
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -385,6 +385,7 @@ class EplbConfig:
 
     def _validate_config(self):
         if self.expert_map_path is not None:
+            logger.info(f"The expert_map_path is {self.expert_map_path}")
             if self.expert_map_path[-5:] != ".json":
                 raise TypeError("The expert_map is not json.")
             if not os.path.exists(self.expert_map_path):
@@ -402,6 +403,14 @@
                 raise ValueError(f"{key} must greater than 0; got {self.config[key]} instead")
         if self.eplb_policy_type not in [0, 1, 2, 3]:
             raise ValueError("eplb_policy_type must in [0, 1, 2, 3]")
+        if self.config["dynamic_eplb"] and not (
+            os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1")
+            or os.getenv("EXPERT_MAP_RECORD", "false").lower() in ("true", "1")
+        ):
+            raise ValueError("The environment variable DYNAMIC_EPLB or EXPERT_MAP_RECORD of the ePLB must be set to true.")
+
+        logger.info(f"Dynamic EPLB is {self.config['dynamic_eplb']}")
+        logger.info(f"The number of redundant experts is {self.config['num_redundant_experts']}")
 
 
 _ASCEND_CONFIG: AscendConfig | None = None
diff --git a/vllm_ascend/eplb/adaptor/abstract_adaptor.py b/vllm_ascend/eplb/adaptor/abstract_adaptor.py
deleted file mode 100644
index ff58e170..00000000
--- a/vllm_ascend/eplb/adaptor/abstract_adaptor.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-#
-# Todo: Once https://github.com/vllm-project/vllm/issues/22246 is merged in vllm. Remove this adaptor.
-from abc import abstractmethod
-from typing import Any
-
-
-class EplbAdaptor:
-    def __init__(self, **args):
-        pass
-
-    @abstractmethod
-    def get_rank_expert_workload(self):
-        raise NotImplementedError
-
-    @abstractmethod
-    def do_update_expert_map(self, layer_id: Any, updated_expert_map: Any) -> Any:
-        raise NotImplementedError
-
-    @abstractmethod
-    def do_update_expert_weight(self, layer_id: Any, local_expert_to_replace: Any, buffer_tensor_id: Any) -> Any:
-        raise NotImplementedError
diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py
index 1e3783c5..cf414ac5 100644
--- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py
+++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py
@@ -22,10 +22,7 @@
 import torch
 import torch.distributed as dist
 from vllm.logger import logger
 
-from vllm_ascend.eplb.adaptor.abstract_adaptor import EplbAdaptor
-
-class VllmEplbAdaptor(EplbAdaptor):
+class VllmEplbAdaptor:
     def __init__(self, model, **args):
-        super().__init__(**args)
         self.model = model
diff --git a/vllm_ascend/eplb/utils.py b/vllm_ascend/eplb/utils.py
index a6a577b3..93cb4d60 100644
--- a/vllm_ascend/eplb/utils.py
+++ b/vllm_ascend/eplb/utils.py
@@ -28,47 +28,25 @@ def get_log2phy_map(self, layer_id):
     return self.model.layers[layer_id].mlp.experts.get_log2phy_map()
 
 
-def get_all_expert_map(self, num_moe_layers):
-    all_loads = []
-    num_dense_layers = self.num_dense_layers if hasattr(self, "num_dense_layers") else 0
-    for layer_id in range(num_moe_layers):
-        load_tensor = self.get_expert_map(layer_id + num_dense_layers)  # (num_experts_per_layer,)
-        all_loads.append(load_tensor)
-
-    return torch.stack(all_loads, dim=0)
-
-
 def get_all_moe_loads(self):
-    num_dense_layers = self.num_dense_layers if hasattr(self, "num_dense_layers") else 0
+    num_dense_layers = getattr(self.model.config, "first_k_dense_replace", 0)
+    num_layers = self.model.config.num_hidden_layers
     all_moe_loads = torch.stack(
-        [
-            self.model.layers[layer_id + num_dense_layers].mlp.experts.moe_load
-            for layer_id in range(self.num_moe_layers)
-        ],
+        [self.model.layers[layer_id].mlp.experts.moe_load for layer_id in range(num_dense_layers, num_layers)],
         dim=0,
     )
     return all_moe_loads
 
 
 def clear_all_moe_loads(self):
-    num_dense_layers = self.num_dense_layers if hasattr(self, "num_dense_layers") else 0
-    for layer_id in range(self.num_moe_layers):
-        self.model.layers[layer_id + num_dense_layers].mlp.experts.clear_moe_load()
+    num_dense_layers = getattr(self.model.config, "first_k_dense_replace", 0)
+    num_layers = self.model.config.num_hidden_layers
+    for layer_id in range(num_dense_layers, num_layers):
+        self.model.layers[layer_id].mlp.experts.clear_moe_load()
 
 
-def model_register(model, model_config):
+def model_register(model):
     model.get_expert_map = types.MethodType(get_expert_map, model)
     model.get_log2phy_map = types.MethodType(get_log2phy_map, model)
-    model.get_all_expert_map = types.MethodType(get_all_expert_map, model)
     model.get_all_moe_loads = types.MethodType(get_all_moe_loads, model)
     model.clear_all_moe_loads = types.MethodType(clear_all_moe_loads, model)
-
-    config = model_config.hf_text_config
-
-    if config.model_type == "qwen3_moe":
-        model.num_moe_layers = config.num_hidden_layers
-    elif config.model_type == "deepseek_v2" or config.model_type == "deepseek_v3":
-        model.num_dense_layers = config.first_k_dense_replace
-        model.num_moe_layers = config.num_hidden_layers - model.num_dense_layers
-    else:
-        raise NotImplementedError("EPLB is not supported.")
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index db701ed2..7da147e2 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2308,7 +2308,7 @@ class NPUModelRunner(GPUModelRunner):
         with DeviceMemoryProfiler() as m:  # noqa: SIM117
             self.model = get_model(vllm_config=self.vllm_config)
             if self.dynamic_eplb:
-                model_register(self.model, self.model_config)
+                model_register(self.model)
             if self.drafter:
                 logger.info("Loading drafter model...")
                 with get_tp_context(self.drafter):