[EPLB] EPLB Config Renaming (#5533)
### What this PR does / why we need it?
1. Rename `num_iterations_eplb_update` to `expert_heat_collection_interval`.
2. Rename `num_wait_worker_iterations` to `algorithm_execution_interval`.
3. Rename `init_redundancy_expert` to `num_redundant_experts`, matching the name vLLM uses for the variable with the same meaning.
4. Delete `gate_eplb`; this feature is no longer needed.
5. Move the EPLB options into a dedicated `eplb_config` dict inside `--additional-config` (see the migration sketch after the examples below).
6. Depends on pr5817.
### Does this PR introduce _any_ user-facing change?
Before this PR:
`--additional-config '{"dynamic_eplb": true, "num_iterations_eplb_update": 4000, "num_wait_worker_iterations": 150, "init_redundancy_expert": 16, "expert_map_path": "xxx.json"}'`
After this PR:
`--additional-config '{"eplb_config": {"dynamic_eplb": true, "expert_heat_collection_interval": 4000, "algorithm_execution_interval": 150, "num_redundant_experts": 16, "expert_map_path": "xxx.json"}}'`
### How was this patch tested?
#### Accuracy test: Qwen3-235B with EPLB, `num_redundant_experts=16`
Without pr5817:
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| aime2024 | 604a78 | accuracy | gen | 83.33 |
With pr5817:
| dataset | version | metric | mode | vllm-api-general-chat |
|----- | ----- | ----- | ----- | -----|
| aime2024 | 604a78 | accuracy | gen | 86.67 |
- vLLM version: v0.13.0
- vLLM main: 45c1ca1ca1
Signed-off-by: shenchuxiaofugui <1311027364@qq.com>
@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 from typing import TYPE_CHECKING, Optional
 
 from vllm.logger import logger
@@ -44,6 +45,9 @@ class AscendConfig:
         self.finegrained_tp_config = FinegrainedTPConfig(
             finegrained_tp_config, vllm_config)
 
+        eplb_config = additional_config.get("eplb_config", {})
+        self.eplb_config = EplbConfig(eplb_config)
+
         # Dump / PrecisionDebugger configuration
         self.dump_config_path = additional_config.get("dump_config_path", None)
 
@@ -58,20 +62,6 @@ class AscendConfig:
                 "using it without these features may result in significant performance degradation."
             )
 
-        # Todo: Once https://github.com/vllm-project/vllm/issues/22246 is merged in vllm. Remove this config
-        self.expert_map_path = additional_config.get("expert_map_path", None)
-        self.eplb_policy_type = additional_config.get("eplb_policy_type", 1)
-        self.expert_map_record_path = additional_config.get(
-            "expert_map_record_path",
-            None)  # Provide path to export expert map
-        self.init_redundancy_expert = additional_config.get(
-            "init_redundancy_expert", 0)
-        self.dynamic_eplb = additional_config.get("dynamic_eplb", False)
-        self.num_iterations_eplb_update = additional_config.get(
-            "num_iterations_eplb_update", 400)
-        self.gate_eplb = additional_config.get("gate_eplb", False)
-        self.num_wait_worker_iterations = additional_config.get(
-            "num_wait_worker_iterations", 30)
         self.enable_shared_expert_dp = additional_config.get(
             "enable_shared_expert_dp",
             False) and vllm_config.parallel_config.enable_expert_parallel
@@ -275,6 +265,62 @@ class WeightPrefetchConfig:
             "prefetch_ratio", self.prefetch_ratio)
 
 
+class EplbConfig:
+    """
+    Configuration object for eplb_config from additional_config.
+    """
+    _defaults = {
+        "dynamic_eplb": False,
+        "expert_map_path": None,
+        "expert_heat_collection_interval": 400,
+        "algorithm_execution_interval": 30,
+        "expert_map_record_path": None,
+        "num_redundant_experts": 0,
+        "eplb_policy_type": 1
+    }
+
+    def __init__(self, user_config: dict = {}):
+        self.config = self._defaults.copy()
+        if user_config and isinstance(user_config, dict):
+            for key, value in user_config.items():
+                if key in self.config:
+                    self.config[key] = value
+                else:
+                    raise ValueError(f"Config has no attribute '{key}'")
+
+        self._validate_config()
+
+    def __getattr__(self, key):
+        if key in self.config:
+            return self.config[key]
+        raise AttributeError(f"Config has no attribute '{key}'")
+
+    def _validate_config(self):
+        if self.expert_map_path is not None:
+            if self.expert_map_path[-5:] != ".json":
+                raise TypeError("The expert_map_path must be a json file.")
+            if not os.path.exists(self.expert_map_path):
+                raise ValueError("The expert_map_path does not exist.")
+        if self.expert_map_record_path is not None:
+            self.config["dynamic_eplb"] = True
+            if self.expert_map_record_path[-5:] != ".json":
+                raise TypeError("The expert_map_record_path must be a json file.")
+            dirname = os.path.dirname(self.expert_map_record_path)
+            os.makedirs(dirname, exist_ok=True)
+        for key in [
+                "expert_heat_collection_interval",
+                "algorithm_execution_interval", "num_redundant_experts"
+        ]:
+            if not isinstance(self.config[key], int):
+                raise TypeError(f"{key} must be an integer")
+            if self.config[key] < 0:  # type: ignore
+                raise ValueError(
+                    f"{key} must be non-negative; got {self.config[key]} instead"
+                )
+        if self.eplb_policy_type not in [0, 1, 2, 3]:
+            raise ValueError("eplb_policy_type must be in [0, 1, 2, 3]")
+
+
 _ASCEND_CONFIG: Optional[AscendConfig] = None
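As a quick sanity check on the class above, the sketch below exercises its defaulting, `__getattr__` delegation, and unknown-key rejection; the import path `vllm_ascend.ascend_config` is assumed from the file this diff modifies:

```python
# Minimal sketch exercising EplbConfig as added above; the import
# path is an assumption based on the file this diff modifies.
from vllm_ascend.ascend_config import EplbConfig

# Unspecified keys fall back to _defaults; attribute lookups on the
# instance are delegated to the underlying dict via __getattr__.
cfg = EplbConfig({"dynamic_eplb": True, "num_redundant_experts": 16})
assert cfg.dynamic_eplb is True
assert cfg.num_redundant_experts == 16
assert cfg.expert_heat_collection_interval == 400  # default

# Keys outside _defaults are rejected at construction time, so a
# leftover gate_eplb from an old flat config fails loudly.
try:
    EplbConfig({"gate_eplb": True})
except ValueError as exc:
    print(exc)  # Config has no attribute 'gate_eplb'
```

Because `__getattr__` only runs when normal attribute lookup fails, values in `self.config` read like plain attributes, while a stale key from the old flat layout raises immediately instead of being silently ignored.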