cleanup ascend config (#5296)

1. Refresh the additional config doc.
2. Move the KV config logic to the platform layer.
3. Improve the `dump_config` init logic and rename it to `dump_config_path`.
   This change is user-impacting: `dump_config` changes from a dict to a
   string (see the sketch after this list).
4. Correct the `enable_async_exponential` type.
5. Remove the unused `chunked_prefill_for_mla`.
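
A minimal sketch of the user-facing change from items 3 and 4, assuming the Ascend config is still supplied through vLLM's `additional_config` as vllm-ascend does today; the model name, config path, and the old-style keys shown in the comment are illustrative only, not taken from this commit:

from vllm import LLM

# Old style (illustrative): dump_config was a dict such as
#   {"dump_config": {"enable_dump": True, "config_path": "/path/to/msprobe.json"}}
# New style: dump_config_path is a plain string; dumping is enabled whenever it is
# set, and only works with enforce_eager=True (see the RuntimeError in the diff below).
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # illustrative model
    enforce_eager=True,
    additional_config={
        "dump_config_path": "/path/to/msprobe.json",  # str replaces the old dict
        "enable_async_exponential": True,  # now treated as a bool (item 4)
    },
)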

- vLLM version: release/v0.13.0
- vLLM main: ad32e3e19c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Authored by wangxiyuan, committed by GitHub on 2025-12-26 14:07:37 +08:00
Parent: adaa89a7a5
Commit: 29d2fe653d
11 changed files with 98 additions and 118 deletions

@@ -216,13 +216,12 @@ class NPUModelRunner(GPUModelRunner):
         self.ascend_config = get_ascend_config()
         set_weight_prefetch_method(self.ascend_config.weight_prefetch_config)
         # Dump / PrecisionDebugger configuration now comes from AscendConfig
-        dump_cfg = self.ascend_config.dump_config
-        self.dump_enable = dump_cfg.enable_dump
+        dump_cfg = self.ascend_config.dump_config_path
         self.debugger = None
-        if self.dump_enable:
+        if dump_cfg is not None:
             if self.model_config.enforce_eager:
                 from msprobe.pytorch import PrecisionDebugger
-                self.debugger = PrecisionDebugger(dump_cfg.config_path)
+                self.debugger = PrecisionDebugger(dump_cfg)
             else:
                 raise RuntimeError(
                     "Dumping/debugging only works in eager mode.")
@@ -1388,9 +1387,7 @@ class NPUModelRunner(GPUModelRunner):
             self.eplb_updator.take_update_info_from_eplb_process()
         # prevent debugger is None
-        need_dump = self.dump_enable and self.debugger is not None
-        if need_dump:
-            assert self.debugger is not None
+        if self.debugger is not None:
             dbg_cfg = getattr(self.debugger, "config", None)
             dump_level = str(
                 getattr(dbg_cfg, "level",
@@ -1407,7 +1404,7 @@ class NPUModelRunner(GPUModelRunner):
         aclgraph_runtime_mode, batch_descriptor = \
             self.cudagraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
-        if self.ascend_config.enable_async_exponential != 0:
+        if self.ascend_config.enable_async_exponential:
             self.sampler.do_async_exponential(
                 b_s=logits_indices.shape[0],
                 head_dim=self.model_config.get_vocab_size(),
@@ -1457,8 +1454,7 @@ class NPUModelRunner(GPUModelRunner):
         if not broadcast_pp_output:
             hidden_states.kv_connector_output = kv_connector_output
             self.kv_connector_output = kv_connector_output
-        if need_dump:
-            assert self.debugger is not None
+        if self.debugger is not None:
             self.debugger.stop()
             self.debugger.step()
         return hidden_states
@@ -1472,8 +1468,7 @@ class NPUModelRunner(GPUModelRunner):
             hidden_states,
             scheduler_output.total_num_scheduled_tokens,
             num_scheduled_tokens_np)
-        if need_dump:
-            assert self.debugger is not None
+        if self.debugger is not None:
             self.debugger.stop()
             self.debugger.step()
         return pool_output
@@ -1529,7 +1524,6 @@ class NPUModelRunner(GPUModelRunner):
             output.kv_connector_output = kv_connector_output
             return output
-        need_dump = self.dump_enable and self.debugger is not None
         # Unpack ephemeral state.
         (
             scheduler_output,
@@ -1628,13 +1622,13 @@ class NPUModelRunner(GPUModelRunner):
         if self.dynamic_eplb:
             self.eplb_updator.forward_end()
         if not self.use_async_scheduling:
-            if need_dump:
+            if self.debugger is not None:
                 assert self.debugger is not None
                 self.debugger.stop()
                 self.debugger.step()
             return model_runner_output
-        if need_dump:
+        if self.debugger is not None:
             assert self.debugger is not None
             self.debugger.stop()
             self.debugger.step()