From 29d2fe653d1864b1a8dbc87f3009099603dd0818 Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Fri, 26 Dec 2025 14:07:37 +0800
Subject: [PATCH] cleanup ascend config (#5296)

1. refresh the additional config doc
2. move kv config logic to platform.
3. improve `dump_config` init logic and rename it to `dump_config_path`. This change is user-facing: `dump_config` is changed from a dict to a string.
4. correct the `enable_async_exponential` type
5. remove the unused `chunked_prefill_for_mla`

- vLLM version: release/v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9

Signed-off-by: wangxiyuan
---
 .../performance_and_debug/msprobe_guide.md    |  2 +-
 .../configuration/additional_config.md        | 59 ++++++++++-------
 ...test_prefix_cache_deepseek_r1_0528_w8a8.py |  5 +-
 .../models/test_deepseek_r1_0528_w8a8.py      |  5 +-
 .../config/models/DeepSeek-R1-W8A8-A2.yaml    |  4 +-
 tests/e2e/singlecard/test_sampler.py          |  2 +-
 vllm_ascend/ascend_config.py                  | 64 +------------------
 vllm_ascend/platform.py                       | 18 ++++--
 vllm_ascend/sample/sampler.py                 |  2 +-
 vllm_ascend/utils.py                          | 31 +++++++++
 vllm_ascend/worker/model_runner_v1.py         | 24 +++----
 11 files changed, 98 insertions(+), 118 deletions(-)

diff --git a/docs/source/developer_guide/performance_and_debug/msprobe_guide.md b/docs/source/developer_guide/performance_and_debug/msprobe_guide.md
index 456809ba..b16d2918 100644
--- a/docs/source/developer_guide/performance_and_debug/msprobe_guide.md
+++ b/docs/source/developer_guide/performance_and_debug/msprobe_guide.md
@@ -99,7 +99,7 @@ JSON
       --enforce-eager \
       --host 0.0.0.0 \
       --port 8000 \
-      --additional-config '{"dump_config": "/data/msprobe_config.json"}' &
+      --additional-config '{"dump_config_path": "/data/msprobe_config.json"}' &
 ```
 
 ## 3. Send requests and collect dumps
diff --git a/docs/source/user_guide/configuration/additional_config.md b/docs/source/user_guide/configuration/additional_config.md
index a1130e72..f8f398d6 100644
--- a/docs/source/user_guide/configuration/additional_config.md
+++ b/docs/source/user_guide/configuration/additional_config.md
@@ -24,29 +24,35 @@ LLM(model="Qwen/Qwen3-8B", additional_config={"config_key":"config_value"})
 
 The following table lists additional configuration options available in vLLM Ascend:
 
-| Name | Type | Default | Description |
-|-------------------------------------|------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------|
-| `xlite_graph_config` | dict | `{}` | Configuration options for xlite graph mode |
-| `finegrained_tp_config` | dict | `{}` | Configuration options for module tensor parallelism |
-| `weight_prefetch_config` | dict | `{}` | Configuration options for weight prefetch |
-| `refresh` | bool | `false` | Whether to refresh global Ascend configuration content. This is usually used by rlhf or ut/e2e test case. |
-| `expert_map_path` | str | `None` | When using expert load balancing for an MoE model, an expert map path needs to be passed in. |
+| Name | Type | Default | Description |
+|-------------------------------------|------|---------|-------------------------------------------------------------------------------------------------------------|
+| `xlite_graph_config` | dict | `{}` | Configuration options for xlite graph mode |
+| `weight_prefetch_config` | dict | `{}` | Configuration options for weight prefetch |
+| `finegrained_tp_config` | dict | `{}` | Configuration options for module tensor parallelism |
+| `ascend_compilation_config` | dict | `{}` | Configuration options for ascend compilation |
+| `refresh` | bool | `false` | Whether to refresh global Ascend configuration content. This is usually used by RLHF or UT/e2e test cases. |
+| `dump_config_path` | str | `None` | Configuration file path for msprobe dump (eager mode). |
+| `enable_async_exponential` | bool | `False` | Whether to enable async exponential overlap. |
 | `enable_shared_expert_dp` | bool | `False` | When the expert is shared in DP, it delivers better performance but consumes more memory. Currently only DeepSeek series models are supported. |
-| `lmhead_tensor_parallel_size` | int | `None` | The custom tensor parallel size of lmhead. Restriction: Can only be used when tensor_parallel=1 |
-| `oproj_tensor_parallel_size` | int | `None` | The custom tensor parallel size of oproj. |
-| `multistream_overlap_shared_expert` | bool | `False` | Whether to enable multistream shared expert. This option only takes effect on MoE models with shared experts. |
-| `dynamic_eplb` | bool | `False` | Whether to enable dynamic EPLB. |
-| `num_iterations_eplb_update` | int | `400` | Forward iterations when EPLB begins. |
-| `gate_eplb` | bool | `False` | Whether to enable EPLB only once. |
-| `num_wait_worker_iterations` | int | `30` | The forward iterations when the EPLB worker will finish CPU tasks. In our test default value 30 can cover most cases. |
-| `expert_map_record_path` | str | `None` | Save the expert load calculation results to a new expert table in the specified directory. |
-| `init_redundancy_expert` | int | `0` | Specify redundant experts during initialization. |
-| `dump_config` | str | `None` | Configuration file path for msprobe dump(eager mode). |
-| `enable_async_exponential` | int | `0` | Whether to enable async exponential overlap. To enable async exponential, set this config to 1. |
+| `multistream_overlap_shared_expert` | bool | `False` | Whether to enable multistream shared expert. This option only takes effect on MoE models with shared experts. |
+| `multistream_overlap_gate` | bool | `False` | Whether to enable multistream overlap gate. This option only takes effect on MoE models with shared experts. |
+| `recompute_scheduler_enable` | bool | `False` | Whether to enable the recompute scheduler. |
+| `enable_cpu_binding` | bool | `False` | Whether to enable CPU binding. |
+| `SLO_limits_for_dynamic_batch` | int | `-1` | SLO limits for dynamic batch. This enables a new scheduler that supports the dynamic batch feature. |
+| `enable_npugraph_ex` | bool | `False` | Whether to enable npugraph ex graph mode. |
+| `pa_shape_list` | list | `[]` | The custom shape list of page attention ops. |
+| `dynamic_eplb` | bool | `False` | Whether to enable dynamic EPLB. |
+| `expert_map_path` | str | `None` | When using expert load balancing for an MoE model, an expert map path needs to be passed in. |
+| `num_iterations_eplb_update` | int | `400` | Forward iterations when EPLB begins. |
+| `gate_eplb` | bool | `False` | Whether to enable EPLB only once. |
+| `num_wait_worker_iterations` | int | `30` | The forward iterations when the EPLB worker will finish CPU tasks. In our test default value 30 can cover most cases. |
+| `expert_map_record_path` | str | `None` | Save the expert load calculation results to a new expert table in the specified directory. |
+| `init_redundancy_expert` | int | `0` | Specify redundant experts during initialization. |
 
 The details of each configuration option are as follows:
 
 **xlite_graph_config**
 
+
 | Name | Type | Default | Description |
 | ---- | ---- | ------- | ----------- |
 | `enabled` | bool | `False` | Whether to enable xlite graph mode. Currently only Llama, Qwen dense series models, and Qwen3-vl are supported. |
@@ -57,16 +63,23 @@ The details of each configuration option are as follows:
 | Name | Type | Default | Description |
 |------------------|------|-------------------------------------------------------------|------------------------------------|
 | `enabled` | bool | `False` | Whether to enable weight prefetch. |
-| `prefetch_ratio` | dict | `{"attn": {"qkv": 1.0, "o": 1.0}, "moe": {"gate_up": 0.8}}`  | Prefetch ratio of each weight.     |
+| `prefetch_ratio` | dict | `{"attn": {"qkv": 1.0, "o": 1.0}, "moe": {"gate_up": 0.8}}` | Prefetch ratio of each weight. |
 
 **finegrained_tp_config**
 
 | Name | Type | Default | Description |
 | ---- | ---- | ------- | ----------- |
 | `lmhead_tensor_parallel_size` | int | `0` | The custom tensor parallel size of lmhead. |
 | `oproj_tensor_parallel_size` | int | `0` | The custom tensor parallel size of oproj. |
 | `embedding_tensor_parallel_size` | int | `0` | The custom tensor parallel size of embedding. |
 | `mlp_tensor_parallel_size` | int | `0` | The custom tensor parallel size of mlp. |
+
+**ascend_compilation_config**
+
+| Name | Type | Default | Description |
+| ---- | ---- | ------- | ----------- |
+| `fuse_norm_quant` | bool | `True` | Whether to enable the fuse_norm_quant pass. |
+| `fuse_qknorm_rope` | bool | `False` | Whether to enable the fuse_qknorm_rope pass. It is set to `True` by default when Triton is installed. |
 
 ### Example
 
diff --git a/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
index a56dce5c..f1654036 100644
--- a/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/features/test_prefix_cache_deepseek_r1_0528_w8a8.py
@@ -73,10 +73,7 @@ async def test_models(model: str) -> None:
         "HCCL_BUFFSIZE": "1024",
         "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
     }
-    additional_config = {
-        "chunked_prefill_for_mla": True,
-        "enable_weight_nz_layout": True
-    }
+    additional_config = {"enable_weight_nz_layout": True}
     speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
     server_args = [
         "--quantization", "ascend", "--data-parallel-size", "2",
diff --git a/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
index 60162a76..0f1e4ff1 100644
--- a/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
+++ b/tests/e2e/nightly/models/test_deepseek_r1_0528_w8a8.py
@@ -76,10 +76,7 @@ async def test_models(model: str, mode: str) -> None:
         "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True"
     }
     speculative_config = {"num_speculative_tokens": 1, "method": "mtp"}
-    additional_config = {
-        "chunked_prefill_for_mla": True,
-        "enable_weight_nz_layout": True
-    }
+    additional_config = {"enable_weight_nz_layout": True}
     server_args = [
         "--quantization", "ascend", "--data-parallel-size", "2",
         "--tensor-parallel-size", "8", "--enable-expert-parallel", "--port",
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
index f672dde5..636f7d7b 100644
--- a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
+++ b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
@@ -31,7 +31,7 @@ deployment:
       --gpu-memory-utilization 0.9
       --enforce-eager
       --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
-      --additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"enable_weight_nz_layout":true}'
 
   -
     server_cmd: >
@@ -53,5 +53,5 @@ deployment:
       --gpu-memory-utilization 0.9
       --enforce-eager
       --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
-      --additional-config '{"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+      --additional-config '{"enable_weight_nz_layout":true}'
 benchmarks:
diff --git a/tests/e2e/singlecard/test_sampler.py b/tests/e2e/singlecard/test_sampler.py
index fbb03913..31c065cc 100644
--- a/tests/e2e/singlecard/test_sampler.py
+++ b/tests/e2e/singlecard/test_sampler.py
@@ -62,6 +62,6 @@ def test_qwen3_exponential_overlap() -> None:
                     max_model_len=8192,
                     gpu_memory_utilization=0.7,
                     additional_config={
-                        "enable_async_exponential": 1,
+                        "enable_async_exponential": True,
                     }) as runner:
         runner.generate(example_prompts, sampling_params)
diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py
index c3474365..8be434a1 100644
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -14,43 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Optional
-from uuid import uuid4
 
 from vllm.logger import logger
 from vllm.triton_utils import HAS_TRITON
 
 
-def check_kv_extra_config(vllm_config):
-
-    def _check(name: str, config: dict):
-        tp_key = "tp_size"
-        dp_key = "dp_size"
-        if tp_key in config:
-            config_tp = config[tp_key]
-            vllm_tp = vllm_config.parallel_config.tensor_parallel_size
-            if config_tp != vllm_tp:
-                raise ValueError(
-                    f"KV transfer '{name}' config has a conflicting tensor parallel size. "
-                    f"Expected {vllm_tp}, but got {config_tp}.")
-        if dp_key in config:
-            config_dp = config[dp_key]
-            vllm_dp = vllm_config.parallel_config.data_parallel_size
-            if config_dp != vllm_dp:
-                raise ValueError(
-                    f"KV transfer '{name}' config has a conflicting data parallel size. "
-                    f"Expected {vllm_dp}, but got {config_dp}.")
-
-    if vllm_config.kv_transfer_config.is_kv_producer:
-        _check(
-            "prefill",
-            vllm_config.kv_transfer_config.get_from_extra_config(
-                "prefill", {}))
-    if vllm_config.kv_transfer_config.is_kv_consumer:
-        _check(
-            "decode",
-            vllm_config.kv_transfer_config.get_from_extra_config("decode", {}))
-
-
 class AscendConfig:
     """
     Configuration Object for additional_config from vllm.configs.
@@ -74,8 +42,7 @@ class AscendConfig:
             finegrained_tp_config, vllm_config)
 
         # Dump / PrecisionDebugger configuration
-        dump_config_path = additional_config.get("dump_config", None)
-        self.dump_config = DumpConfig(dump_config_path)
+        self.dump_config_path = additional_config.get("dump_config_path", None)
 
         weight_prefetch_config = additional_config.get(
             "weight_prefetch_config", {})
@@ -96,8 +63,6 @@ class AscendConfig:
         self.gate_eplb = additional_config.get("gate_eplb", False)
         self.num_wait_worker_iterations = additional_config.get(
             "num_wait_worker_iterations", 30)
-        self.chunked_prefill_for_mla = additional_config.get(
-            "chunked_prefill_for_mla", False)
         self.enable_shared_expert_dp = additional_config.get(
             "enable_shared_expert_dp",
             False) and vllm_config.parallel_config.enable_expert_parallel
@@ -114,9 +79,6 @@ class AscendConfig:
         self.enable_cpu_binding = additional_config.get(
             "enable_cpu_binding", False)
 
-        if vllm_config.kv_transfer_config is not None:
-            check_kv_extra_config(vllm_config)
-
         self.pd_tp_ratio = 1
         self.pd_head_ratio = 1
         self.num_head_replica = 1
@@ -156,16 +118,8 @@ class AscendConfig:
         # npu_fused_infer_attention_score performs better on all scenarios.
         self.pa_shape_list = additional_config.get("pa_shape_list", [])
 
-        kv_cfg = vllm_config.kv_transfer_config
-        if kv_cfg is not None and not getattr(kv_cfg, "_engine_id_patched",
-                                              False):
-            kv_cfg.engine_id = f"{kv_cfg.engine_id}-{uuid4().hex}"
-            kv_cfg._engine_id_patched = True
-        self.enable_async_exponential = additional_config.get(
-            "enable_async_exponential", 0)
-        if self.enable_async_exponential not in (0, 1):
-            raise AssertionError(
-                "Enable async exponential can only be set to 0 or 1.")
+        self.enable_async_exponential = bool(
+            additional_config.get("enable_async_exponential", False))
 
 
 class FinegrainedTPConfig:
@@ -274,18 +228,6 @@ class XliteGraphConfig:
         )
 
 
-class DumpConfig:
-    """
-    Configuration object for dump/PrecisionDebugger settings.
-    """
-
-    def __init__(self, dump_config_path: Optional[str] = None):
-        # enable_dump is True when dump_cfg exists and config_path is not empty
-        self.enable_dump: bool = bool(dump_config_path)
-        # Path to msprobe config json; may be None.
-        self.config_path: Optional[str] = dump_config_path
-
-
 class WeightPrefetchConfig:
     """
     Configuration Object for weight_prefetch_config from additional_config
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index b98e7104..cf86859e 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -18,6 +18,7 @@
 import gc
 import os
 from typing import TYPE_CHECKING, Optional, Tuple
+from uuid import uuid4
 
 import torch
 from vllm.logger import logger
@@ -30,12 +31,11 @@ from vllm_ascend.ascend_config import init_ascend_config
 from vllm_ascend.utils import refresh_block_size
 
 # isort: off
-from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD,
-                               COMPRESSED_TENSORS_METHOD, AscendDeviceType,
-                               enable_sp, get_ascend_device_type, is_vl_model,
-                               update_aclgraph_sizes,
-                               update_cudagraph_capture_sizes,
-                               update_default_aclgraph_sizes)
+from vllm_ascend.utils import (
+    ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD, AscendDeviceType,
+    enable_sp, get_ascend_device_type, is_vl_model, update_aclgraph_sizes,
+    update_cudagraph_capture_sizes, update_default_aclgraph_sizes,
+    check_kv_extra_config)
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
@@ -152,6 +152,12 @@ class NPUPlatform(Platform):
 
         # initialize ascend config from vllm additional_config
         ascend_config = init_ascend_config(vllm_config)
+        if vllm_config.kv_transfer_config is not None:
+            check_kv_extra_config(vllm_config)
+            if not getattr(vllm_config.kv_transfer_config,
+                           "_engine_id_patched", False):
+                vllm_config.kv_transfer_config.engine_id = f"{vllm_config.kv_transfer_config.engine_id}-{uuid4().hex}"
+                vllm_config.kv_transfer_config._engine_id_patched = True
 
         from vllm.config import CompilationMode  # noqa: E402
         compilation_config = vllm_config.compilation_config
diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py
index de043e95..9363aa49 100644
--- a/vllm_ascend/sample/sampler.py
+++ b/vllm_ascend/sample/sampler.py
@@ -83,7 +83,7 @@ class AscendTopKTopPSampler(TopKTopPSampler):
             logits_to_return = logits.log_softmax(dim=-1, dtype=torch.float32)
 
         probs = logits.softmax(dim=-1, dtype=torch.float32)
-        if get_ascend_config().enable_async_exponential == 1:
+        if get_ascend_config().enable_async_exponential:
             # Add synchronize to prevent synchronize error.
             self.async_event.synchronize()
         return probs.div_(self.q).argmax(dim=-1).view(-1), logits_to_return
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 8ab33b49..97f8e2b6 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -1084,3 +1084,34 @@ def dispose_layer(layer: Any):
 def replace_layer(original_layer: Any, new_layer: Any):
     original_layer.__class__ = new_layer.__class__
     original_layer.__dict__ = new_layer.__dict__
+
+
+def check_kv_extra_config(vllm_config):
+
+    def _check(name: str, config: dict):
+        tp_key = "tp_size"
+        dp_key = "dp_size"
+        if tp_key in config:
+            config_tp = config[tp_key]
+            vllm_tp = vllm_config.parallel_config.tensor_parallel_size
+            if config_tp != vllm_tp:
+                raise ValueError(
+                    f"KV transfer '{name}' config has a conflicting tensor parallel size. "
+                    f"Expected {vllm_tp}, but got {config_tp}.")
+        if dp_key in config:
+            config_dp = config[dp_key]
+            vllm_dp = vllm_config.parallel_config.data_parallel_size
+            if config_dp != vllm_dp:
+                raise ValueError(
+                    f"KV transfer '{name}' config has a conflicting data parallel size. "
" + f"Expected {vllm_dp}, but got {config_dp}.") + + if vllm_config.kv_transfer_config.is_kv_producer: + _check( + "prefill", + vllm_config.kv_transfer_config.get_from_extra_config( + "prefill", {})) + if vllm_config.kv_transfer_config.is_kv_consumer: + _check( + "decode", + vllm_config.kv_transfer_config.get_from_extra_config("decode", {})) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index c2b11d92..9c924e7f 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -216,13 +216,12 @@ class NPUModelRunner(GPUModelRunner): self.ascend_config = get_ascend_config() set_weight_prefetch_method(self.ascend_config.weight_prefetch_config) # Dump / PrecisionDebugger configuration now comes from AscendConfig - dump_cfg = self.ascend_config.dump_config - self.dump_enable = dump_cfg.enable_dump + dump_cfg = self.ascend_config.dump_config_path self.debugger = None - if self.dump_enable: + if dump_cfg is not None: if self.model_config.enforce_eager: from msprobe.pytorch import PrecisionDebugger - self.debugger = PrecisionDebugger(dump_cfg.config_path) + self.debugger = PrecisionDebugger(dump_cfg) else: raise RuntimeError( "Dumping/debugging only works in eager mode.") @@ -1388,9 +1387,7 @@ class NPUModelRunner(GPUModelRunner): self.eplb_updator.take_update_info_from_eplb_process() # prevent debugger is None - need_dump = self.dump_enable and self.debugger is not None - if need_dump: - assert self.debugger is not None + if self.debugger is not None: dbg_cfg = getattr(self.debugger, "config", None) dump_level = str( getattr(dbg_cfg, "level", @@ -1407,7 +1404,7 @@ class NPUModelRunner(GPUModelRunner): aclgraph_runtime_mode, batch_descriptor = \ self.cudagraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora) - if self.ascend_config.enable_async_exponential != 0: + if self.ascend_config.enable_async_exponential: self.sampler.do_async_exponential( b_s=logits_indices.shape[0], head_dim=self.model_config.get_vocab_size(), @@ -1457,8 +1454,7 @@ class NPUModelRunner(GPUModelRunner): if not broadcast_pp_output: hidden_states.kv_connector_output = kv_connector_output self.kv_connector_output = kv_connector_output - if need_dump: - assert self.debugger is not None + if self.debugger is not None: self.debugger.stop() self.debugger.step() return hidden_states @@ -1472,8 +1468,7 @@ class NPUModelRunner(GPUModelRunner): hidden_states, scheduler_output.total_num_scheduled_tokens, num_scheduled_tokens_np) - if need_dump: - assert self.debugger is not None + if self.debugger is not None: self.debugger.stop() self.debugger.step() return pool_output @@ -1529,7 +1524,6 @@ class NPUModelRunner(GPUModelRunner): output.kv_connector_output = kv_connector_output return output - need_dump = self.dump_enable and self.debugger is not None # Unpack ephemeral state. ( scheduler_output, @@ -1628,13 +1622,13 @@ class NPUModelRunner(GPUModelRunner): if self.dynamic_eplb: self.eplb_updator.forward_end() if not self.use_async_scheduling: - if need_dump: + if self.debugger is not None: assert self.debugger is not None self.debugger.stop() self.debugger.step() return model_runner_output - if need_dump: + if self.debugger is not None: assert self.debugger is not None self.debugger.stop() self.debugger.step()