[Bugfix] Reset incompatible config (#6005)

### What this PR does / why we need it?
This PR introduces compatibility fixes for running vLLM on Ascend NPU
hardware. GPU-specific (or otherwise unsupported) parameters are now
automatically detected and reset to Ascend-compatible values, with a
warning logged for each reset. The table below lists the affected
parameters and the values they are reset to.

| Module | Parameter | Reset Value (Ascend default) |
|--------|-----------|------------------------------|
| Model Config | `disable_cascade_attn` | `False` |
| Parallel Config | `all2all_backend` | `"allgather_reducescatter"` |
| Cache Config | `cpu_kvcache_space_bytes` | `None` |
| MultiModal Config | `mm_encoder_attn_backend` | `None` |
| Observability Config | `enable_layerwise_nvtx_tracing` | `False` |
| Scheduler Config | `max_num_partial_prefills` | `1` |
| Speculative Config | `quantization` | `None` |
| KV Transfer Config | `kv_buffer_size` | `1e9` |
| KV Transfer Config | `enable_permute_local_kv` | `False` |
| Attention Config | `use_prefill_decode_attention` | `False` |
| Attention Config | `use_cudnn_prefill` | `False` |
| Attention Config | `use_trtllm_ragged_deepseek_prefill` | `False` |
| Attention Config | `use_trtllm_attention` | `False` |
| Attention Config | `disable_flashinfer_prefill` | `False` |
| Attention Config | `disable_flashinfer_q_quantization` | `False` |
| Attention Config | `flash_attn_version` | `None` |
| Attention Config | `backend` | `None` |
| Attention Config | `flash_attn_max_num_splits_for_cuda_graph` | `32` |
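Each reset follows the same pattern: read the attribute defensively with `getattr`, log a warning, and write back the safe value. A minimal standalone sketch of that pattern (the `reset_if_set` helper below is illustrative only, not part of this diff):

```python
import logging

logger = logging.getLogger(__name__)


def reset_if_set(config, attr: str, safe_value, reason: str) -> None:
    """Illustrative helper mirroring this PR's pattern: if an
    incompatible attribute differs from its Ascend-safe value,
    warn and reset it in place."""
    if getattr(config, attr, safe_value) != safe_value:
        logger.warning("Parameter '%s' %s. Resetting to %r for Ascend.",
                       attr, reason, safe_value)
        setattr(config, attr, safe_value)
```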

### Does this PR introduce _any_ user-facing change?

Yes. If a user sets any of the parameters listed above, they are now
reset to Ascend-safe values at startup and a warning is logged.
### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main: 2c24bc6996

Signed-off-by: hfadzxy <starmoon_zhang@163.com>

@@ -171,6 +171,7 @@ class NPUPlatform(Platform):
    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        cls._fix_incompatible_config(vllm_config)
        # initialize ascend config from vllm additional_config
        ascend_config = init_ascend_config(vllm_config)
        if vllm_config.kv_transfer_config is not None:
@@ -620,3 +621,145 @@ class NPUPlatform(Platform):
"max_tokens_across_dp": max_tokens_across_dp,
"mc2_mask": mc2_mask,
}

    @staticmethod
    def _fix_incompatible_config(vllm_config: VllmConfig) -> None:
        """
        Check and correct parameters in VllmConfig that are incompatible
        with Ascend NPU. If GPU-specific or currently unsupported
        parameters are set by the user, log a warning and reset them to
        safe values.
        """
        # ==================== 1. Model Config ====================
        if vllm_config.model_config:
            # Cascade attention is a GPU feature; its disable flag has no
            # effect on Ascend.
            if getattr(vllm_config.model_config, "disable_cascade_attn",
                       False):
                logger.warning(
                    "Parameter '--disable-cascade-attn' is a GPU-specific "
                    "feature. Resetting to False for Ascend.")
                vllm_config.model_config.disable_cascade_attn = False
        # ==================== 2. Parallel Config ====================
        if vllm_config.parallel_config:
            # Only the default all2all backend is supported; others such
            # as deepep are not.
            default_backend = "allgather_reducescatter"
            current_backend = getattr(vllm_config.parallel_config,
                                      "all2all_backend", default_backend)
            if current_backend != default_backend:
                logger.warning(
                    "Parameter '--all2all-backend' is set to '%s', which "
                    "may be incompatible with Ascend. Resetting to '%s'; "
                    "Ascend uses its internal plugin mechanisms.",
                    current_backend, default_backend)
                vllm_config.parallel_config.all2all_backend = default_backend
        # ==================== 3. Cache Config ====================
        # cpu_kvcache_space_bytes is tied to the CPU backend.
        if vllm_config.cache_config and getattr(
                vllm_config.cache_config, "cpu_kvcache_space_bytes", None):
            logger.warning(
                "Parameter 'cpu_kvcache_space_bytes' is tied to the CPU "
                "backend. Resetting to None for Ascend.")
            vllm_config.cache_config.cpu_kvcache_space_bytes = None
        # ==================== 4. MultiModal Config ====================
        if (vllm_config.model_config
                and vllm_config.model_config.multimodal_config):
            mm_config = vllm_config.model_config.multimodal_config
            # Ascend uses a different mechanism for multi-modal attention.
            if getattr(mm_config, "mm_encoder_attn_backend",
                       None) is not None:
                logger.warning(
                    "Parameter '--mm-encoder-attn-backend' is set but "
                    "Ascend uses a plugin mechanism for multi-modal "
                    "attention. Resetting to None.")
                mm_config.mm_encoder_attn_backend = None
        # ==================== 5. Observability Config ====================
        if vllm_config.observability_config:
            # NVTX tracing is NVIDIA-specific.
            if getattr(vllm_config.observability_config,
                       "enable_layerwise_nvtx_tracing", False):
                logger.warning(
                    "Parameter '--enable-layerwise-nvtx-tracing' relies on "
                    "NVTX (NVIDIA Tools) and is not supported on Ascend. "
                    "Resetting to False.")
                vllm_config.observability_config.enable_layerwise_nvtx_tracing = False
        # ==================== 6. Scheduler Config ====================
        if vllm_config.scheduler_config:
            # Concurrent partial prefills are a ROCm-oriented optimization.
            if getattr(vllm_config.scheduler_config,
                       "max_num_partial_prefills", 1) != 1:
                logger.warning(
                    "Parameter '--max-num-partial-prefills' is optimized "
                    "for ROCm. Resetting to the default (1) for Ascend.")
                vllm_config.scheduler_config.max_num_partial_prefills = 1
        # ==================== 7. Speculative Config ====================
        if vllm_config.speculative_config:
            # Ascend inherits the main model's quantization automatically.
            if getattr(vllm_config.speculative_config, "quantization",
                       None) is not None:
                logger.warning(
                    "Speculative quantization is set but Ascend "
                    "automatically uses the main model's quantization "
                    "method. Resetting to None.")
                vllm_config.speculative_config.quantization = None
        # ==================== 8. KV Transfer Config ====================
        if vllm_config.kv_transfer_config:
            # The buffer size is primarily tied to NCCL (GPU) backends.
            current_buffer_size = getattr(vllm_config.kv_transfer_config,
                                          "kv_buffer_size", 1e9)
            if current_buffer_size != 1e9:
                logger.warning(
                    "Parameter 'kv_buffer_size' is optimized for NCCL and "
                    "may be incompatible with the current Ascend KV "
                    "transfer support. Resetting to the default (1e9).")
                vllm_config.kv_transfer_config.kv_buffer_size = 1e9
            # enable_permute_local_kv is tied to the NIXL backend.
            if getattr(vllm_config.kv_transfer_config,
                       "enable_permute_local_kv", False):
                logger.warning(
                    "Parameter 'enable_permute_local_kv' is tied to the "
                    "NIXL backend. Resetting to False for Ascend "
                    "stability.")
                vllm_config.kv_transfer_config.enable_permute_local_kv = False
        # ==================== 9. Attention Config ====================
        if vllm_config.attention_config:
            att_config = vllm_config.attention_config
            # Boolean flags that must be False on Ascend (typically
            # NVIDIA-specific features).
            force_false_flags = [
                "use_prefill_decode_attention",
                "use_cudnn_prefill",
                "use_trtllm_ragged_deepseek_prefill",
                "use_trtllm_attention",
                "disable_flashinfer_prefill",
                "disable_flashinfer_q_quantization",
            ]
            for flag in force_false_flags:
                if getattr(att_config, flag, False):
                    logger.warning(
                        "Ignored parameter '%s'. This is a GPU-specific "
                        "feature not supported on Ascend. Resetting to "
                        "False.", flag)
                    setattr(att_config, flag, False)
            # Reset flash_attn_version: Ascend uses its own attention
            # backend.
            if getattr(att_config, "flash_attn_version", None) is not None:
                logger.warning(
                    "Ignored parameter 'flash_attn_version'. Ascend uses "
                    "its own attention backend. Resetting to None.")
                att_config.flash_attn_version = None
            # The attention backend is managed by Ascend plugins.
            if getattr(att_config, "backend", None) is not None:
                logger.info(
                    "User specified attention backend '%s'. Note that "
                    "Ascend NPU will use its registered plugin backend "
                    "instead. Resetting to None.", att_config.backend)
                att_config.backend = None
            # CUDA-graph-specific split points are not applicable.
            if getattr(att_config,
                       "flash_attn_max_num_splits_for_cuda_graph", 32) != 32:
                logger.warning(
                    "Parameter 'flash_attn_max_num_splits_for_cuda_graph' "
                    "is ignored on Ascend. Resetting to the default (32).")
                att_config.flash_attn_max_num_splits_for_cuda_graph = 32
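
For a quick smoke check of these resets without a full engine, the method can be exercised on stand-in objects. A hedged sketch (`SimpleNamespace` replaces the real vLLM config classes, and the `NPUPlatform` import path is an assumption for illustration only):

```python
from types import SimpleNamespace

from vllm_ascend.platform import NPUPlatform  # assumed import path

# Stand-in for VllmConfig with only the fields the method touches.
fake_config = SimpleNamespace(
    model_config=None,
    parallel_config=None,
    cache_config=SimpleNamespace(cpu_kvcache_space_bytes=None),
    observability_config=None,
    scheduler_config=None,
    speculative_config=None,
    kv_transfer_config=None,
    attention_config=SimpleNamespace(
        use_prefill_decode_attention=False,
        use_cudnn_prefill=False,
        use_trtllm_ragged_deepseek_prefill=False,
        use_trtllm_attention=True,        # incompatible: should be reset
        disable_flashinfer_prefill=False,
        disable_flashinfer_q_quantization=False,
        flash_attn_version=2,             # incompatible: should be reset
        backend="FLASH_ATTN",             # incompatible: should be reset
        flash_attn_max_num_splits_for_cuda_graph=32,
    ),
)

NPUPlatform._fix_incompatible_config(fake_config)  # type: ignore[arg-type]
assert fake_config.attention_config.use_trtllm_attention is False
assert fake_config.attention_config.flash_attn_version is None
assert fake_config.attention_config.backend is None
```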