[BugFix] Fix incorrect get_current_vllm_config (#5121)

### What this PR does / why we need it?
This PR fixes several incorrect `get_current_vllm_config` call sites. When called outside a `set_current_vllm_config` context, that helper falls back to constructing a fresh, empty `VllmConfig` instead of returning the engine's real config. The affected helpers now take `vllm_config` as an explicit parameter, and the attention backend captures it once in `__init__`, which runs while the config is set.
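A minimal sketch of the failure mode, assuming vLLM's fallback behavior of returning a default-constructed config when no config is set (as in the v0.12.0-era `vllm.config`; details may vary by version):

```python
from vllm.config import (VllmConfig, get_current_vllm_config,
                         set_current_vllm_config)

config = VllmConfig()  # stands in for the engine's real, fully-populated config

with set_current_vllm_config(config):
    # Inside the context (e.g. during model construction) the helper
    # returns the config that was actually set.
    assert get_current_vllm_config() is config

# Outside the context it logs a warning and returns a brand-new default
# VllmConfig, so fields such as speculative_config silently look unset.
assert get_current_vllm_config() is not config
```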

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: Angazenn <supperccell@163.com>
Commit 632eab28b7 (parent fd9a47c04d), authored by Angazenn on 2025-12-18 22:21:36 +08:00, committed by GitHub.
6 changed files with 12 additions and 15 deletions.

@@ -154,8 +154,7 @@ class AscendConfig:
         # npu_fused_infer_attention_score in some cases. We allow to execute
         # _npu_paged_attention in this cases. This should be removed once
         # npu_fused_infer_attention_score performs better on all scenarios.
-        self.pa_shape_list = additional_config.get("pa_shape_list",
-                                                   [1, 2, 3, 4])
+        self.pa_shape_list = additional_config.get("pa_shape_list", [])
         kv_cfg = vllm_config.kv_transfer_config
         if kv_cfg is not None and not getattr(kv_cfg, "_engine_id_patched",

@@ -367,6 +367,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
         kv_sharing_target_layer_name: Optional[str],
         **kwargs,
     ) -> None:
+        self.vllm_config = get_current_vllm_config()
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
@@ -723,7 +724,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
     ):
         num_tokens = query.shape[0]
         if (attn_metadata.attn_state == AscendAttentionState.DecodeOnly
-                and using_paged_attention(num_tokens)
+                and using_paged_attention(num_tokens, self.vllm_config)
                 and self.sliding_window is None):
             output = self.forward_paged_attention(query, attn_metadata, output)
         else:
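The capture-at-init pattern used above, as a hedged sketch (the class and the helper stub below are hypothetical stand-ins, not the PR's code): `__init__` executes during model construction, under `set_current_vllm_config`, so the config captured there is the real one and can be reused safely on the forward path, where a fresh `get_current_vllm_config()` call would hit the empty-config fallback.

```python
from vllm.config import VllmConfig, get_current_vllm_config

def using_paged_attention(runtime_shape: int, vllm_config: VllmConfig) -> bool:
    # Stub of the vllm_ascend helper; the real check also inspects device type.
    return vllm_config.speculative_config is None

class AttentionImplSketch:
    def __init__(self) -> None:
        # Constructed under set_current_vllm_config, so this is the real config.
        self.vllm_config = get_current_vllm_config()

    def forward(self, num_tokens: int) -> bool:
        # Reuse the captured config; calling get_current_vllm_config() here
        # would run outside the context and return an empty config.
        return using_paged_attention(num_tokens, self.vllm_config)
```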

@@ -1,10 +1,9 @@
 from dataclasses import dataclass
 from functools import lru_cache
 from typing import Any, List, Optional

 import torch
 import torch.nn.functional as F
-from vllm.config import get_current_vllm_config
+from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                           has_kv_transfer_group,
                                           is_v1_kv_transfer_group)
@@ -14,9 +13,7 @@ from vllm_ascend.utils import (AscendDeviceType, get_ascend_config,
                                get_ascend_device_type)


 @lru_cache
-def using_paged_attention(runtime_shape: int) -> bool:
-    vllm_config = get_current_vllm_config()
+def using_paged_attention(runtime_shape: int, vllm_config: VllmConfig) -> bool:
     if vllm_config.speculative_config is not None:
         return False
     if get_ascend_device_type() == AscendDeviceType.A5:
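A note on the signature change: because the `@lru_cache` decorator stays in place, results are now memoized per `(runtime_shape, vllm_config)` pair rather than per shape alone. This assumes `VllmConfig` instances are hashable and that a process normally holds a single config object, in which case the cache behaves as before while no longer reading global state.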

@@ -296,8 +296,9 @@ def _update_attn_fia_params(update_stream, forward_context, runtime_shape):
     event.record(update_stream)


-def update_attn_params(update_stream, forward_context, runtime_shape):
-    if using_paged_attention(runtime_shape):
+def update_attn_params(update_stream, forward_context, runtime_shape,
+                       vllm_config):
+    if using_paged_attention(runtime_shape, vllm_config):
         _update_attn_pa_params(update_stream, forward_context, runtime_shape)
     else:
         _update_attn_fia_params(update_stream, forward_context, runtime_shape)

@@ -23,8 +23,7 @@ from torch._inductor.pattern_matcher import (PatternMatcherPass,
                                              PatternPrettyPrinter)

 from vllm.attention.layer import Attention
 from vllm.compilation.vllm_inductor_pass import VllmInductorPass
-from vllm.config import (VllmConfig, get_current_vllm_config,
-                         get_layers_from_vllm_config)
+from vllm.config import VllmConfig, get_layers_from_vllm_config
@@ -42,7 +41,6 @@ class QKNormRopeFusionPattern:
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.eps = eps
-        vllm_config = get_current_vllm_config()
         self.device = vllm_config.device_config.device if vllm_config.device_config else None

     def get_inputs(self):

@@ -1165,7 +1165,8 @@ class NPUModelRunner(GPUModelRunner):
                                    maybe_padded_num_tokens)
         else:
             update_attn_params(self.update_stream, forward_context,
-                               maybe_padded_num_tokens)
+                               maybe_padded_num_tokens,
+                               self.vllm_config)

         if get_forward_context().sp_enabled and not isinstance(
                 hidden_states, IntermediateTensors):
@@ -1957,7 +1958,7 @@ class NPUModelRunner(GPUModelRunner):
                                    positions.shape[0])
         else:
             update_attn_params(self.update_stream, forward_context,
-                               num_tokens)
+                               num_tokens, self.vllm_config)

         if self.drafter and self.drafter.name == SpecDcodeType.EAGLE3:
             hidden_states, _ = hidden_states