[BugFix] Fix incorrect get_current_vllm_config (#5121)
### What this PR does / why we need it?
This PR fixes several incorrect `get_current_vllm_config` calls. When invoked outside a
`set_current_vllm_config` context (for example at forward time), the helper does not see the
engine's real config and instead returns a newly created, empty `vllm_config`. The affected
call sites now capture the config once at construction time or receive it explicitly as a
parameter.
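For context, a minimal sketch of the failure mode, using vLLM's public
`get_current_vllm_config`/`set_current_vllm_config` API (behavior summarized from the vLLM
source; worth re-checking against the pinned version below):

```python
from vllm.config import (VllmConfig, get_current_vllm_config,
                         set_current_vllm_config)

# Outside any set_current_vllm_config(...) scope there is no "current"
# config, so vLLM logs a warning and falls back to a freshly constructed
# default VllmConfig() -- the "empty" config this PR guards against.
fallback = get_current_vllm_config()
print(fallback.speculative_config)  # None, whatever the real engine config is

# Inside the context manager, the real config is returned.
real = VllmConfig()
with set_current_vllm_config(real):
    assert get_current_vllm_config() is real
```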
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: Angazenn <supperccell@163.com>

@@ -154,8 +154,7 @@ class AscendConfig:
         # npu_fused_infer_attention_score in some cases. We allow to execute
         # _npu_paged_attention in this cases. This should be removed once
         # npu_fused_infer_attention_score performs better on all scenarios.
-        self.pa_shape_list = additional_config.get("pa_shape_list",
-                                                   [1, 2, 3, 4])
+        self.pa_shape_list = additional_config.get("pa_shape_list", [])
 
         kv_cfg = vllm_config.kv_transfer_config
         if kv_cfg is not None and not getattr(kv_cfg, "_engine_id_patched",

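The default for `pa_shape_list` also changes from `[1, 2, 3, 4]` to `[]`, so paged attention is
no longer force-enabled for those decode shapes out of the box. A hedged sketch of opting back
in via `additional_config` (the key name comes from this diff; the model name is just an
example):

```python
from vllm import LLM

# Hypothetical opt-in: restore the previous shape list explicitly instead
# of relying on the removed default.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # example model
    additional_config={"pa_shape_list": [1, 2, 3, 4]},
)
```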
@@ -367,6 +367,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
         kv_sharing_target_layer_name: Optional[str],
         **kwargs,
     ) -> None:
+        self.vllm_config = get_current_vllm_config()
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)

@@ -723,7 +724,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
     ):
         num_tokens = query.shape[0]
         if (attn_metadata.attn_state == AscendAttentionState.DecodeOnly
-                and using_paged_attention(num_tokens)
+                and using_paged_attention(num_tokens, self.vllm_config)
                 and self.sliding_window is None):
             output = self.forward_paged_attention(query, attn_metadata, output)
         else:

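The backend now looks the config up once in `__init__` and reuses it at forward time. A minimal
sketch of why that ordering is safe (hypothetical class, not the real
`AscendAttentionBackendImpl`): layers are constructed while the engine holds the config context
open, whereas `forward` may run outside it:

```python
from vllm.config import VllmConfig, get_current_vllm_config

class ImplSketch:
    def __init__(self) -> None:
        # Safe: layer construction runs under set_current_vllm_config(),
        # so this returns the engine's real config.
        self.vllm_config: VllmConfig = get_current_vllm_config()

    def forward(self) -> bool:
        # Reuse the captured config; calling get_current_vllm_config()
        # here instead could return the empty fallback.
        return self.vllm_config.speculative_config is None
```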
@@ -1,10 +1,9 @@
 from dataclasses import dataclass
-from functools import lru_cache
 from typing import Any, List, Optional
 
 import torch
 import torch.nn.functional as F
-from vllm.config import get_current_vllm_config
+from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                           has_kv_transfer_group,
                                           is_v1_kv_transfer_group)

@@ -14,9 +13,7 @@ from vllm_ascend.utils import (AscendDeviceType, get_ascend_config,
                                get_ascend_device_type)
 
 
-@lru_cache
-def using_paged_attention(runtime_shape: int) -> bool:
-    vllm_config = get_current_vllm_config()
+def using_paged_attention(runtime_shape: int, vllm_config: VllmConfig) -> bool:
     if vllm_config.speculative_config is not None:
         return False
     if get_ascend_device_type() == AscendDeviceType.A5:

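Note that `@lru_cache` is dropped together with the implicit lookup. One plausible reason, not
stated in the PR: `lru_cache` needs hashable arguments, and memoizing on `runtime_shape` alone
was only sound while the config was implicitly global. A quick sketch of the failure with an
unhashable config-like argument:

```python
from functools import lru_cache

@lru_cache
def check(shape: int, cfg: dict) -> bool:
    return bool(cfg)

try:
    check(1, {"speculative_config": None})
except TypeError as err:
    print(err)  # unhashable type: 'dict'
```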
@@ -296,8 +296,9 @@ def _update_attn_fia_params(update_stream, forward_context, runtime_shape):
     event.record(update_stream)
 
 
-def update_attn_params(update_stream, forward_context, runtime_shape):
-    if using_paged_attention(runtime_shape):
+def update_attn_params(update_stream, forward_context, runtime_shape,
+                       vllm_config):
+    if using_paged_attention(runtime_shape, vllm_config):
         _update_attn_pa_params(update_stream, forward_context, runtime_shape)
     else:
         _update_attn_fia_params(update_stream, forward_context, runtime_shape)

@@ -23,8 +23,7 @@ from torch._inductor.pattern_matcher import (PatternMatcherPass,
                                              PatternPrettyPrinter)
 from vllm.attention.layer import Attention
 from vllm.compilation.vllm_inductor_pass import VllmInductorPass
-from vllm.config import (VllmConfig, get_current_vllm_config,
-                         get_layers_from_vllm_config)
+from vllm.config import VllmConfig, get_layers_from_vllm_config
 
 
 class QKNormRopeFusionPattern:

@@ -42,7 +41,6 @@ class QKNormRopeFusionPattern:
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.eps = eps
-        vllm_config = get_current_vllm_config()
         self.device = vllm_config.device_config.device if vllm_config.device_config else None
 
     def get_inputs(self):

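Here the pattern class stops re-fetching the config and relies on the `vllm_config` already
available in its constructor. As a general sketch (hypothetical class), injecting the config
beats a global lookup: it keeps the class testable and immune to the empty-fallback pitfall:

```python
from vllm.config import VllmConfig

class PatternSketch:
    def __init__(self, vllm_config: VllmConfig) -> None:
        # Injected config: no global state, no fallback surprises.
        self.device = (vllm_config.device_config.device
                       if vllm_config.device_config else None)
```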
@@ -1165,7 +1165,8 @@ class NPUModelRunner(GPUModelRunner):
                                    maybe_padded_num_tokens)
         else:
             update_attn_params(self.update_stream, forward_context,
-                               maybe_padded_num_tokens)
+                               maybe_padded_num_tokens,
+                               self.vllm_config)
 
         if get_forward_context().sp_enabled and not isinstance(
                 hidden_states, IntermediateTensors):

@@ -1957,7 +1958,7 @@ class NPUModelRunner(GPUModelRunner):
                                    positions.shape[0])
         else:
             update_attn_params(self.update_stream, forward_context,
-                               num_tokens)
+                               num_tokens, self.vllm_config)
 
         if self.drafter and self.drafter.name == SpecDcodeType.EAGLE3:
             hidden_states, _ = hidden_states