[Misc][V0 Deprecation] Remove V0 Attention (#1835)
### What this PR does / why we need it?
This PR is part of the V0 deprecation effort tracked in
https://github.com/vllm-project/vllm-ascend/issues/1620; it removes the V0 attention implementation.
- vLLM version: v0.9.2
- vLLM main: 8dfb45ca33
Signed-off-by: shen-shanshan <467638484@qq.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -15,7 +15,6 @@ from vllm.model_executor.layers.linear import (LinearBase,
|
||||
from vllm.utils import cdiv, round_down
|
||||
|
||||
from vllm_ascend.ascend_config import get_ascend_config
|
||||
from vllm_ascend.attention.attention import _ALLOWED_NUM_QUERIES_PER_KV
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
|
||||
from vllm_ascend.multistream.context import get_multistream_comm_context
|
||||
@@ -27,6 +26,8 @@ from vllm_ascend.worker.npu_input_batch import InputBatch
|
||||
if TYPE_CHECKING:
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
|
||||
_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128]
|
||||
|
||||
|
||||
@dataclass
|
||||
class CommonAttentionMetadata:
|
||||
|
||||
Reference in New Issue
Block a user