[Refactor] Rename cudagraph_support to aclgraph_support (#3104)
### What this PR does / why we need it?
Updates the `cudagraph_support` attribute to `aclgraph_support` to use
terminology appropriate for the Ascend platform (ACL graphs instead of
CUDA graphs).
This change also explicitly disables graph support for the MLA attention
backend.
### Does this PR introduce _any_ user-facing change?
None.
### How was this patch tested?
None needed.
- vLLM version: v0.10.2
- vLLM main: 5aeb925452
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
This commit is contained in:
@@ -199,8 +199,8 @@ class AscendMetadata:
 
 
 class AscendAttentionMetadataBuilder:
-    # Does this backend/builder support CUDA Graphs for attention (default: no).
-    cudagraph_support: ClassVar[AttentionCGSupport] = \
+    # Does this backend/builder support ACL Graphs for attention (default: no).
+    aclgraph_support: ClassVar[AttentionCGSupport] = \
         AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
     # Does this backend/builder reorder the batch?
     # If not, set this to None. Otherwise set it to the query
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Type, TypeVar
+from typing import (TYPE_CHECKING, ClassVar, NamedTuple, Optional, Tuple, Type,
+                    TypeVar)
 
 import torch
 import torch_npu
@@ -12,6 +13,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group
 from vllm.model_executor.layers.linear import (LinearBase,
                                                UnquantizedLinearMethod)
 from vllm.utils import cdiv, round_down
+from vllm.v1.attention.backends.utils import AttentionCGSupport
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
@@ -163,6 +165,9 @@ M = TypeVar("M", bound=AscendMLAMetadata)
 
 
 class AscendMLAMetadataBuilder:
+    # Does this backend/builder support ACL Graphs for attention (default: no).
+    aclgraph_support: ClassVar[AttentionCGSupport] = \
+        AttentionCGSupport.NEVER
     """
     NOTE: Please read the comment at the top of the file before trying to
     understand this class
@@ -3259,8 +3259,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
                 builder = attn_group.metadata_builder
             else:
                 builder = attn_group.get_metadata_builder()
-            if builder.cudagraph_support.value < min_ag_support.value:
-                min_ag_support = builder.cudagraph_support
+            if builder.aclgraph_support.value < min_ag_support.value:
+                min_ag_support = builder.aclgraph_support
                 min_ag_builder_name = builder.__class__.__name__
 
 # This is an imitation of compilation_config.splitting_ops_contain_attention()
Reference in New Issue
Block a user