[Fix] Refactor dummy attention metadata creation (#3497)
### What this PR does / why we need it?
The `force_attention` parameter is designed for FlashInfer kernel warmup; we don't actually need it on Ascend devices (at least for now), and it tends to make things more complicated. So we replace the `force_attention` parameter with `aclgraph_runtime_mode` in the attention metadata creation logic. This change makes the control flow more explicit by directly using the graph runtime mode to determine how to build attention metadata, rather than relying on an intermediate boolean flag. This simplification removes redundant logic and clarifies the conditions for building attention metadata in full decode graph mode.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
DP + `FULL_DECODE_ONLY` + online serving.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
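For readers skimming the diff below, the gist of the control-flow change can be sketched as follows. This is a simplified illustration, not the actual vllm-ascend code: `CUDAGraphMode` here is a stand-in for `vllm.config.CUDAGraphMode`, and `needs_full_decode_attn_metadata` is a hypothetical helper that only shows how the explicit runtime mode replaces the `force_attention` boolean.

```python
from enum import Enum
from typing import Optional


class CUDAGraphMode(Enum):
    # Stand-in for vllm.config.CUDAGraphMode; the real enum carries
    # more members and semantics than shown here.
    NONE = 0
    PIECEWISE = 1
    FULL = 2


def needs_full_decode_attn_metadata(
        aclgraph_runtime_mode: Optional[CUDAGraphMode]) -> bool:
    """Hypothetical helper: decide whether a dummy run must build
    attention metadata.

    Before this PR the caller passed an extra force_attention=True flag
    during graph warmup; after it, the runtime mode itself is the signal,
    so the condition is stated directly.
    """
    return aclgraph_runtime_mode == CUDAGraphMode.FULL


if __name__ == "__main__":
    # Warmup for a full decode graph builds metadata; eager runs skip it.
    assert needs_full_decode_attn_metadata(CUDAGraphMode.FULL)
    assert not needs_full_decode_attn_metadata(None)
```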
@@ -19,13 +19,13 @@
 
 import math
 import types
-from typing import Optional
+from typing import Any, Optional
 
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 import torch_npu
-from vllm.config import VllmConfig
+from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_dp_group
 from vllm.forward_context import get_forward_context
@@ -147,14 +147,21 @@ class NPUTorchairModelRunner(NPUModelRunner):
 
         return maybe_padded_num_tokens, num_tokens_across_dp, with_prefill, enable_dbo
 
-    def _build_attention_metadata(self, with_prefill, num_reqs, num_tokens,
-                                  max_query_len, force_attention):
+    def _build_dummy_attn_metadata(
+        self,
+        with_prefill: bool,
+        num_reqs: int,
+        num_tokens: int,
+        max_query_len: int,
+        aclgraph_runtime_mode: Optional[CUDAGraphMode] = None,
+        force_attention: bool = False,
+    ) -> Optional[dict[str, Any]]:
         # NOTE: If torchair graph mode and not with_prefill,
         # we can't skip_attn, it will cause graph recompile.
         if with_prefill or self.enable_shared_expert_dp:
-            attn_metadata = super()._build_attention_metadata(
+            attn_metadata = super()._build_dummy_attn_metadata(
                 with_prefill, num_reqs, num_tokens, max_query_len,
-                force_attention)
+                aclgraph_runtime_mode, force_attention)
         else:
             common_attn_metadata = TorchairCommonAttentionMetadata(
                 num_reqs=num_reqs,