xc-llm-ascend/vllm_ascend/attention/utils.py

import functools
from dataclasses import dataclass
from typing import Any, List, Optional

import torch
import torch_npu
from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                          has_kv_transfer_group,
                                          is_v1_kv_transfer_group)
from vllm.forward_context import ForwardContext, get_forward_context


@dataclass
class AscendCommonAttentionMetadata:
    """
    Per-batch attention metadata, shared across layers and backends.
    AttentionMetadataBuilder instances use it to construct per-layer metadata.

    For several tensors both a device-side copy and a CPU-side copy (the
    ``*_cpu`` fields) are kept.

    Fields declared without a default are required. Fields that default to
    ``None`` are optional and are annotated ``Optional[...]`` accordingly.
    """

    query_start_loc: torch.Tensor
    query_start_loc_cpu: torch.Tensor
    """(batch_size + 1,), the start location of each request in query Tensor
    (describes both query_start_loc and query_start_loc_cpu)"""

    seq_lens_cpu: torch.Tensor
    """(batch_size,), the length of each request including both computed tokens
    and newly scheduled tokens"""

    seq_lens: torch.Tensor
    """Same as seq_lens_cpu, kept for compatibility with some newer attention
    metadata (such as GDN)."""

    num_computed_tokens_cpu: torch.Tensor
    """(batch_size,), the number of computed tokens for each request"""

    num_reqs: int
    """Number of requests"""
    num_actual_tokens: int
    """Total number of tokens in batch"""

    max_query_len: int
    """Max token number of request in batch"""

    decode_token_per_req: int
    """Decode token number per request"""

    # NOTE(review): layout of the next three fields is not visible in this
    # file -- confirm against the metadata builders that consume them.
    block_table_tensor: torch.Tensor

    slot_mapping: torch.Tensor

    actual_seq_lengths_q: list[int]

    # Optional tensors: None means "not provided" for this batch.
    positions: Optional[torch.Tensor] = None

    attn_mask: Optional[torch.Tensor] = None

    spec_attn_mask: Optional[torch.Tensor] = None

    # Backend-specific attention state; typed Any because the concrete type
    # is not declared in this module.
    attn_state: Any = None

    enable_dbo_across_dp: bool = False

    is_only_prefill: bool = False

    # NOTE(review): -1 presumably means "no graph padding" -- confirm with
    # the code that reads this field.
    graph_pad_size: int = -1

    # NOTE: This is a temporary solution for rotary embedding in MLA
    cos: Optional[torch.Tensor] = None
    sin: Optional[torch.Tensor] = None


def split_decodes_and_prefills(
    common_attn_metadata: AscendCommonAttentionMetadata,
    decode_threshold: int = 1,
) -> tuple[int, int, int, int]:
    """
    Locate the decode/prefill boundary of a batch that has already been
    reordered so that every decode request precedes every prefill request.

    A request is treated as a decode when its query length (the difference
    between consecutive entries of ``query_start_loc_cpu``) is at most
    ``decode_threshold``; anything longer is a prefill.

    Args:
        common_attn_metadata: Batch metadata. Only ``max_query_len``,
            ``num_reqs``, ``num_actual_tokens`` and ``query_start_loc_cpu``
            are read.
        decode_threshold: Largest query length still counted as a decode.

    Returns:
        A tuple ``(num_decodes, num_prefills, num_decode_tokens,
        num_prefill_tokens)``.

    Raises:
        AssertionError: If the batch is not ordered decodes-first.
    """
    starts = common_attn_metadata.query_start_loc_cpu
    total_reqs = common_attn_metadata.num_reqs
    total_tokens = common_attn_metadata.num_actual_tokens
    longest_query = common_attn_metadata.max_query_len

    # Cheap exit: the longest query already qualifies as a decode, so every
    # request does.
    if longest_query <= decode_threshold:
        return total_reqs, 0, total_tokens, 0

    lens = starts[1:] - starts[:-1]
    prefill_mask = lens > decode_threshold
    # max_query_len may over-estimate; fall back to the per-request lengths.
    if not torch.any(prefill_mask):
        return total_reqs, 0, total_tokens, 0

    # argmax over the 0/1 mask yields the index of the first prefill request.
    boundary = prefill_mask.int().argmax(dim=-1).item()

    # Sanity-check the "decodes first, prefills last" ordering.
    assert torch.all(lens[boundary:] > decode_threshold)
    assert torch.all(lens[:boundary] <= decode_threshold)

    # Tokens before the first prefill's start offset all belong to decodes.
    decode_tokens = starts[boundary].item()
    return (
        boundary,
        total_reqs - boundary,
        decode_tokens,
        total_tokens - decode_tokens,
    )


def wait_for_kv_layer_from_connector(layer_name: str):
    """
    Ask the KV-transfer connector to wait until ``layer_name`` is loaded.

    Does nothing when no KV-transfer group exists, when
    ``is_v1_kv_transfer_group()`` is false, or when the current forward
    context carries no attention metadata.

    Args:
        layer_name: Name of the layer, forwarded to
            ``connector.wait_for_layer_load``.
    """
    if not (has_kv_transfer_group() and is_v1_kv_transfer_group()):
        return

    kv_connector = get_kv_transfer_group()

    ctx: ForwardContext = get_forward_context()
    # TODO: assert ascendMetadata
    if ctx.attn_metadata is not None:
        kv_connector.wait_for_layer_load(layer_name)


def maybe_save_kv_layer_to_connector(
    layer_name: str,
    kv_cache_layer: List[torch.Tensor],
):
    """
    Hand one layer's KV cache to the KV-transfer connector, if applicable.

    Does nothing when no KV-transfer group exists, when
    ``is_v1_kv_transfer_group()`` is false, or when the current forward
    context carries no attention metadata.

    Args:
        layer_name: Name of the layer being saved.
        kv_cache_layer: KV cache tensors of that layer; passed through to
            ``connector.save_kv_layer`` together with the attention metadata
            from the current forward context.
    """
    if not (has_kv_transfer_group() and is_v1_kv_transfer_group()):
        return

    kv_connector = get_kv_transfer_group()

    ctx: ForwardContext = get_forward_context()
    metadata = ctx.attn_metadata
    # TODO: assert ascendMetadata
    if metadata is not None:
        kv_connector.save_kv_layer(layer_name, kv_cache_layer, metadata)


@functools.cache
def version_check():
    """
    Report whether the installed torch_npu is a dev build dated 2025-09-19
    or later.

    The version string is searched for ``dev`` followed by eight digits, and
    those digits are compared as a string against ``"20250919"``. Because
    both sides are eight-digit strings, string order matches date order.
    The result is memoized by ``functools.cache``.

    Returns:
        True if a ``devYYYYMMDD`` tag is present and is >= ``20250919``,
        otherwise False.
    """
    # NOTE(review): a version string without a ``dev`` tag always yields
    # False here -- confirm that is intended for non-dev builds.
    import re

    found = re.search(r'dev(\d{8})', torch_npu.version.__version__)
    return found is not None and found.group(1) >= "20250919"
fix pagedattention to support fullgraph. (#3436) ### What this PR does / why we need it? Calculate in advance the workspace memory size needed for the PagedAttention operator to avoid deadlocks during resource cleanup. This PR requires torch_npu version 0920 or newer. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> 2025-10-14 16:10:09 +08:00			`import functools`
Fix some ci issue and refactor modelrunner (#2445) ### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4d9c61993ac4209c97b3afef237b2387f2cd9b97 --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com> 2025-08-20 09:01:04 +08:00			`from dataclasses import dataclass`
[Feature]cpu offload connector (#1659) This PR implements cpu offload connector to enable NPU kv cache offload to host DRAM. - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/5aeb9254521023f97aca292b3478aa7ff485ffb2 Signed-off-by: lidenghui <lidenghui1110@gmail.com> Signed-off-by: AlvisGong <gwly0401@163.com> Signed-off-by: CalvinXKY <kyxiezju@163.com> Co-authored-by: AlvisGong <gwly0401@163.com> 2025-09-23 14:25:05 +08:00			`from typing import Any, List`
Fix some ci issue and refactor modelrunner (#2445) ### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4d9c61993ac4209c97b3afef237b2387f2cd9b97 --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com> 2025-08-20 09:01:04 +08:00
			`import torch`
fix pagedattention to support fullgraph. (#3436) ### What this PR does / why we need it? Calculate in advance the workspace memory size needed for the PagedAttention operator to avoid deadlocks during resource cleanup. This PR requires torch_npu version 0920 or newer. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> 2025-10-14 16:10:09 +08:00			`import torch_npu`
[Feature]cpu offload connector (#1659) This PR implements cpu offload connector to enable NPU kv cache offload to host DRAM. - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/5aeb9254521023f97aca292b3478aa7ff485ffb2 Signed-off-by: lidenghui <lidenghui1110@gmail.com> Signed-off-by: AlvisGong <gwly0401@163.com> Signed-off-by: CalvinXKY <kyxiezju@163.com> Co-authored-by: AlvisGong <gwly0401@163.com> 2025-09-23 14:25:05 +08:00			`from vllm.distributed.kv_transfer import (get_kv_transfer_group,`
			`has_kv_transfer_group,`
			`is_v1_kv_transfer_group)`
			`from vllm.forward_context import ForwardContext, get_forward_context`
Fix some ci issue and refactor modelrunner (#2445) ### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4d9c61993ac4209c97b3afef237b2387f2cd9b97 --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com> 2025-08-20 09:01:04 +08:00

			`@dataclass`
			`class AscendCommonAttentionMetadata:`
			`"""`
			`Per-batch attention metadata, shared across layers and backends.`
			`AttentionMetadataBuilder instances use it to construct per-layer metadata.`

			`For many of the tensors we keep both GPU and CPU versions.`
			`"""`

			`query_start_loc: torch.Tensor`
			`query_start_loc_cpu: torch.Tensor`
			`"""(batch_size + 1,), the start location of each request in query Tensor"""`

			`seq_lens_cpu: torch.Tensor`
			`"""(batch_size,), the length of each request including both computed tokens`
			`and newly scheduled tokens"""`

[New model] Qwen3-next support (#2917) ### What this PR does / why we need it? Add Qwen3-next support. ### Does this PR introduce _any_ user-facing change? Yes, users can use Qwen3 next. Related doc: https://github.com/vllm-project/vllm-ascend/pull/2916 the tutorial will be ready in [here](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_npu_qwen3_next.html) ### How was this patch tested? Doc CI passed Related: https://github.com/vllm-project/vllm-ascend/issues/2884 Co-Authored-By: Angazenn <supperccell@163.com> Co-Authored-By: zzzzwwjj <1183291235@qq.com> Co-Authored-By: MengqingCao <cmq0113@163.com> Co-Authored-By: linfeng-yuan <1102311262@qq.com> Co-Authored-By: hust17yixuan <303660421@qq.com> Co-Authored-By: SunnyLee219 <3294305115@qq.com> Co-Authored-By: maoxx241 <maoxx241@umn.edu> - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/b834b4cbf1d5094affdf231df2be86920610d83e --------- Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: Angazenn <supperccell@163.com> Signed-off-by: Your Name <you@example.com> Signed-off-by: zzzzwwjj <1183291235@qq.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: hust17yixuan <303660421@qq.com> Co-authored-by: MengqingCao <cmq0113@163.com> Co-authored-by: Angazenn <supperccell@163.com> Co-authored-by: Your Name <you@example.com> Co-authored-by: zzzzwwjj <1183291235@qq.com> Co-authored-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: hust17yixuan <303660421@qq.com> 2025-09-16 01:17:42 +08:00			`seq_lens: torch.Tensor`
			`"""same to seq_lens_cpu, for compatibility with some new attn metadata`
			`(such as GDN)."""`

			`num_computed_tokens_cpu: torch.Tensor`
			`"""(batch_size,), the number of computed tokens for each request"""`

Fix some ci issue and refactor modelrunner (#2445) ### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4d9c61993ac4209c97b3afef237b2387f2cd9b97 --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com> 2025-08-20 09:01:04 +08:00			`num_reqs: int`
			`"""Number of requests"""`
			`num_actual_tokens: int`
			`"""Total number of tokens in batch"""`

			`max_query_len: int`
			`"""Max token number of request in batch"""`

			`decode_token_per_req: int`
			`"""decode token number per request"""`

			`block_table_tensor: torch.Tensor`

[Feat][Graph] Support `FULL_DECODE_ONLY` mode for GQA/MHA models (#2128) Note: This depends on [vLLM #25161](https://github.com/vllm-project/vllm/pull/25161) and the torch\_npu release from September 30. ### What this PR does / why we need it? This pull request adds `FULL_DECODE_ONLY` mode for GQA/MHA models (MLA models like DeepSeek V3/R1 are not included). Key improvements include: * Reduced dispatch latency: By replaying the entire model execution graph at once, we cut overhead compared with multiple smaller replays. * Stabilized multi-device performance: Captureing the whole model as one static graph also mitigates the dispatch fluctuations across devices. * Stream/resource savings: Consolidating graph captures frees up streams, allowing more graphs to be captured. Known issues: 1. `_npu_paged_attention` currently manages its own workspace in `torch_npu`, which can deadlock when synchronizing during graph replay — we’re working on a fix. There may be other corner cases. This PR is the first in a planned series; we’ll continue to iterate and address remaining issues in follow-ups. This is essentially a port of #1503 and #1677, but includes two major changes: 1. Let `graph_dispatcher` decide the graph mode instead of hard-coding it in the backend, which decouples Full Graph and Piecewise Graph and could make it possible to remove dynamo. 2. Adapt to the new `attn_group` logic, but leave a small hack in `update_graph_params`; multi-attention models may or may not be fully supported yet. ### Does this PR introduce _any_ user-facing change? ```python compilation_config={ "cudagraph_mode": "FULL_DECODE_ONLY", }, ``` ### How was this patch tested? Tests included. - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/9607d5eb449711b349d4c2bee0a9c94afcc7ed14 --------- Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com> 2025-09-22 17:14:28 +08:00			`slot_mapping: torch.Tensor`
Fix some ci issue and refactor modelrunner (#2445) ### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4d9c61993ac4209c97b3afef237b2387f2cd9b97 --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com> 2025-08-20 09:01:04 +08:00
			`actual_seq_lengths_q: list[int]`

			`positions: torch.Tensor = None`

			`attn_mask: torch.Tensor = None`

			`spec_attn_mask: torch.Tensor = None`

			`attn_state: Any = None`

			`enable_dbo_across_dp: bool = False`

			`is_only_prefill: bool = False`

			`graph_pad_size: int = -1`

[Feat][Graph]Support FULL_DECEDE_ONLY mode for MLA models (#3125) ### What this PR does / why we need it? Adds support for capturing the Multi-Layer Attention (MLA) decode operation into an ACL graph. This improves performance by compiling the attention kernel for single-token decoding. Key changes include: - Implementing the graph capture logic for the MLA kernel, including workspace management and parameter updates. - Modifying the rotary embedding (RoPE) handling to use pre-allocated tensors, which is a requirement for graph capture. - Adding a `build_for_graph_capture` method to the MLA metadata builder to create dummy metadata during the graph compilation phase. Known issues: - Currently, MTP is not supported in FULL_DECEDE_ONLY mode -- we're working on a fix - We are preparing to remove update_mla_attn_params with auto_dispatch_capture ### Does this PR introduce _any_ user-facing change? compilation_config={ "cudagraph_mode": "FULL_DECODE_ONLY", }, ### How was this patch tested? - vLLM version: v0.11.0 --------- Signed-off-by: panchao-hub <315134829@qq.com> Signed-off-by: p00465316 <panchao13@huawei.com> Co-authored-by: p00465316 <panchao13@huawei.com> Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com> 2025-10-10 16:31:20 +08:00			`# NOTE: This is a temporary solution for rotary embedding in MLA`
			`cos: torch.Tensor = None`
			`sin: torch.Tensor = None`

Fix some ci issue and refactor modelrunner (#2445) ### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4d9c61993ac4209c97b3afef237b2387f2cd9b97 --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com> 2025-08-20 09:01:04 +08:00
			`def split_decodes_and_prefills(`
			`common_attn_metadata: AscendCommonAttentionMetadata,`
			`decode_threshold: int = 1,`
			`) -> tuple[int, int, int, int]:`
			`"""`
			`Assuming a reordered batch, finds the boundary between prefill and decode`
			`requests.`

			`Args:`
			`common_attn_metadata: AscendCommonAttentionMetadata object containing the`
			`batch metadata.`
			`decode_threshold: The maximum query length to be considered a decode.`

			`Returns:`
			`num_decodes: The number of decode requests.`
			`num_prefills: The number of prefill requests.`
			`num_decode_tokens: The number of tokens in the decode requests.`
			`num_prefill_tokens: The number of tokens in the prefill requests.`
			`"""`
			`max_query_len = common_attn_metadata.max_query_len`
			`num_reqs = common_attn_metadata.num_reqs`
			`num_tokens = common_attn_metadata.num_actual_tokens`
			`query_start_loc = common_attn_metadata.query_start_loc_cpu`

			`if max_query_len <= decode_threshold:`
			`return num_reqs, 0, num_tokens, 0`

			`query_lens = query_start_loc[1:] - query_start_loc[:-1]`
			`is_prefill = query_lens > decode_threshold`
			`if not torch.any(is_prefill):`
			`return num_reqs, 0, num_tokens, 0`

			`first_prefill = is_prefill.int().argmax(dim=-1).item()`
bugfix for mtp (#3300) ### What this PR does / why we need it? when mtp>1, we need to refresh cos and sin in each step. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? - vLLM version: v0.11.0 Signed-off-by: zouyida2052 <zouyida2002@gmail.com> 2025-10-09 19:22:46 +08:00			`assert torch.all(query_lens[first_prefill:] > decode_threshold)`
Fix some ci issue and refactor modelrunner (#2445) ### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4d9c61993ac4209c97b3afef237b2387f2cd9b97 --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com> 2025-08-20 09:01:04 +08:00			`assert torch.all(query_lens[:first_prefill] <= decode_threshold)`
			`num_decodes = first_prefill`
			`num_prefills = num_reqs - num_decodes`
			`num_decode_tokens = query_start_loc[first_prefill].item()`
			`num_prefill_tokens = num_tokens - num_decode_tokens`
			`return (num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens)`
[Feature]cpu offload connector (#1659) This PR implements cpu offload connector to enable NPU kv cache offload to host DRAM. - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/5aeb9254521023f97aca292b3478aa7ff485ffb2 Signed-off-by: lidenghui <lidenghui1110@gmail.com> Signed-off-by: AlvisGong <gwly0401@163.com> Signed-off-by: CalvinXKY <kyxiezju@163.com> Co-authored-by: AlvisGong <gwly0401@163.com> 2025-09-23 14:25:05 +08:00

			`def wait_for_kv_layer_from_connector(layer_name: str):`
			`if not has_kv_transfer_group() or not is_v1_kv_transfer_group():`
			`return`

			`connector = get_kv_transfer_group()`

			`forward_context: ForwardContext = get_forward_context()`
			`attn_metadata = forward_context.attn_metadata`
			`if attn_metadata is None:`
			`return`
			`# TODO: assert ascendMetadata`
			`connector.wait_for_layer_load(layer_name)`


			`def maybe_save_kv_layer_to_connector(`
			`layer_name: str,`
			`kv_cache_layer: List[torch.Tensor],`
			`):`
			`if not has_kv_transfer_group() or not is_v1_kv_transfer_group():`
			`return`

			`connector = get_kv_transfer_group()`

			`forward_context: ForwardContext = get_forward_context()`
			`attn_metadata = forward_context.attn_metadata`
			`if attn_metadata is None:`
			`return`
			`# TODO: assert ascendMetadata`
			`connector.save_kv_layer(layer_name, kv_cache_layer, attn_metadata)`
fix pagedattention to support fullgraph. (#3436) ### What this PR does / why we need it? Calculate in advance the workspace memory size needed for the PagedAttention operator to avoid deadlocks during resource cleanup. This PR requires torch_npu version 0920 or newer. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> 2025-10-14 16:10:09 +08:00

			`@functools.cache`
			`def version_check():`
			`import re`
			`torch_npu_version = torch_npu.version.__version__`
			`date_pattern = r'dev(\d{8})'`

			`match = re.search(date_pattern, torch_npu_version)`
			`if match:`
			`full_date = match.group(1)`
			`if full_date >= "20250919":`
			`return True`
			`return False`