xc-llm-ascend/vllm_ascend/attention/utils.py

from dataclasses import dataclass
from typing import Any, List

import torch
import torch.nn.functional as F
from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                          has_kv_transfer_group,
                                          is_v1_kv_transfer_group)
from vllm.forward_context import ForwardContext, get_forward_context


@dataclass
class AscendCommonAttentionMetadata:
    """
    Attention metadata for one batch, independent of layer and backend.

    AttentionMetadataBuilder instances consume this object when they build
    their own per-layer metadata.

    Many of the tensors are kept twice: a GPU version and a CPU version
    (the latter carries the ``_cpu`` suffix).
    """

    # ---- Required fields ---------------------------------------------------
    # NOTE: the declaration order below is the positional order of the
    # generated ``__init__``; do not reorder.

    # (batch_size + 1,): the start location of each request in the query
    # tensor, plus its CPU counterpart.
    query_start_loc: torch.Tensor
    query_start_loc_cpu: torch.Tensor

    # (batch_size,): the length of each request, counting both the already
    # computed tokens and the newly scheduled tokens.
    seq_lens_cpu: torch.Tensor

    # Same as ``seq_lens_cpu``; kept for compatibility with some newer
    # attention metadata (such as GDN).
    seq_lens: torch.Tensor

    # (batch_size,): the number of computed tokens for each request.
    num_computed_tokens_cpu: torch.Tensor

    # Number of requests.
    num_reqs: int
    # Total number of tokens in the batch.
    num_actual_tokens: int

    # Max token number of a request in the batch.
    max_query_len: int

    # Decode token number per request.
    decode_token_per_req: int

    block_table_tensor: torch.Tensor

    slot_mapping: torch.Tensor

    actual_seq_lengths_q: list[int]

    # ---- Optional fields (all have defaults) -------------------------------

    positions: torch.Tensor = None

    attn_mask: torch.Tensor = None

    spec_attn_mask: torch.Tensor = None

    attn_state: Any = None

    enable_dbo_across_dp: bool = False

    is_only_prefill: bool = False

    graph_pad_size: int = -1

    # Total number of tokens including padding tokens; used to handle some
    # padding operations.
    num_input_tokens: int = 0

    # NOTE: This is a temporary solution for rotary embedding in MLA
    cos: torch.Tensor = None
    sin: torch.Tensor = None


def split_decodes_and_prefills(
    common_attn_metadata: AscendCommonAttentionMetadata,
    decode_threshold: int = 1,
) -> tuple[int, int, int, int]:
    """
    Locate the decode/prefill boundary of a batch.

    The batch is assumed to be reordered already, i.e. every decode request
    comes before the first prefill request. A request is treated as a decode
    when its query length does not exceed ``decode_threshold``.

    Args:
        common_attn_metadata: Batch metadata; ``max_query_len``, ``num_reqs``,
            ``num_actual_tokens`` and ``query_start_loc_cpu`` are read.
        decode_threshold: The maximum query length to be considered a decode.

    Returns:
        A 4-tuple ``(num_decodes, num_prefills, num_decode_tokens,
        num_prefill_tokens)``.
    """
    longest_query = common_attn_metadata.max_query_len
    req_count = common_attn_metadata.num_reqs
    token_count = common_attn_metadata.num_actual_tokens
    start_loc = common_attn_metadata.query_start_loc_cpu

    # Result when every request in the batch is a decode.
    decode_only = (req_count, 0, token_count, 0)

    # Fast path: the longest query already fits the decode threshold, so no
    # per-request inspection is needed.
    if longest_query <= decode_threshold:
        return decode_only

    # Per-request query length = difference of consecutive start offsets.
    per_req_len = start_loc[1:] - start_loc[:-1]
    prefill_mask = per_req_len > decode_threshold
    if not prefill_mask.any():
        return decode_only

    # argmax over the 0/1 mask yields the index of the first prefill request;
    # everything before it is a decode.
    boundary = prefill_mask.int().argmax(dim=-1).item()
    decode_tokens = start_loc[boundary].item()
    return (
        boundary,
        req_count - boundary,
        decode_tokens,
        token_count - decode_tokens,
    )


def wait_for_kv_layer_from_connector(layer_name: str):
    """
    Ask the KV transfer connector to wait for ``layer_name`` to be loaded.

    This is a no-op unless ``has_kv_transfer_group()`` and
    ``is_v1_kv_transfer_group()`` both hold and the current forward context
    carries attention metadata.

    Args:
        layer_name: Name of the layer, forwarded to
            ``connector.wait_for_layer_load``.
    """
    if not (has_kv_transfer_group() and is_v1_kv_transfer_group()):
        return

    connector = get_kv_transfer_group()

    # The metadata itself is not forwarded here; it only gates the call.
    if get_forward_context().attn_metadata is None:
        return

    # TODO: assert ascendMetadata
    connector.wait_for_layer_load(layer_name)


def maybe_save_kv_layer_to_connector(
    layer_name: str,
    kv_cache_layer: List[torch.Tensor],
):
    """
    Hand one layer's KV cache to the KV transfer connector for saving.

    This is a no-op unless ``has_kv_transfer_group()`` and
    ``is_v1_kv_transfer_group()`` both hold and the current forward context
    carries attention metadata.

    Args:
        layer_name: Name of the layer, forwarded to the connector.
        kv_cache_layer: KV cache tensors of that layer, forwarded unchanged.
    """
    if not (has_kv_transfer_group() and is_v1_kv_transfer_group()):
        return

    connector = get_kv_transfer_group()

    metadata = get_forward_context().attn_metadata
    if metadata is not None:
        # TODO: assert ascendMetadata
        connector.save_kv_layer(layer_name, kv_cache_layer, metadata)


def round_up(val: int, align: int) -> int:
    """
    Round ``val`` up to a multiple of ``align``.

    Args:
        val: Value to round.
        align: Alignment step. ``0`` is accepted and yields ``0``.

    Returns:
        ``ceil(val / align) * align``, or ``0`` when ``align`` is ``0``.
    """
    if align == 0:
        return 0
    quotient, remainder = divmod(val, align)
    # A non-zero remainder means ``val`` sits between two multiples; step up
    # to the next one.
    if remainder:
        quotient += 1
    return quotient * align


def trans_rope_weight(weight, rope_dim):
    """
    De-interleave the trailing ``rope_dim`` rows of ``weight`` along dim -2.

    The rows before the trailing ``rope_dim`` ones keep their order. Within
    the trailing ``rope_dim`` rows, the even-indexed rows are placed first,
    followed by the odd-indexed rows. The input tensor is not modified.

    Args:
        weight: Tensor with at least two dimensions.
        rope_dim: Number of trailing rows (dim -2) to reorder; ``0`` means
            nothing is reordered.

    Returns:
        A contiguous tensor with the same shape as ``weight``.
    """
    # ``weight[..., :-0, :]`` would be empty, so the zero case is handled
    # separately.
    if rope_dim == 0:
        return weight.contiguous()

    head = weight[..., :-rope_dim, :]
    tail = weight[..., -rope_dim:, :]
    even_rows = tail[..., 0::2, :]
    odd_rows = tail[..., 1::2, :]
    return torch.cat((head, even_rows, odd_rows), dim=-2).contiguous()


def transdata(nd_mat, block_size: tuple = (16, 16)):
    """
    Zero-pad a matrix to block multiples and regroup it by block-columns.

    The row count is padded up to a multiple of ``block_size[0]`` and the
    column count up to a multiple of ``block_size[1]`` (padding is appended
    at the bottom/right). The padded matrix is then rearranged so that each
    slice along the first output axis holds one block-column.

    Args:
        nd_mat: Matrix whose first two dimensions are (rows, cols).
        block_size: ``(row_block, col_block)`` alignment. Both entries must
            be non-zero.

    Returns:
        Tensor of shape ``(padded_cols // col_block, padded_rows, col_block)``
        with ``out[j, i, k] == padded[i, j * col_block + k]``.

    Raises:
        ZeroDivisionError: If an entry of ``block_size`` is ``0``.
    """
    rows, cols = nd_mat.shape[0], nd_mat.shape[1]
    row_block, col_block = block_size[0], block_size[1]

    # Distance to the next multiple of the block size (0 if already aligned).
    r_pad = -rows % row_block
    c_pad = -cols % col_block
    r = rows + r_pad
    c = cols + c_pad

    # F.pad consumes pad widths starting from the LAST dimension, so the
    # column padding must be listed before the row padding.
    nd_mat = F.pad(nd_mat, (0, c_pad, 0, r_pad))

    # Split both axes into (block index, offset inside block) ...
    blocked = torch.reshape(
        nd_mat,
        (r // row_block, row_block, c // col_block, col_block),
    )
    # ... move the column-block index to the front ...
    nz_mat = torch.permute(blocked, [2, 0, 1, 3])
    # ... and merge the two row axes back into a single row axis.
    nz_mat = torch.reshape(
        nz_mat,
        (nz_mat.shape[0], nz_mat.shape[1] * nz_mat.shape[2], nz_mat.shape[3]))
    return nz_mat
Fix some ci issue and refactor modelrunner (#2445) ### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4d9c61993ac4209c97b3afef237b2387f2cd9b97 --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com> 2025-08-20 09:01:04 +08:00			`from dataclasses import dataclass`
[Feature]cpu offload connector (#1659) This PR implements cpu offload connector to enable NPU kv cache offload to host DRAM. - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/5aeb9254521023f97aca292b3478aa7ff485ffb2 Signed-off-by: lidenghui <lidenghui1110@gmail.com> Signed-off-by: AlvisGong <gwly0401@163.com> Signed-off-by: CalvinXKY <kyxiezju@163.com> Co-authored-by: AlvisGong <gwly0401@163.com> 2025-09-23 14:25:05 +08:00			`from typing import Any, List`
Fix some ci issue and refactor modelrunner (#2445) ### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4d9c61993ac4209c97b3afef237b2387f2cd9b97 --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com> 2025-08-20 09:01:04 +08:00
			`import torch`
adapt the mla_v1 with the `mla_preprocess` kernel (#3397) ### What this PR does / why we need it? This pull request integrates a new `mla_preprocess` kernel to create an optimized path for MLA (Multi-Layer Attention) decode operations on Ascend hardware, controlled by an environment flag. The changes include new utility functions for weight transformation, a method to prepare weights for the fused kernel, and logic to route decode-only batches to this new path. My review identified a critical bug in the `transdata` utility function where padding dimensions are swapped, which will lead to incorrect tensor shapes and kernel failures. Additionally, I've pointed out a high-severity maintainability issue in the trans_rope_weight function, which modifies its input in-place, and I have provided a pure-function alternative. ### Does this PR introduce _any_ user-facing change? No user-facing changes by default. User can enable the `mla_preprocess` kernel in model by enable the env-var `VLLM_ASCEND_ENABLE_MLAPO`. ### How was this patch tested? Dedicated Ascend kernels are not covered by our CI yet, so no extra automated tests were added. Future MLA-focused regression runs will cover this path. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: Chen Chen <0109chenchen@gmail.com> 2025-10-15 10:34:25 +08:00			`import torch.nn.functional as F`
[Feature]cpu offload connector (#1659) This PR implements cpu offload connector to enable NPU kv cache offload to host DRAM. - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/5aeb9254521023f97aca292b3478aa7ff485ffb2 Signed-off-by: lidenghui <lidenghui1110@gmail.com> Signed-off-by: AlvisGong <gwly0401@163.com> Signed-off-by: CalvinXKY <kyxiezju@163.com> Co-authored-by: AlvisGong <gwly0401@163.com> 2025-09-23 14:25:05 +08:00			`from vllm.distributed.kv_transfer import (get_kv_transfer_group,`
			`has_kv_transfer_group,`
			`is_v1_kv_transfer_group)`
			`from vllm.forward_context import ForwardContext, get_forward_context`
Fix some ci issue and refactor modelrunner (#2445) ### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4d9c61993ac4209c97b3afef237b2387f2cd9b97 --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com> 2025-08-20 09:01:04 +08:00

			`@dataclass`
			`class AscendCommonAttentionMetadata:`
			`"""`
			`Per-batch attention metadata, shared across layers and backends.`
			`AttentionMetadataBuilder instances use it to construct per-layer metadata.`

			`For many of the tensors we keep both GPU and CPU versions.`
			`"""`

			`query_start_loc: torch.Tensor`
			`query_start_loc_cpu: torch.Tensor`
			`"""(batch_size + 1,), the start location of each request in query Tensor"""`

			`seq_lens_cpu: torch.Tensor`
			`"""(batch_size,), the length of each request including both computed tokens`
			`and newly scheduled tokens"""`

[New model] Qwen3-next support (#2917) ### What this PR does / why we need it? Add Qwen3-next support. ### Does this PR introduce _any_ user-facing change? Yes, users can use Qwen3 next. Related doc: https://github.com/vllm-project/vllm-ascend/pull/2916 the tutorial will be ready in [here](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_npu_qwen3_next.html) ### How was this patch tested? Doc CI passed Related: https://github.com/vllm-project/vllm-ascend/issues/2884 Co-Authored-By: Angazenn <supperccell@163.com> Co-Authored-By: zzzzwwjj <1183291235@qq.com> Co-Authored-By: MengqingCao <cmq0113@163.com> Co-Authored-By: linfeng-yuan <1102311262@qq.com> Co-Authored-By: hust17yixuan <303660421@qq.com> Co-Authored-By: SunnyLee219 <3294305115@qq.com> Co-Authored-By: maoxx241 <maoxx241@umn.edu> - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/b834b4cbf1d5094affdf231df2be86920610d83e --------- Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: Angazenn <supperccell@163.com> Signed-off-by: Your Name <you@example.com> Signed-off-by: zzzzwwjj <1183291235@qq.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: hust17yixuan <303660421@qq.com> Co-authored-by: MengqingCao <cmq0113@163.com> Co-authored-by: Angazenn <supperccell@163.com> Co-authored-by: Your Name <you@example.com> Co-authored-by: zzzzwwjj <1183291235@qq.com> Co-authored-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: hust17yixuan <303660421@qq.com> 2025-09-16 01:17:42 +08:00			`seq_lens: torch.Tensor`
			`"""same to seq_lens_cpu, for compatibility with some new attn metadata`
			`(such as GDN)."""`

			`num_computed_tokens_cpu: torch.Tensor`
			`"""(batch_size,), the number of computed tokens for each request"""`

Fix some ci issue and refactor modelrunner (#2445) ### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4d9c61993ac4209c97b3afef237b2387f2cd9b97 --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com> 2025-08-20 09:01:04 +08:00			`num_reqs: int`
			`"""Number of requests"""`
			`num_actual_tokens: int`
			`"""Total number of tokens in batch"""`

			`max_query_len: int`
			`"""Max token number of request in batch"""`

			`decode_token_per_req: int`
			`"""decode token number per request"""`

			`block_table_tensor: torch.Tensor`

[Feat][Graph] Support `FULL_DECODE_ONLY` mode for GQA/MHA models (#2128) Note: This depends on [vLLM #25161](https://github.com/vllm-project/vllm/pull/25161) and the torch\_npu release from September 30. ### What this PR does / why we need it? This pull request adds `FULL_DECODE_ONLY` mode for GQA/MHA models (MLA models like DeepSeek V3/R1 are not included). Key improvements include: * Reduced dispatch latency: By replaying the entire model execution graph at once, we cut overhead compared with multiple smaller replays. * Stabilized multi-device performance: Captureing the whole model as one static graph also mitigates the dispatch fluctuations across devices. * Stream/resource savings: Consolidating graph captures frees up streams, allowing more graphs to be captured. Known issues: 1. `_npu_paged_attention` currently manages its own workspace in `torch_npu`, which can deadlock when synchronizing during graph replay — we’re working on a fix. There may be other corner cases. This PR is the first in a planned series; we’ll continue to iterate and address remaining issues in follow-ups. This is essentially a port of #1503 and #1677, but includes two major changes: 1. Let `graph_dispatcher` decide the graph mode instead of hard-coding it in the backend, which decouples Full Graph and Piecewise Graph and could make it possible to remove dynamo. 2. Adapt to the new `attn_group` logic, but leave a small hack in `update_graph_params`; multi-attention models may or may not be fully supported yet. ### Does this PR introduce _any_ user-facing change? ```python compilation_config={ "cudagraph_mode": "FULL_DECODE_ONLY", }, ``` ### How was this patch tested? Tests included. - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/9607d5eb449711b349d4c2bee0a9c94afcc7ed14 --------- Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com> 2025-09-22 17:14:28 +08:00			`slot_mapping: torch.Tensor`
Fix some ci issue and refactor modelrunner (#2445) ### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4d9c61993ac4209c97b3afef237b2387f2cd9b97 --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com> 2025-08-20 09:01:04 +08:00
			`actual_seq_lengths_q: list[int]`

			`positions: torch.Tensor = None`

			`attn_mask: torch.Tensor = None`

			`spec_attn_mask: torch.Tensor = None`

			`attn_state: Any = None`

			`enable_dbo_across_dp: bool = False`

			`is_only_prefill: bool = False`

			`graph_pad_size: int = -1`

[Core]Append padding logic for Attention (#3256) ### What this PR does / why we need it? This PR aims to add padding logic to seq_lens、block_tables when running in full decode scenario. Before this PR, the number of input tokens with padding might exceeds corresponding seq_lens. For example, when running in full decode scenario: ``` input_ids : [1, 3, 0, 0] seq_lens: [2, 1] query_start_loc: [0, 1, 2] ``` Here, `input_ids` is padded by 2 tokens while `seq_lens`/`query_start_loc` are not. The mismatch between `input_ids` and `seq_lens`/`query_start_loc` might cause some potential bugs. This PR would change it into : ``` input_ids : [1, 3, 0, 0] seq_lens: [2, 1, 1, 1] query_start_loc: [0, 1, 2, 3, 4] ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: Angazenn <supperccell@163.com> 2025-10-17 21:56:01 +08:00			`# num_input_tokens refers to total number of tokens including`
			`# padding tokens. It is used to handle some padding operations.`
			`num_input_tokens: int = 0`

[Feat][Graph]Support FULL_DECODE_ONLY mode for MLA models (#3125) ### What this PR does / why we need it? Adds support for capturing the Multi-head Latent Attention (MLA) decode operation into an ACL graph. This improves performance by compiling the attention kernel for single-token decoding. Key changes include: - Implementing the graph capture logic for the MLA kernel, including workspace management and parameter updates. - Modifying the rotary embedding (RoPE) handling to use pre-allocated tensors, which is a requirement for graph capture. - Adding a `build_for_graph_capture` method to the MLA metadata builder to create dummy metadata during the graph compilation phase. Known issues: - Currently, MTP is not supported in FULL_DECODE_ONLY mode -- we're working on a fix - We are preparing to remove update_mla_attn_params with auto_dispatch_capture ### Does this PR introduce _any_ user-facing change? compilation_config={ "cudagraph_mode": "FULL_DECODE_ONLY", }, ### How was this patch tested? - vLLM version: v0.11.0 --------- Signed-off-by: panchao-hub <315134829@qq.com> Signed-off-by: p00465316 <panchao13@huawei.com> Co-authored-by: p00465316 <panchao13@huawei.com> Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com> 2025-10-10 16:31:20 +08:00			`# NOTE: This is a temporary solution for rotary embedding in MLA`
			`cos: torch.Tensor = None`
			`sin: torch.Tensor = None`

Fix some ci issue and refactor modelrunner (#2445) ### What this PR does / why we need it? Fix some ci issue and refactor modelrunner ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? CI passed with existing test. - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/4d9c61993ac4209c97b3afef237b2387f2cd9b97 --------- Signed-off-by: wangli <wangli858794774@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: weiguihua2 <weiguihua2@huawei.com> Co-authored-by: wangli <wangli858794774@gmail.com> Co-authored-by: weiguihua2 <weiguihua2@huawei.com> 2025-08-20 09:01:04 +08:00
			`def split_decodes_and_prefills(`
			`common_attn_metadata: AscendCommonAttentionMetadata,`
			`decode_threshold: int = 1,`
			`) -> tuple[int, int, int, int]:`
			`"""`
			`Assuming a reordered batch, finds the boundary between prefill and decode`
			`requests.`

			`Args:`
			`common_attn_metadata: AscendCommonAttentionMetadata object containing the`
			`batch metadata.`
			`decode_threshold: The maximum query length to be considered a decode.`

			`Returns:`
			`num_decodes: The number of decode requests.`
			`num_prefills: The number of prefill requests.`
			`num_decode_tokens: The number of tokens in the decode requests.`
			`num_prefill_tokens: The number of tokens in the prefill requests.`
			`"""`
			`max_query_len = common_attn_metadata.max_query_len`
			`num_reqs = common_attn_metadata.num_reqs`
			`num_tokens = common_attn_metadata.num_actual_tokens`
			`query_start_loc = common_attn_metadata.query_start_loc_cpu`

			`if max_query_len <= decode_threshold:`
			`return num_reqs, 0, num_tokens, 0`

			`query_lens = query_start_loc[1:] - query_start_loc[:-1]`
			`is_prefill = query_lens > decode_threshold`
			`if not torch.any(is_prefill):`
			`return num_reqs, 0, num_tokens, 0`

			`first_prefill = is_prefill.int().argmax(dim=-1).item()`
			`num_decodes = first_prefill`
			`num_prefills = num_reqs - num_decodes`
			`num_decode_tokens = query_start_loc[first_prefill].item()`
			`num_prefill_tokens = num_tokens - num_decode_tokens`
			`return (num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens)`
[Feature]cpu offload connector (#1659) This PR implements cpu offload connector to enable NPU kv cache offload to host DRAM. - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/5aeb9254521023f97aca292b3478aa7ff485ffb2 Signed-off-by: lidenghui <lidenghui1110@gmail.com> Signed-off-by: AlvisGong <gwly0401@163.com> Signed-off-by: CalvinXKY <kyxiezju@163.com> Co-authored-by: AlvisGong <gwly0401@163.com> 2025-09-23 14:25:05 +08:00

			`def wait_for_kv_layer_from_connector(layer_name: str):`
			`if not has_kv_transfer_group() or not is_v1_kv_transfer_group():`
			`return`

			`connector = get_kv_transfer_group()`

			`forward_context: ForwardContext = get_forward_context()`
			`attn_metadata = forward_context.attn_metadata`
			`if attn_metadata is None:`
			`return`
			`# TODO: assert ascendMetadata`
			`connector.wait_for_layer_load(layer_name)`


			`def maybe_save_kv_layer_to_connector(`
			`layer_name: str,`
			`kv_cache_layer: List[torch.Tensor],`
			`):`
			`if not has_kv_transfer_group() or not is_v1_kv_transfer_group():`
			`return`

			`connector = get_kv_transfer_group()`

			`forward_context: ForwardContext = get_forward_context()`
			`attn_metadata = forward_context.attn_metadata`
			`if attn_metadata is None:`
			`return`
			`# TODO: assert ascendMetadata`
			`connector.save_kv_layer(layer_name, kv_cache_layer, attn_metadata)`
fix pagedattention to support fullgraph. (#3436) ### What this PR does / why we need it? Calculate in advance the workspace memory size needed for the PagedAttention operator to avoid deadlocks during resource cleanup. This PR requires torch_npu version 0920 or newer. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> 2025-10-14 16:10:09 +08:00

adapt the mla_v1 with the `mla_preprocess` kernel (#3397) ### What this PR does / why we need it? This pull request integrates a new `mla_preprocess` kernel to create an optimized path for MLA (Multi-Layer Attention) decode operations on Ascend hardware, controlled by an environment flag. The changes include new utility functions for weight transformation, a method to prepare weights for the fused kernel, and logic to route decode-only batches to this new path. My review identified a critical bug in the `transdata` utility function where padding dimensions are swapped, which will lead to incorrect tensor shapes and kernel failures. Additionally, I've pointed out a high-severity maintainability issue in the trans_rope_weight function, which modifies its input in-place, and I have provided a pure-function alternative. ### Does this PR introduce _any_ user-facing change? No user-facing changes by default. User can enable the `mla_preprocess` kernel in model by enable the env-var `VLLM_ASCEND_ENABLE_MLAPO`. ### How was this patch tested? Dedicated Ascend kernels are not covered by our CI yet, so no extra automated tests were added. Future MLA-focused regression runs will cover this path. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: Chen Chen <0109chenchen@gmail.com> 2025-10-15 10:34:25 +08:00			`def round_up(val: int, align: int) -> int:`
			`if align == 0:`
			`return 0`
			`return -(val // -align) * align`


			`def trans_rope_weight(weight, rope_dim):`
			`if rope_dim == 0:`
			`return weight.contiguous()`
			`nope_part = weight[..., :-rope_dim, :]`
			`rope_part = weight[..., -rope_dim:, :]`
			`reordered_rope_part = torch.cat(`
			`(rope_part[..., ::2, :], rope_part[..., 1::2, :]), dim=-2)`
			`return torch.cat((nope_part, reordered_rope_part), dim=-2).contiguous()`


			`def transdata(nd_mat, block_size: tuple = (16, 16)):`
			`r = round_up(nd_mat.shape[0], block_size[0])`
			`c = round_up(nd_mat.shape[1], block_size[1])`
			`r_pad = r - nd_mat.shape[0]`
			`c_pad = c - nd_mat.shape[1]`
			`nd_mat = F.pad(nd_mat, (0, r_pad, 0, c_pad))`
			`nz_mat = torch.permute(`
			`torch.reshape(`
			`nd_mat,`
			`(r // block_size[0], block_size[0], c // block_size[1],`
			`block_size[1]),`
			`),`
			`[2, 0, 1, 3],`
			`)`
			`nz_mat = torch.reshape(`
			`nz_mat,`
			`(nz_mat.shape[0], nz_mat.shape[1] * nz_mat.shape[2], nz_mat.shape[3]))`
			`return nz_mat`