xc-llm-ascend/vllm_ascend/patch/platform/patch_mamba_config.py

# mypy: ignore-errors
import vllm.model_executor.models.config
from vllm.logger import init_logger
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.models.config import MambaModelConfig
from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec


@classmethod
def verify_and_update_config(cls, vllm_config) -> None:
    """
    Reconcile the attention and mamba page sizes of a hybrid model.

    The attention block size is raised, when it is unset or too small, so
    that an attention page is at least as large as a mamba page. If the
    attention page then ends up strictly larger, the mamba page size is
    padded so that both page sizes are exactly equal.

    Both ``cache_config.block_size`` and
    ``cache_config.mamba_page_size_padded`` may be modified in place.

    Args:
        vllm_config: vLLM Config
    """
    logger = init_logger(__name__)
    # Run the base mamba checks first (noted upstream as enabling
    # FULL_AND_PIECEWISE by default).
    MambaModelConfig.verify_and_update_config(vllm_config)

    cache_config = vllm_config.cache_config
    model_config = vllm_config.model_config
    parallel_config = vllm_config.parallel_config

    # "auto" means the KV cache reuses the model dtype; any other value is
    # looked up in the string -> torch dtype table.
    kv_cache_dtype = (model_config.dtype
                      if cache_config.cache_dtype == "auto" else
                      STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype])

    # Attention page size in bytes for a block holding a single token.
    one_token_spec = FullAttentionSpec(
        block_size=1,
        num_kv_heads=model_config.get_num_kv_heads(parallel_config),
        head_size=model_config.get_head_size(),
        dtype=kv_cache_dtype,
    )
    bytes_per_token = one_token_spec.page_size_bytes

    model_cls, _ = ModelRegistry.resolve_model_cls(
        model_config.architecture,
        model_config=model_config,
    )

    # Mamba page size in bytes, using the state shapes and dtypes reported
    # by the resolved model class.
    mamba_spec = MambaSpec(
        shapes=model_cls.get_mamba_state_shape_from_config(vllm_config),
        dtypes=model_cls.get_mamba_state_dtype_from_config(vllm_config),
        block_size=model_config.max_model_len,
    )
    mamba_bytes = mamba_spec.page_size_bytes

    # The suggested block size is always a multiple of this many tokens.
    # NOTE(review): the reason for 128 is not visible in this file --
    # presumably an alignment requirement of the attention kernel; confirm.
    block_multiple = 128

    # Smallest multiple of `block_multiple` tokens whose attention page is
    # at least as large as the mamba page (cdiv is assumed to be ceiling
    # division).
    num_multiples = cdiv(mamba_bytes, block_multiple * bytes_per_token)
    min_block_size = block_multiple * num_multiples

    # Override the block size when the user left it unset or chose a value
    # that is too small; a larger user-provided value is kept as-is.
    current_block_size = cache_config.block_size
    if current_block_size is None or current_block_size < min_block_size:
        cache_config.block_size = min_block_size
        logger.info(
            "Setting attention block size to %d tokens "
            "to ensure that attention page size is >= mamba page size.",
            min_block_size)

    # Attention page size under the (possibly updated) block size.
    attn_bytes = cache_config.block_size * bytes_per_token

    assert attn_bytes >= mamba_bytes

    if attn_bytes == mamba_bytes:
        # Page sizes already match, so no mamba padding is needed.
        return

    padded = cache_config.mamba_page_size_padded
    if padded is not None and padded == attn_bytes:
        # Padding is already set to the attention page size.
        return

    # Pad the mamba page so that it exactly matches the attention page.
    cache_config.mamba_page_size_padded = attn_bytes
    extra_bytes = attn_bytes - mamba_bytes
    padding_pct = 100 * extra_bytes / mamba_bytes
    logger.info(
        "Padding mamba page size by %.2f%% to ensure "
        "that mamba page size and attention page size are "
        "exactly equal.", padding_pct)


# Monkey-patch applied as an import-time side effect: replace
# HybridAttentionMambaModelConfig.verify_and_update_config in vLLM with the
# classmethod-wrapped implementation defined in this module.
vllm.model_executor.models.config.HybridAttentionMambaModelConfig.verify_and_update_config = verify_and_update_config
[New model] Qwen3-next support (#2917) ### What this PR does / why we need it? Add Qwen3-next support. ### Does this PR introduce _any_ user-facing change? Yes, users can use Qwen3 next. Related doc: https://github.com/vllm-project/vllm-ascend/pull/2916 the tutorial will be ready in [here](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_npu_qwen3_next.html) ### How was this patch tested? Doc CI passed Related: https://github.com/vllm-project/vllm-ascend/issues/2884 Co-Authored-By: Angazenn <supperccell@163.com> Co-Authored-By: zzzzwwjj <1183291235@qq.com> Co-Authored-By: MengqingCao <cmq0113@163.com> Co-Authored-By: linfeng-yuan <1102311262@qq.com> Co-Authored-By: hust17yixuan <303660421@qq.com> Co-Authored-By: SunnyLee219 <3294305115@qq.com> Co-Authored-By: maoxx241 <maoxx241@umn.edu> - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/b834b4cbf1d5094affdf231df2be86920610d83e --------- Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: Angazenn <supperccell@163.com> Signed-off-by: Your Name <you@example.com> Signed-off-by: zzzzwwjj <1183291235@qq.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: hust17yixuan <303660421@qq.com> Co-authored-by: MengqingCao <cmq0113@163.com> Co-authored-by: Angazenn <supperccell@163.com> Co-authored-by: Your Name <you@example.com> Co-authored-by: zzzzwwjj <1183291235@qq.com> Co-authored-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: hust17yixuan <303660421@qq.com> 2025-09-16 01:17:42 +08:00			`# mypy: ignore-errors`
			`import vllm.model_executor.models.config`
			`from vllm.logger import init_logger`
			`from vllm.model_executor.models import ModelRegistry`
			`from vllm.model_executor.models.config import MambaModelConfig`
Drop 0.11.0 support (#4377) There is a lot hack code for v0.11.0, which makes the code hard to upgrade to newer vLLM version. Since v0.11.0 will release soon. Let's drop v0.11.0 support first. Then we'll upgrade to v0.11.2 soon. - vLLM version: v0.11.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379 Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> 2025-11-24 17:08:20 +08:00			`from vllm.utils.math_utils import cdiv`
			`from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE`
[New model] Qwen3-next support (#2917) ### What this PR does / why we need it? Add Qwen3-next support. ### Does this PR introduce _any_ user-facing change? Yes, users can use Qwen3 next. Related doc: https://github.com/vllm-project/vllm-ascend/pull/2916 the tutorial will be ready in [here](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_npu_qwen3_next.html) ### How was this patch tested? Doc CI passed Related: https://github.com/vllm-project/vllm-ascend/issues/2884 Co-Authored-By: Angazenn <supperccell@163.com> Co-Authored-By: zzzzwwjj <1183291235@qq.com> Co-Authored-By: MengqingCao <cmq0113@163.com> Co-Authored-By: linfeng-yuan <1102311262@qq.com> Co-Authored-By: hust17yixuan <303660421@qq.com> Co-Authored-By: SunnyLee219 <3294305115@qq.com> Co-Authored-By: maoxx241 <maoxx241@umn.edu> - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/b834b4cbf1d5094affdf231df2be86920610d83e --------- Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: Angazenn <supperccell@163.com> Signed-off-by: Your Name <you@example.com> Signed-off-by: zzzzwwjj <1183291235@qq.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: hust17yixuan <303660421@qq.com> Co-authored-by: MengqingCao <cmq0113@163.com> Co-authored-by: Angazenn <supperccell@163.com> Co-authored-by: Your Name <you@example.com> Co-authored-by: zzzzwwjj <1183291235@qq.com> Co-authored-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: hust17yixuan <303660421@qq.com> 2025-09-16 01:17:42 +08:00			`from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec`


			`@classmethod`
			`def verify_and_update_config(cls, vllm_config) -> None:`
			`"""`
			`Ensure that page size of attention layers is greater than or`
			`equal to the mamba layers. If not, automatically set the attention`
			`block size to ensure that it is. If the attention page size is`
			`strictly greater than the mamba page size, we pad the mamba page size`
			`to make them equal.`

			`Args:`
			`vllm_config: vLLM Config`
			`"""`
			`logger = init_logger(__name__)`
			`# Enable FULL_AND_PIECEWISE by default`
			`MambaModelConfig.verify_and_update_config(vllm_config)`

			`cache_config = vllm_config.cache_config`
			`model_config = vllm_config.model_config`
			`parallel_config = vllm_config.parallel_config`

			`if cache_config.cache_dtype == "auto":`
			`kv_cache_dtype = model_config.dtype`
			`else:`
			`kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]`

			`# get attention page size (for 1 token)`
			`attn_page_size_1_token = FullAttentionSpec(`
			`block_size=1,`
			`num_kv_heads=model_config.get_num_kv_heads(parallel_config),`
			`head_size=model_config.get_head_size(),`
[KVCache] Refactor KVCache as page_size_bytes is ineffective (#3438) ### What this PR does / why we need it? Refactor KVCache as page_size_bytes is ineffective. 1. Currently the `AttentionSpec` is patched, but the `page_size_bytes` is still using that in vLLM in runtime, thus the patch is not working actually. Thus this pr removes the patch on `AttentionSpec`, and will do the final fix in vLLM. 2. Use `MLAAttentionSpec` instead of `FullAttentionSpec` to reduce `page_size_bytes` of spec, so that num_blocks in spec could double ### How was this patch tested? Test pass with Qwen3-Next and DeepSeek-V3.2-Exp - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: MengqingCao <cmq0113@163.com> 2025-10-14 21:28:41 +08:00			`dtype=kv_cache_dtype).page_size_bytes`
[New model] Qwen3-next support (#2917) ### What this PR does / why we need it? Add Qwen3-next support. ### Does this PR introduce _any_ user-facing change? Yes, users can use Qwen3 next. Related doc: https://github.com/vllm-project/vllm-ascend/pull/2916 the tutorial will be ready in [here](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_npu_qwen3_next.html) ### How was this patch tested? Doc CI passed Related: https://github.com/vllm-project/vllm-ascend/issues/2884 Co-Authored-By: Angazenn <supperccell@163.com> Co-Authored-By: zzzzwwjj <1183291235@qq.com> Co-Authored-By: MengqingCao <cmq0113@163.com> Co-Authored-By: linfeng-yuan <1102311262@qq.com> Co-Authored-By: hust17yixuan <303660421@qq.com> Co-Authored-By: SunnyLee219 <3294305115@qq.com> Co-Authored-By: maoxx241 <maoxx241@umn.edu> - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/b834b4cbf1d5094affdf231df2be86920610d83e --------- Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: Angazenn <supperccell@163.com> Signed-off-by: Your Name <you@example.com> Signed-off-by: zzzzwwjj <1183291235@qq.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: hust17yixuan <303660421@qq.com> Co-authored-by: MengqingCao <cmq0113@163.com> Co-authored-by: Angazenn <supperccell@163.com> Co-authored-by: Your Name <you@example.com> Co-authored-by: zzzzwwjj <1183291235@qq.com> Co-authored-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: hust17yixuan <303660421@qq.com> 2025-09-16 01:17:42 +08:00
			`model_cls, _ = ModelRegistry.resolve_model_cls(`
			`model_config.architecture,`
			`model_config=model_config,`
			`)`

			`# get mamba page size`
			`mamba_page_size = MambaSpec(`
			`shapes=model_cls.get_mamba_state_shape_from_config(vllm_config),`
			`dtypes=model_cls.get_mamba_state_dtype_from_config(vllm_config),`
			`block_size=model_config.max_model_len,`
			`).page_size_bytes`

[BugFix][main] Adapted to torch_npu.npu_fused_infer_attention_score (#4025) ### What this PR does / why we need it? Fixes a compatible bug with `torch_npu.npu_fused_infer_attention_score` which is discribed in https://github.com/vllm-project/vllm-ascend/issues/4020. @momo609 tells us this solution. ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? The environment is same with this issue, https://github.com/vllm-project/vllm-ascend/issues/4020. We modify the code according to https://github.com/vllm-project/vllm-ascend/pull/3918. And run below codes: ```python # run with Qwen3-next-mtp prompts = [ "Who are you?", ] sampling_params = SamplingParams(temperature=0.0, top_p=0.95, top_k=40, max_tokens=128) llm = LLM(model="/home/model/Qwen3-Next-80B-A3B-Instruct", tensor_parallel_size=4, enforce_eager=True, distributed_executor_backend="mp", gpu_memory_utilization=0.7, speculative_config={ "method": "qwen3_next_mtp", "num_speculative_tokens": 1, }, max_model_len=4096) outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` Outputs: ```text Prompt: 'Who are you?', Generated text: ' I am Qwen, a large-scale language model independently developed by the Tongyi Lab under Alibaba Group. I am designed to answer questions, create text such as stories, official documents, emails, scripts, and more, as well as perform logical reasoning, programming, and other tasks. If you have any questions or need assistance, feel free to let me know anytime!' ``` Now, `torch_npu.npu_fused_infer_attention_score` is compatible with Qwen3-Next. - vLLM version: v0.11.0 - vLLM main: https://github.com/vllm-project/vllm/commit/83f478bb19489b41e9d208b47b4bb5a95ac171ac Signed-off-by: drslark <slarksblood@qq.com> 2025-11-06 22:00:24 +08:00			`block_alignment_bytes = 128`
[New model] Qwen3-next support (#2917) ### What this PR does / why we need it? Add Qwen3-next support. ### Does this PR introduce _any_ user-facing change? Yes, users can use Qwen3 next. Related doc: https://github.com/vllm-project/vllm-ascend/pull/2916 the tutorial will be ready in [here](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_npu_qwen3_next.html) ### How was this patch tested? Doc CI passed Related: https://github.com/vllm-project/vllm-ascend/issues/2884 Co-Authored-By: Angazenn <supperccell@163.com> Co-Authored-By: zzzzwwjj <1183291235@qq.com> Co-Authored-By: MengqingCao <cmq0113@163.com> Co-Authored-By: linfeng-yuan <1102311262@qq.com> Co-Authored-By: hust17yixuan <303660421@qq.com> Co-Authored-By: SunnyLee219 <3294305115@qq.com> Co-Authored-By: maoxx241 <maoxx241@umn.edu> - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/b834b4cbf1d5094affdf231df2be86920610d83e --------- Signed-off-by: MengqingCao <cmq0113@163.com> Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: Angazenn <supperccell@163.com> Signed-off-by: Your Name <you@example.com> Signed-off-by: zzzzwwjj <1183291235@qq.com> Signed-off-by: linfeng-yuan <1102311262@qq.com> Signed-off-by: hust17yixuan <303660421@qq.com> Co-authored-by: MengqingCao <cmq0113@163.com> Co-authored-by: Angazenn <supperccell@163.com> Co-authored-by: Your Name <you@example.com> Co-authored-by: zzzzwwjj <1183291235@qq.com> Co-authored-by: linfeng-yuan <1102311262@qq.com> Co-authored-by: hust17yixuan <303660421@qq.com> 2025-09-16 01:17:42 +08:00
			`# some attention backends (e.g. FA) only support setting`
			`# block size to multiple of 16, so let's suggest a value`
			`# that would work (note: FA is currently not compatible`
			`# with mamba layers, use FlashInfer instead).`
			`attn_block_size = block_alignment_bytes * cdiv(`
			`mamba_page_size, block_alignment_bytes * attn_page_size_1_token)`

			`# override attention block size if either (a) the`
			`# user has not set it or (b) the user has set it`
			`# too small.`
			`if (cache_config.block_size is None`
			`or cache_config.block_size < attn_block_size):`
			`cache_config.block_size = attn_block_size`
			`logger.info(`
			`"Setting attention block size to %d tokens "`
			`"to ensure that attention page size is >= mamba page size.",`
			`attn_block_size)`

			`# compute new attention page size`
			`attn_page_size = \`
			`cache_config.block_size * attn_page_size_1_token`

			`assert attn_page_size >= mamba_page_size`

			`if attn_page_size == mamba_page_size:`
			`# don't need to pad mamba page size`
			`return`

			`# pad mamba page size to exactly match attention`
			`if (cache_config.mamba_page_size_padded is None`
			`or cache_config.mamba_page_size_padded != attn_page_size):`
			`cache_config.mamba_page_size_padded = (attn_page_size)`
			`mamba_padding_pct = 100 * (attn_page_size -`
			`mamba_page_size) / mamba_page_size`
			`logger.info(`
			`"Padding mamba page size by %.2f%% to ensure "`
			`"that mamba page size and attention page size are "`
			`"exactly equal.", mamba_padding_pct)`


			`vllm.model_executor.models.config.HybridAttentionMambaModelConfig.verify_and_update_config = verify_and_update_config`