[Bugfix] Support Kimi-K2.5 models (#6755)

### What this PR does / why we need it? This PR adds support for running the Kimi-K2.5 models on NPUs with bf16 and w4a8 weights. The corresponding PR in the vLLM community has already been merged: https://github.com/vllm-project/vllm/pull/34501 ### Does this PR introduce _any_ user-facing change? - No. ### How was this patch tested? We tested with the Kimi-K2.5 weights. The weights path: https://modelscope.cn/models/Eco-Tech/Kimi-K2.5-W4A8 The w4a8 weights ran successfully on a 910B NPU using vllm-ascend. - vLLM version: v0.15.0 - vLLM main: 9562912cea --------- Signed-off-by: LoganJane <LoganJane73@hotmail.com>
2026-02-25 14:51:46 +08:00
parent 4efd362bac
commit ed051737e9
3 changed files with 79 additions and 2 deletions
--- a/vllm_ascend/patch/worker/init.py
+++ b/vllm_ascend/patch/worker/init.py
@@ -35,3 +35,4 @@ import vllm_ascend.patch.worker.patch_v2_eagle  # noqa
 import vllm_ascend.patch.worker.patch_v2_uva  # noqa
 import vllm_ascend.patch.worker.patch_huanyuan_vl  # noqa
 import vllm_ascend.patch.worker.patch_npugraph_ex_triton  # noqa
 import vllm_ascend.patch.worker.patch_kimi_k25  # noqa
--- a/vllm_ascend/patch/worker/patch_kimi_k25.py
+++ b/vllm_ascend/patch/worker/patch_kimi_k25.py
@@ -0,0 +1,71 @@
 #
 # Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
 # This file is a part of the vllm-ascend project.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from vllm.logger import init_logger
 from vllm.model_executor.models.kimi_k25_vit import Learnable2DInterpPosEmbDivided_fixed, get_rope_shape_decorate
 logger = init_logger(__name__)
@get_rope_shape_decorate
def get_rope_shape(org, interpolation_mode, shape):
    """Resize a positional-embedding grid to ``shape`` and flatten its first two axes.

    The function is wrapped by ``get_rope_shape_decorate`` (imported from
    vLLM); whatever that wrapper adds is not visible in this module.

    Args:
        org: 3-D tensor whose last axis is kept as-is while the first two are
            resized (assumes a ``(height, width, dim)`` layout -- TODO confirm).
        interpolation_mode: Value forwarded as ``mode`` to ``F.interpolate``.
        shape: Target size forwarded as ``size`` to ``F.interpolate``.

    Returns:
        The resized tensor with its two leading axes merged into one.
    """
    # F.interpolate resizes the trailing axes, so move the last axis to the
    # front and add a leading batch axis of size one.
    batched = org.permute((2, 0, 1)).unsqueeze(0)
    resized = F.interpolate(batched, size=shape, mode=interpolation_mode)
    # Undo the layout change: drop the batch axis, put the first axis last.
    restored = resized.squeeze(0).permute((1, 2, 0))
    return restored.flatten(end_dim=1)
 class AscendLearnable2DInterpPosEmbDivided_fixed(nn.Module):
     """Carrier class for the Ascend ``forward`` of the Kimi-K2.5 position embedding.

     The class defines no ``__init__``; its ``forward`` is assigned onto vLLM's
     ``Learnable2DInterpPosEmbDivided_fixed`` elsewhere in this module, so
     ``self`` is expected to provide ``weight``, ``time_weight``,
     ``num_frames`` and ``interpolation_mode``.
     """

     def forward(self, x: torch.Tensor, grid_thws: torch.Tensor) -> torch.Tensor:
         """Add interpolated 2-D (plus temporal) position embeddings to ``x``.

         Args:
             x: Tensor the embeddings are added to; its device and dtype are
                 used for resized embeddings (assumes one row per patch,
                 concatenated over all grids -- TODO confirm against caller).
             grid_thws: Tensor whose rows unpack as ``(t, h, w)``.

         Returns:
             ``x`` plus the concatenation of the per-grid position embeddings.

         Raises:
             AssertionError: If any ``t`` exceeds ``self.num_frames``.
         """
         # Loop invariants: read them once instead of once per grid entry.
         target_device = x.device
         target_dtype = x.dtype
         native_hw = tuple(self.weight.shape[:-1])

         # fp32 CPU copy of the weight, built lazily so no device-to-host copy
         # happens when every grid already has the native resolution.
         weight_cpu = None
         # Resized embeddings keyed by (h, w); entries with the same spatial
         # size reuse the result instead of interpolating again.
         resized_by_hw: dict = {}

         pos_embs = []
         for t, h, w in grid_thws.tolist():
             assert t <= self.num_frames, f"t:{t} > self.num_frames:{self.num_frames}"
             hw = (h, w)
             if hw == native_hw:
                 pos_emb_2d = self.weight.flatten(end_dim=1)
             else:
                 pos_emb_2d = resized_by_hw.get(hw)
                 if pos_emb_2d is None:
                     if weight_cpu is None:
                         # NOTE(review): interpolation is run on CPU in fp32,
                         # presumably to work around NPU operator support or
                         # precision -- confirm.
                         weight_cpu = self.weight.to(dtype=torch.float32).to("cpu")
                     pos_emb_2d = get_rope_shape(
                         weight_cpu,
                         interpolation_mode=self.interpolation_mode,
                         shape=hw,
                     ).to(target_device, dtype=target_dtype)
                     resized_by_hw[hw] = pos_emb_2d

             if t == 1:
                 pos_emb_3d = pos_emb_2d
             else:
                 # Repeat the spatial embedding per frame and add the first t
                 # temporal embeddings (relies on broadcasting).
                 pos_emb_3d = pos_emb_2d.unsqueeze(0).repeat(t, 1, 1) + self.time_weight[0:t]

             pos_embs.append(pos_emb_3d.reshape(-1, pos_emb_3d.shape[-1]))

         return x + torch.cat(pos_embs)
 # Monkey-patch applied when this module is imported: vLLM's
 # Learnable2DInterpPosEmbDivided_fixed now uses the forward defined on
 # AscendLearnable2DInterpPosEmbDivided_fixed above.
 Learnable2DInterpPosEmbDivided_fixed.forward = AscendLearnable2DInterpPosEmbDivided_fixed.forward
--- a/vllm_ascend/quantization/modelslim_config.py
+++ b/vllm_ascend/quantization/modelslim_config.py
@@ -54,6 +54,10 @@ QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "kimi_k25": {
        "mm_projector.linear_1": "mm_projector.proj.0",
        "mm_projector.linear_2": "mm_projector.proj.2",
    },
 }
 # key: model_type
@@ -393,6 +397,7 @@ class AscendModelSlimConfig(QuantizationConfig):
        else:
            from vllm.model_executor.layers.attention import Attention
        if model_type != "kimi_k2":
            if prefix.startswith("language_model"):
                prefix = prefix.split(".", 1)[-1]
        if isinstance(layer, LinearBase):