From ed051737e9984216c93cbea8790710c3485b3cee Mon Sep 17 00:00:00 2001
From: LoganJane <42287016+LoganJane@users.noreply.github.com>
Date: Wed, 25 Feb 2026 14:51:46 +0800
Subject: [PATCH] [Bugfix] Support Kimi-K2.5 models (#6755)

### What this PR does / why we need it?
This PR adds support for the Kimi-K2.5 models on NPU with bf16 and w4a8 weights.
The corresponding PR in the vLLM community has already been merged:
https://github.com/vllm-project/vllm/pull/34501

### Does this PR introduce _any_ user-facing change?
- No.

### How was this patch tested?
We tested with the Kimi-K2.5 weights from:
https://modelscope.cn/models/Eco-Tech/Kimi-K2.5-W4A8
The w4a8 weights ran successfully on a 910B NPU with vllm-ascend.

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/9562912cead1f11e8540fb91306c5cbda66f0007

---------

Signed-off-by: LoganJane
---
 vllm_ascend/patch/worker/__init__.py         |  1 +
 vllm_ascend/patch/worker/patch_kimi_k25.py   | 71 ++++++++++++++++++++
 vllm_ascend/quantization/modelslim_config.py |  9 ++-
 3 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 vllm_ascend/patch/worker/patch_kimi_k25.py

diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index f2adab0c..e916aee8 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -35,3 +35,4 @@ import vllm_ascend.patch.worker.patch_v2_eagle  # noqa
 import vllm_ascend.patch.worker.patch_v2_uva  # noqa
 import vllm_ascend.patch.worker.patch_huanyuan_vl  # noqa
 import vllm_ascend.patch.worker.patch_npugraph_ex_triton  # noqa
+import vllm_ascend.patch.worker.patch_kimi_k25  # noqa
diff --git a/vllm_ascend/patch/worker/patch_kimi_k25.py b/vllm_ascend/patch/worker/patch_kimi_k25.py
new file mode 100644
index 00000000..05740972
--- /dev/null
+++ b/vllm_ascend/patch/worker/patch_kimi_k25.py
@@ -0,0 +1,71 @@
+#
+# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from vllm.logger import init_logger
+from vllm.model_executor.models.kimi_k25_vit import (
+    Learnable2DInterpPosEmbDivided_fixed, get_rope_shape_decorate)
+
+logger = init_logger(__name__)
+
+
+@get_rope_shape_decorate
+def get_rope_shape(org, interpolation_mode, shape):
+    # Interpolate the learned 2D position embedding to the requested grid
+    # shape and flatten it to (h * w, dim).
+    return (
+        F.interpolate(
+            org.permute((2, 0, 1)).unsqueeze(0),
+            size=shape,
+            mode=interpolation_mode,
+        )
+        .squeeze(0)
+        .permute((1, 2, 0))
+        .flatten(end_dim=1)
+    )
+
+
+class AscendLearnable2DInterpPosEmbDivided_fixed(nn.Module):
+
+    def forward(self, x: torch.Tensor, grid_thws: torch.Tensor) -> torch.Tensor:
+        pos_embs = []
+        for t, h, w in grid_thws.tolist():
+            x_device = x.device
+            x_dtype = x.dtype
+            assert t <= self.num_frames, f"t:{t} > self.num_frames:{self.num_frames}"
+            if (h, w) == self.weight.shape[:-1]:
+                pos_emb_2d = self.weight.flatten(end_dim=1)
+            else:
+                # Interpolate on CPU in float32, then move the result back to
+                # the input's device and dtype.
+                weight_fp32 = self.weight.to(dtype=torch.float32)
+                weight_cpu = weight_fp32.to("cpu")
+                pos_emb_2d = get_rope_shape(
+                    weight_cpu,
+                    interpolation_mode=self.interpolation_mode,
+                    shape=(h, w),
+                )
+                pos_emb_2d = pos_emb_2d.to(x_device, dtype=x_dtype)
+
+            if t == 1:
+                pos_emb_3d = pos_emb_2d
+            else:
+                pos_emb_3d = pos_emb_2d.unsqueeze(0).repeat(t, 1, 1) + self.time_weight[0:t]
+
+            pos_embs.append(pos_emb_3d.reshape(-1, pos_emb_3d.shape[-1]))
+
+        out = x + torch.cat(pos_embs)
+        return out
+
+
+Learnable2DInterpPosEmbDivided_fixed.forward = AscendLearnable2DInterpPosEmbDivided_fixed.forward
diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py
index 307de36c..0541bfb4 100644
--- a/vllm_ascend/quantization/modelslim_config.py
+++ b/vllm_ascend/quantization/modelslim_config.py
@@ -54,6 +54,10 @@ QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
         "language_model.lm_head.": "lm_head.",
         "language_model.model.": "model.language_model.",
     },
+    "kimi_k25": {
+        "mm_projector.linear_1": "mm_projector.proj.0",
+        "mm_projector.linear_2": "mm_projector.proj.2",
+    },
 }
 
 # key: model_type
@@ -393,8 +397,9 @@ class AscendModelSlimConfig(QuantizationConfig):
         else:
             from vllm.model_executor.layers.attention import Attention
 
-        if prefix.startswith("language_model"):
-            prefix = prefix.split(".", 1)[-1]
+        if model_type != "kimi_k2":
+            if prefix.startswith("language_model"):
+                prefix = prefix.split(".", 1)[-1]
         if isinstance(layer, LinearBase):
             if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                 # Delayed import to avoid circular import
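
The key workaround in `patch_kimi_k25.py` is that the patched `forward` moves the learned position-embedding weight to CPU and casts it to float32 before calling `F.interpolate`, then moves the interpolated result back to the activation's device and dtype. Below is a minimal standalone sketch of that resize-on-CPU pattern in plain PyTorch; the `(16, 16, 64)` embedding shape, the `bicubic` mode, and the `resize_pos_emb` helper name are illustrative assumptions, not values or APIs taken from the Kimi-K2.5 model or from vllm-ascend.

```python
import torch
import torch.nn.functional as F


def resize_pos_emb(weight: torch.Tensor, new_hw: tuple[int, int],
                   mode: str = "bicubic") -> torch.Tensor:
    """Resize a (H, W, C) position embedding to new_hw, flattened to (h*w, C)."""
    # Interpolate on CPU in float32; (H, W, C) -> (1, C, H, W) for F.interpolate.
    src = weight.to("cpu", dtype=torch.float32).permute(2, 0, 1).unsqueeze(0)
    out = F.interpolate(src, size=new_hw, mode=mode)
    # (1, C, h, w) -> (h*w, C), then restore the original device and dtype.
    return out.squeeze(0).permute(1, 2, 0).flatten(end_dim=1).to(
        weight.device, dtype=weight.dtype)


if __name__ == "__main__":
    pos_emb = torch.randn(16, 16, 64, dtype=torch.bfloat16)  # assumed (H, W, C)
    resized = resize_pos_emb(pos_emb, (24, 24))
    print(resized.shape)  # torch.Size([576, 64])
```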