From ed051737e9984216c93cbea8790710c3485b3cee Mon Sep 17 00:00:00 2001
From: LoganJane <42287016+LoganJane@users.noreply.github.com>
Date: Wed, 25 Feb 2026 14:51:46 +0800
Subject: [PATCH] [Bugfix] Support Kimi-K2.5 models (#6755)

### What this PR does / why we need it?
This PR adds support for the Kimi-K2.5 models on NPU with bf16 and w4a8 weights.
The corresponding PR in the vLLM community has already been merged:
https://github.com/vllm-project/vllm/pull/34501

### Does this PR introduce _any_ user-facing change?
- No.

### How was this patch tested?
We tested with the Kimi-K2.5 weights from:
https://modelscope.cn/models/Eco-Tech/Kimi-K2.5-W4A8
The w4a8 weights ran successfully on a 910B NPU with vllm-ascend.

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/9562912cead1f11e8540fb91306c5cbda66f0007

---------

Signed-off-by: LoganJane
---
 vllm_ascend/patch/worker/__init__.py         |  1 +
 vllm_ascend/patch/worker/patch_kimi_k25.py   | 71 ++++++++++++++++++++
 vllm_ascend/quantization/modelslim_config.py |  9 ++-
 3 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 vllm_ascend/patch/worker/patch_kimi_k25.py

diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index f2adab0c..e916aee8 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -35,3 +35,4 @@ import vllm_ascend.patch.worker.patch_v2_eagle  # noqa
 import vllm_ascend.patch.worker.patch_v2_uva  # noqa
 import vllm_ascend.patch.worker.patch_huanyuan_vl  # noqa
 import vllm_ascend.patch.worker.patch_npugraph_ex_triton  # noqa
+import vllm_ascend.patch.worker.patch_kimi_k25  # noqa
diff --git a/vllm_ascend/patch/worker/patch_kimi_k25.py b/vllm_ascend/patch/worker/patch_kimi_k25.py
new file mode 100644
index 00000000..05740972
--- /dev/null
+++ b/vllm_ascend/patch/worker/patch_kimi_k25.py
@@ -0,0 +1,71 @@
+#
+# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from vllm.logger import init_logger
+from vllm.model_executor.models.kimi_k25_vit import (
+    Learnable2DInterpPosEmbDivided_fixed, get_rope_shape_decorate)
+
+logger = init_logger(__name__)
+
+
+@get_rope_shape_decorate
+def get_rope_shape(org, interpolation_mode, shape):
+    # Interpolate the learned 2D position embedding to the requested grid
+    # shape and flatten it to (h * w, dim).
+    return (
+        F.interpolate(
+            org.permute((2, 0, 1)).unsqueeze(0),
+            size=shape,
+            mode=interpolation_mode,
+        )
+        .squeeze(0)
+        .permute((1, 2, 0))
+        .flatten(end_dim=1)
+    )
+
+
+class AscendLearnable2DInterpPosEmbDivided_fixed(nn.Module):
+
+    def forward(self, x: torch.Tensor, grid_thws: torch.Tensor) -> torch.Tensor:
+        pos_embs = []
+        for t, h, w in grid_thws.tolist():
+            x_device = x.device
+            x_dtype = x.dtype
+            assert t <= self.num_frames, f"t:{t} > self.num_frames:{self.num_frames}"
+            if (h, w) == self.weight.shape[:-1]:
+                pos_emb_2d = self.weight.flatten(end_dim=1)
+            else:
+                # Interpolate on CPU in float32, then move the result back to
+                # the input's device and dtype.
+                weight_fp32 = self.weight.to(dtype=torch.float32)
+                weight_cpu = weight_fp32.to("cpu")
+                pos_emb_2d = get_rope_shape(
+                    weight_cpu,
+                    interpolation_mode=self.interpolation_mode,
+                    shape=(h, w),
+                )
+                pos_emb_2d = pos_emb_2d.to(x_device, dtype=x_dtype)
+
+            if t == 1:
+                pos_emb_3d = pos_emb_2d
+            else:
+                pos_emb_3d = pos_emb_2d.unsqueeze(0).repeat(t, 1, 1) + self.time_weight[0:t]
+
+            pos_embs.append(pos_emb_3d.reshape(-1, pos_emb_3d.shape[-1]))
+
+        out = x + torch.cat(pos_embs)
+        return out
+
+
+Learnable2DInterpPosEmbDivided_fixed.forward = AscendLearnable2DInterpPosEmbDivided_fixed.forward
diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py
index 307de36c..0541bfb4 100644
--- a/vllm_ascend/quantization/modelslim_config.py
+++ b/vllm_ascend/quantization/modelslim_config.py
@@ -54,6 +54,10 @@ QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
         "language_model.lm_head.": "lm_head.",
         "language_model.model.": "model.language_model.",
     },
+    "kimi_k25": {
+        "mm_projector.linear_1": "mm_projector.proj.0",
+        "mm_projector.linear_2": "mm_projector.proj.2",
+    },
 }
 
 # key: model_type
@@ -393,8 +397,9 @@ class AscendModelSlimConfig(QuantizationConfig):
         else:
             from vllm.model_executor.layers.attention import Attention
 
-        if prefix.startswith("language_model"):
-            prefix = prefix.split(".", 1)[-1]
+        if model_type != "kimi_k2":
+            if prefix.startswith("language_model"):
+                prefix = prefix.split(".", 1)[-1]
         if isinstance(layer, LinearBase):
             if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                 # Delayed import to avoid circular import
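
The key workaround in `patch_kimi_k25.py` is that the patched `forward` moves the learned position-embedding weight to CPU and casts it to float32 before calling `F.interpolate`, then moves the interpolated result back to the activation's device and dtype. Below is a minimal standalone sketch of that resize-on-CPU pattern in plain PyTorch; the `(16, 16, 64)` embedding shape, the `bicubic` mode, and the `resize_pos_emb` helper name are illustrative assumptions, not values or APIs taken from the Kimi-K2.5 model or from vllm-ascend.

```python
import torch
import torch.nn.functional as F


def resize_pos_emb(weight: torch.Tensor, new_hw: tuple[int, int],
                   mode: str = "bicubic") -> torch.Tensor:
    """Resize a (H, W, C) position embedding to new_hw, flattened to (h*w, C)."""
    # Interpolate on CPU in float32; (H, W, C) -> (1, C, H, W) for F.interpolate.
    src = weight.to("cpu", dtype=torch.float32).permute(2, 0, 1).unsqueeze(0)
    out = F.interpolate(src, size=new_hw, mode=mode)
    # (1, C, h, w) -> (h*w, C), then restore the original device and dtype.
    return out.squeeze(0).permute(1, 2, 0).flatten(end_dim=1).to(
        weight.device, dtype=weight.dtype)


if __name__ == "__main__":
    pos_emb = torch.randn(16, 16, 64, dtype=torch.bfloat16)  # assumed (H, W, C)
    resized = resize_pos_emb(pos_emb, (24, 24))
    print(resized.shape)  # torch.Size([576, 64])
```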