[Bugfix] Support Kimi-K2.5 models (#6755)
### What this PR does / why we need it?
This PR adds support for running the Kimi-K2.5 models on the NPU with
bf16 and w4a8 weights.
The corresponding PR in the vllm community has been merged:
https://github.com/vllm-project/vllm/pull/34501
### Does this PR introduce _any_ user-facing change?
- No.
### How was this patch tested?
We tested with the Kimi-K2.5 weights, which are available at:
https://modelscope.cn/models/Eco-Tech/Kimi-K2.5-W4A8
The model ran successfully on a 910B NPU using vllm-ascend with the w4a8 weights.
- vLLM version: v0.15.0
- vLLM main:
9562912cea
---------
Signed-off-by: LoganJane <LoganJane73@hotmail.com>
This commit is contained in:
@@ -35,3 +35,4 @@ import vllm_ascend.patch.worker.patch_v2_eagle # noqa
|
|||||||
import vllm_ascend.patch.worker.patch_v2_uva # noqa
|
import vllm_ascend.patch.worker.patch_v2_uva # noqa
|
||||||
import vllm_ascend.patch.worker.patch_huanyuan_vl # noqa
|
import vllm_ascend.patch.worker.patch_huanyuan_vl # noqa
|
||||||
import vllm_ascend.patch.worker.patch_npugraph_ex_triton # noqa
|
import vllm_ascend.patch.worker.patch_npugraph_ex_triton # noqa
|
||||||
|
import vllm_ascend.patch.worker.patch_kimi_k25 # noqa
|
||||||
|
|||||||
71
vllm_ascend/patch/worker/patch_kimi_k25.py
Normal file
71
vllm_ascend/patch/worker/patch_kimi_k25.py
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||||
|
# This file is a part of the vllm-ascend project.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from vllm.logger import init_logger
|
||||||
|
from vllm.model_executor.models.kimi_k25_vit import Learnable2DInterpPosEmbDivided_fixed, get_rope_shape_decorate
|
||||||
|
|
||||||
|
# Module-level logger created through vLLM's logging helper and named after this module.
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@get_rope_shape_decorate
def get_rope_shape(org, interpolation_mode, shape):
    """Resample a 3-D embedding grid to ``shape`` and flatten its first two axes.

    Args:
        org: 3-D tensor whose last axis is moved to the front before
            interpolation (presumably laid out as (height, width, dim) --
            confirm against the caller).
        interpolation_mode: Value forwarded as ``mode`` to ``F.interpolate``.
        shape: Target size forwarded as ``size`` to ``F.interpolate``.

    Returns:
        The resampled tensor with the last axis of ``org`` kept last and the
        two leading axes merged into one.
    """
    # F.interpolate resamples the trailing axes, so move the last axis to the
    # front and add a leading batch axis of size one.
    channels_first = org.permute((2, 0, 1)).unsqueeze(0)
    resized = F.interpolate(
        channels_first,
        size=shape,
        mode=interpolation_mode,
    )
    # Undo the layout change: drop the batch axis and put the channel axis last.
    channels_last = resized.squeeze(0).permute((1, 2, 0))
    return channels_last.flatten(end_dim=1)
|
||||||
|
|
||||||
|
|
||||||
|
class AscendLearnable2DInterpPosEmbDivided_fixed(nn.Module):
    """Holder for a ``forward`` that adds learnable position embeddings to ``x``.

    Only ``forward`` is defined here. It reads ``weight``, ``time_weight``,
    ``num_frames`` and ``interpolation_mode`` from ``self``, so those
    attributes must be provided by whatever object the method is bound to.
    """

    def forward(self, x: torch.Tensor, grid_thws: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the concatenated per-entry position embeddings.

        Args:
            x: Tensor the embeddings are added to. Its number of rows must
                equal the sum of ``t * h * w`` over all rows of ``grid_thws``
                for the final addition to succeed.
            grid_thws: Tensor whose rows unpack as ``(t, h, w)``.

        Returns:
            ``x + torch.cat(pos_embs)``, where each entry contributes a
            ``(t * h * w, dim)`` slab.

        Raises:
            AssertionError: If any ``t`` exceeds ``self.num_frames``.
        """
        # Loop invariants: read once instead of once per grid entry.
        x_device = x.device
        x_dtype = x.dtype
        native_hw = self.weight.shape[:-1]

        # The fp32 CPU copy of the weight is only needed when some entry
        # requires interpolation, so it is created lazily and at most once per
        # call instead of once per entry (each copy is a device-to-host
        # transfer).
        weight_cpu = None
        # Interpolated grids keyed by (h, w); entries with the same spatial
        # size reuse the result instead of re-running the interpolation.
        resized_by_hw = {}

        pos_embs = []
        for t, h, w in grid_thws.tolist():
            assert t <= self.num_frames, f"t:{t} > self.num_frames:{self.num_frames}"

            if (h, w) == native_hw:
                # Grid already matches the stored embedding: use it as-is.
                pos_emb_2d = self.weight.flatten(end_dim=1)
            else:
                pos_emb_2d = resized_by_hw.get((h, w))
                if pos_emb_2d is None:
                    if weight_cpu is None:
                        # NOTE(review): interpolation is run on CPU in fp32,
                        # presumably because the NPU does not support the
                        # interpolation mode -- confirm.
                        weight_cpu = self.weight.to(dtype=torch.float32).to("cpu")
                    pos_emb_2d = get_rope_shape(
                        weight_cpu,
                        interpolation_mode=self.interpolation_mode,
                        shape=(h, w),
                    ).to(x_device, dtype=x_dtype)
                    resized_by_hw[(h, w)] = pos_emb_2d

            if t == 1:
                pos_emb_3d = pos_emb_2d
            else:
                # Repeat the 2-D embedding for every frame and add the
                # per-frame temporal embedding.
                pos_emb_3d = pos_emb_2d.unsqueeze(0).repeat(t, 1, 1) + self.time_weight[0:t]

            pos_embs.append(pos_emb_3d.reshape(-1, pos_emb_3d.shape[-1]))

        out = x + torch.cat(pos_embs)
        return out
|
||||||
|
|
||||||
|
|
||||||
|
# Monkey-patch: replace the forward of the imported vLLM class with the Ascend
# implementation defined in this module. Takes effect when this module is imported.
Learnable2DInterpPosEmbDivided_fixed.forward = AscendLearnable2DInterpPosEmbDivided_fixed.forward
|
||||||
@@ -54,6 +54,10 @@ QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
|
|||||||
"language_model.lm_head.": "lm_head.",
|
"language_model.lm_head.": "lm_head.",
|
||||||
"language_model.model.": "model.language_model.",
|
"language_model.model.": "model.language_model.",
|
||||||
},
|
},
|
||||||
|
"kimi_k25": {
|
||||||
|
"mm_projector.linear_1": "mm_projector.proj.0",
|
||||||
|
"mm_projector.linear_2": "mm_projector.proj.2",
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
# key: model_type
|
# key: model_type
|
||||||
@@ -393,6 +397,7 @@ class AscendModelSlimConfig(QuantizationConfig):
|
|||||||
else:
|
else:
|
||||||
from vllm.model_executor.layers.attention import Attention
|
from vllm.model_executor.layers.attention import Attention
|
||||||
|
|
||||||
|
if model_type != "kimi_k2":
|
||||||
if prefix.startswith("language_model"):
|
if prefix.startswith("language_model"):
|
||||||
prefix = prefix.split(".", 1)[-1]
|
prefix = prefix.split(".", 1)[-1]
|
||||||
if isinstance(layer, LinearBase):
|
if isinstance(layer, LinearBase):
|
||||||
|
|||||||
Reference in New Issue
Block a user