first commit

2026-03-10 13:31:25 +08:00
parent ba974cecfa
commit b62b889355
2604 changed files with 438977 additions and 0 deletions
--- a/vllm_br/model_executor/models/chatglm.py
+++ b/vllm_br/model_executor/models/chatglm.py
@@ -0,0 +1,195 @@
+################################################################################
+# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+
+# SPDX-License-Identifier: Apache-2.0
+
+# Adapted from
+# https://github.com/THUDM/ChatGLM2-6B
+"""Inference-only ChatGLM model compatible with THUDM weights."""
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+import vllm
+from vllm.attention import Attention
+from vllm.config import CacheConfig
+from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.models.chatglm import GLMMLP
+from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs import ChatGLMConfig
+
+
+def model_forward(
+    self,
+    input_ids: torch.Tensor,
+    positions: torch.Tensor,
+    intermediate_tensors: Optional[IntermediateTensors] = None,
+    inputs_embeds: Optional[torch.Tensor] = None,
+    **kwargs: object,
+) -> Union[torch.Tensor, IntermediateTensors]:
+    if get_pp_group().is_first_rank:
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.get_input_embeddings(input_ids)
+    else:
+        assert intermediate_tensors is not None
+        hidden_states = intermediate_tensors["hidden_states"]
+    # unsqueeze for RMSNorm op
+    hidden_states = hidden_states.unsqueeze(0)
+    # Run encoder.
+    hidden_states = self.encoder(
+        hidden_states=hidden_states,
+        position_ids=positions,
+    )
+    # suqeeze to 2-d shape
+    return hidden_states.squeeze(0)
+
+
+class GLMAttention_fit(nn.Module):
+
+    def __init__(
+        self,
+        config: ChatGLMConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.multi_query_attention = config.multi_query_attention
+        self.total_num_kv_heads = (config.multi_query_group_num
+                                   if config.multi_query_attention else
+                                   config.num_attention_heads)
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = config.hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.add_bias_linear or config.add_qkv_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.query_key_value",
+        )
+        self.dense = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=config.add_bias_linear,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
+
+        # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141
+        rope_ratio = getattr(config, "rope_ratio", 1.0)
+        max_positions = getattr(config, "seq_length", 8192)
+        # NOTE: THUDM/cogagent-9b-20241220 uses original_rope=False,
+        # which is equivalent to is_neox_style=True
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim // 2,
+            max_position=max_positions,
+            base=10000 * rope_ratio,
+            is_neox_style=False,
+            op_type="Chatglm2",
+        )
+        self.attn = Attention(self.num_heads,
+                              self.head_dim,
+                              self.scaling,
+                              num_kv_heads=self.num_kv_heads,
+                              cache_config=cache_config,
+                              quant_config=quant_config,
+                              prefix=f"{prefix}.attn")
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.query_key_value(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(position_ids, q, k)
+        context_layer = self.attn(q, k, v)
+        attn_output, _ = self.dense(context_layer)
+        return attn_output
+
+
+def GLMMLP__init__(
+    self,
+    config: ChatGLMConfig,
+    quant_config: Optional[QuantizationConfig] = None,
+    prefix: str = "",
+):
+    super(GLMMLP, self).__init__()
+
+    self.add_bias = config.add_bias_linear
+
+    # Project to 4h.
+    self.dense_h_to_4h = MergedColumnParallelLinear(
+        config.hidden_size,
+        [config.ffn_hidden_size] * 2,
+        bias=config.add_bias_linear,
+        quant_config=quant_config,
+        prefix=f"{prefix}.dense_h_to_4h",
+    )
+    self.dense_h_to_4h.no_fuse_act = True
+    self.activation_func = SiluAndMul()
+
+    # Project back to h.
+    self.dense_4h_to_h = RowParallelLinear(
+        config.ffn_hidden_size,
+        config.hidden_size,
+        bias=config.add_bias_linear,
+        quant_config=quant_config,
+        prefix=f"{prefix}.dense_4h_to_h",
+    )
+
+
+def GLMMLP__forward(self, hidden_states):
+    # [s, b, 4hp]
+    intermediate_parallel, _ = self.dense_h_to_4h(hidden_states)
+    # [s, b, h]
+    output, _ = self.dense_4h_to_h(intermediate_parallel)
+    return output
+
+
+vllm.model_executor.models.chatglm.GLMMLP.forward = GLMMLP__forward
+vllm.model_executor.models.chatglm.GLMMLP.__init__ = GLMMLP__init__
+vllm.model_executor.models.chatglm.ChatGLMModel.forward = model_forward
+vllm.model_executor.models.chatglm.GLMAttention = GLMAttention_fit