Add GLM-4 TextGeneration Model support for SGLang (#1736)

2024-10-21 12:08:30 +08:00
parent b121bc03a3
commit 45d5af2416
3 changed files with 5 additions and 3 deletions
--- a/README.md
+++ b/README.md
@@ -303,6 +303,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
 - SmolLM
+- GLM-4
 **Embedding Models**
--- a/python/sglang/srt/models/chatglm.py
+++ b/python/sglang/srt/models/chatglm.py
@@ -303,7 +303,7 @@ class GLMTransformer(nn.Module):
        return hidden_states
-class ChatGLMModel(nn.Module):
+class ChatGLMM(nn.Module):
    def __init__(
        self,
        config,
@@ -366,7 +366,7 @@ class ChatGLMForCausalLM(nn.Module):
        self.config: ChatGLMConfig = config
        self.quant_config = quant_config
        self.max_position_embeddings = getattr(config, "max_sequence_length", 8192)
-        self.transformer = ChatGLMModel(config, cache_config, quant_config)
+        self.transformer = ChatGLMM(config, cache_config, quant_config)
        self.lm_head = self.transformer.output_layer
        self.logits_processor = LogitsProcessor(config)
@@ -401,4 +401,4 @@ class ChatGLMModel(ChatGLMForCausalLM):
    pass
-EntryClass = [ChatGLMForCausalLM, ChatGLMModel]
+EntryClass = [ChatGLMModel]
--- a/test/srt/models/test_generation_models.py
+++ b/test/srt/models/test_generation_models.py
@@ -57,6 +57,7 @@ ALL_OTHER_MODELS = [
    ModelCase("Qwen/Qwen2.5-14B-Instruct"),
    ModelCase("HuggingFaceTB/SmolLM-135M-Instruct", skip_long_prompt=True),
    ModelCase("allenai/OLMo-1B-0724-hf", decode_tolerance=8e-2, skip_long_prompt=True),
+    ModelCase("THUDM/glm-4-9b-chat"),
 ]
 TORCH_DTYPES = [torch.float16]