Support glm4.1v and glm4.5v (#8798)

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com> Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com> Co-authored-by: Xinyuan Tong <justinning0323@outlook.com> Co-authored-by: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com> Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com> Co-authored-by: zRzRzRzRzRzRzR <2448370773@qq.com> Co-authored-by: Minglei Zhu <mingleizhu1122@gmail.com> Co-authored-by: Chang Su <csu272@usc.edu>
2025-08-09 00:59:13 -07:00
parent faa25df1ae
commit f29aba8c6e
21 changed files with 1584 additions and 19 deletions
--- a/test/srt/openai_server/function_call/test_openai_function_calling.py
+++ b/test/srt/openai_server/function_call/test_openai_function_calling.py
@@ -948,5 +948,6 @@ class TestOpenAIPythonicFunctionCalling(CustomTestCase):
 #     def test_function_calling_multiturn(self):
 #         self._test_function_calling_multiturn()

+
 if __name__ == "__main__":
    unittest.main()
--- a/test/srt/test_function_call_parser.py
+++ b/test/srt/test_function_call_parser.py
@@ -497,6 +497,17 @@ class TestEBNFGeneration(unittest.TestCase):
                    },
                ),
            ),
+            Tool(
+                type="function",
+                function=Function(
+                    name="empty_param_func",
+                    description="Function with empty parameters",
+                    parameters={
+                        "properties": {},
+                        "required": [],
+                    },
+                ),
+            ),
        ]

        self.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
@@ -630,16 +641,21 @@ class TestEBNFGeneration(unittest.TestCase):
        self.assertIsNotNone(ebnf)
        # Check that the EBNF contains expected patterns for XML format
        self.assertIn('"<tool_call>" function_call "</tool_call>"', ebnf)
-        self.assertIn('"get_weather" "\\n" arguments_get_weather', ebnf)
+        self.assertIn('"get_weather" "\\n" ( arguments_get_weather "\\n" )?', ebnf)
        self.assertIn(
            '"<arg_key>location</arg_key>" "\\n" "<arg_value>" xml_text "</arg_value>" ( "\\n" ( "<arg_key>unit</arg_key>" "\\n" "<arg_value>" ("celsius" | "fahrenheit") "</arg_value>" ) )?',
            ebnf,
        )
-        self.assertIn('"search" "\\n" arguments_search', ebnf)
+        self.assertIn('"search" "\\n" ( arguments_search "\\n" )?', ebnf)
        self.assertIn(
            '"<arg_key>query</arg_key>" "\\n" "<arg_value>" xml_text "</arg_value>"',
            ebnf,
        )
+        self.assertIn(
+            '"empty_param_func" "\\n" ( arguments_empty_param_func "\\n" )?', ebnf
+        )
+        self.assertIn('arguments_empty_param_func ::= ""', ebnf)
+
        # Validate that the EBNF can be compiled by GrammarCompiler
        try:
            ctx = self.grammar_compiler.compile_grammar(ebnf)
--- a/test/srt/test_jinja_template_utils.py
+++ b/test/srt/test_jinja_template_utils.py
@@ -60,6 +60,86 @@ class TestTemplateContentFormatDetection(CustomTestCase):
        result = detect_jinja_template_content_format("")
        self.assertEqual(result, "string")

+    def test_detect_msg_content_pattern(self):
+        """A template that iterates over `msg.content` items must be detected as 'openai' format."""
+        template_source = """
+[gMASK]<sop>
+{%- for msg in messages %}
+    {%- if msg.role == 'system' %}
+<|system|>
+{{ msg.content }}
+    {%- elif msg.role == 'user' %}
+<|user|>{{ '\n' }}
+        {%- if msg.content is string %}
+{{ msg.content }}
+        {%- else %}
+            {%- for item in msg.content %}
+                {%- if item.type == 'video' or 'video' in item %}
+<|begin_of_video|><|video|><|end_of_video|>
+                {%- elif item.type == 'image' or 'image' in item %}
+<|begin_of_image|><|image|><|end_of_image|>
+                {%- elif item.type == 'text' %}
+{{ item.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+    {%- elif msg.role == 'assistant' %}
+        {%- if msg.metadata %}
+<|assistant|>{{ msg.metadata }}
+{{ msg.content }}
+        {%- else %}
+<|assistant|>
+{{ msg.content }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{% if add_generation_prompt %}<|assistant|>
+{% endif %}
+        """
+        self.assertEqual(
+            detect_jinja_template_content_format(template_source), "openai"
+        )
+
+    def test_detect_m_content_pattern(self):
+        """Test that a template using `m` as the loop variable (m.content) is detected as 'openai' format."""
+        m_content_pattern = """
+[gMASK]<sop>
+{%- for m in messages %}
+    {%- if m.role == 'system' %}
+<|system|>
+{{ m.content }}
+    {%- elif m.role == 'user' %}
+<|user|>{{ '\n' }}
+        {%- if m.content is string %}
+{{ m.content }}
+        {%- else %}
+            {%- for item in m.content %}
+                {%- if item.type == 'video' or 'video' in item %}
+<|begin_of_video|><|video|><|end_of_video|>
+                {%- elif item.type == 'image' or 'image' in item %}
+<|begin_of_image|><|image|><|end_of_image|>
+                {%- elif item.type == 'text' %}
+{{ item.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+    {%- elif m.role == 'assistant' %}
+        {%- if m.metadata %}
+<|assistant|>{{ m.metadata }}
+{{ m.content }}
+        {%- else %}
+<|assistant|>
+{{ m.content }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{% if add_generation_prompt %}<|assistant|>
+{% endif %}
+        """
+
+        result = detect_jinja_template_content_format(m_content_pattern)
+        self.assertEqual(result, "openai")
+
    def test_process_content_openai_format(self):
        """Test content processing for openai format."""
        msg_dict = {
--- a/test/srt/test_vision_openai_server_b.py
+++ b/test/srt/test_vision_openai_server_b.py
@@ -348,6 +348,33 @@ class TestVILAServer(TestOpenAIVisionServer):
        cls.base_url += "/v1"


+# Skipped in CI: this test class is left commented out.
+# class TestGLM41VServer(TestOpenAIVisionServer):
+#     @classmethod
+#     def setUpClass(cls):
+#         cls.model = "zai-org/GLM-4.1V-9B-Thinking"
+#         cls.base_url = DEFAULT_URL_FOR_TEST
+#         cls.api_key = "sk-123456"
+#         cls.process = popen_launch_server(
+#             cls.model,
+#             cls.base_url,
+#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+#             other_args=[
+#                 "--trust-remote-code",
+#                 "--mem-fraction-static",
+#                 "0.68",
+#                 "--cuda-graph-max-bs",
+#                 "4",
+#                 "--reasoning-parser",
+#                 "glm45",
+#             ],
+#         )
+#         cls.base_url += "/v1"
+
+#     def test_video_chat_completion(self):
+#         self._test_video_chat_completion()
+
+
 if __name__ == "__main__":
    del TestOpenAIVisionServer
    unittest.main()
--- a/test/srt/test_vision_openai_server_common.py
+++ b/test/srt/test_vision_openai_server_common.py
@@ -96,8 +96,13 @@ class TestOpenAIVisionServer(CustomTestCase):
        ), f"text: {text}, should contain cab, taxi, SUV, vehicle or car"
        # MiniCPMO fails to recognize `iron`, but `hanging`
        assert (
-            "iron" in text or "hang" in text or "cloth" in text or "holding" in text
-        ), f"text: {text}, should contain iron, hang, cloth or holding"
+            "iron" in text
+            or "hang" in text
+            or "cloth" in text
+            or "coat" in text
+            or "holding" in text
+            or "outfit" in text
+        ), f"text: {text}, should contain iron, hang, cloth, coat or holding or outfit"
        assert response.id
        assert response.created
        assert response.usage.prompt_tokens > 0
@@ -193,11 +198,15 @@ class TestOpenAIVisionServer(CustomTestCase):
        print(f"Multi images response:\n{text}")
        print("-" * 30)
        assert (
-            "man" in text or "cab" in text or "SUV" in text or "taxi" in text
-        ), f"text: {text}, should contain man, cab, SUV or taxi"
+            "man" in text
+            or "cab" in text
+            or "SUV" in text
+            or "taxi" in text
+            or "car" in text
+        ), f"text: {text}, should contain man, cab, SUV, taxi or car"
        assert (
-            "logo" in text or '"S"' in text or "SG" in text
-        ), f"text: {text}, should contain logo, S or SG"
+            "logo" in text or '"S"' in text or "SG" in text or "graphic" in text
+        ), f"text: {text}, should contain logo, S or SG or graphic"
        assert response.id
        assert response.created
        assert response.usage.prompt_tokens > 0
@@ -320,11 +329,12 @@ class TestOpenAIVisionServer(CustomTestCase):
            or "individual" in video_response
            or "speaker" in video_response
            or "Steve" in video_response
+            or "hand" in video_response
        ), f"""
        ====================== video_response =====================
        {video_response}
        ===========================================================
-        should contain 'man' or 'person' or 'individual' or 'speaker'
+        should contain 'man' or 'person' or 'individual' or 'speaker' or 'hand'
        """
        assert (
            "present" in video_response
@@ -375,7 +385,8 @@ class TestOpenAIVisionServer(CustomTestCase):
            or "person" in video_response
            or "individual" in video_response
            or "speaker" in video_response
-        ), f"video_response: {video_response}, should either have 'man' in video_response, or 'person' in video_response, or 'individual' in video_response or 'speaker' in video_response"
+            or "hand" in video_response
+        ), f"video_response: {video_response}, should either have 'man' in video_response, or 'person' in video_response, or 'individual' in video_response, or 'speaker' in video_response or 'hand' in video_response"
        assert (
            "present" in video_response
            or "examine" in video_response