Support glm4.1v and glm4.5v (#8798)

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com> Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com> Co-authored-by: Xinyuan Tong <justinning0323@outlook.com> Co-authored-by: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com> Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com> Co-authored-by: zRzRzRzRzRzRzR <2448370773@qq.com> Co-authored-by: Minglei Zhu <mingleizhu1122@gmail.com> Co-authored-by: Chang Su <csu272@usc.edu>
2025-08-09 00:59:13 -07:00
parent faa25df1ae
commit f29aba8c6e
21 changed files with 1584 additions and 19 deletions
--- a/benchmark/mmmu/README.md
+++ b/benchmark/mmmu/README.md
@@ -27,6 +27,18 @@ python -m sglang.launch_server --model-path microsoft/Phi-4-multimodal-instruct
 python -m benchmark/mmmu/bench_sglang.py --concurrency 8 --lora-path vision
 ```

+Use `--response-answer-regex` to specify a regular expression whose first capture group extracts the answer from the response string; if the pattern does not match, the full response is used. For example:
+```
+python3 -m sglang.launch_server --model-path zai-org/GLM-4.1V-9B-Thinking --reasoning-parser glm45
+
+python3 bench_sglang.py --response-answer-regex "<\|begin_of_box\|>(.*)<\|end_of_box\|>" --concurrency 64
+```
+
+Use `--extra-request-body` to pass additional OpenAI request parameters as a JSON string. For example:
+```
+python3 bench_sglang.py --extra-request-body '{"max_new_tokens": 128, "temperature": 0.01}'
+```
+
 ### Evaluate hf

 ```
--- a/benchmark/mmmu/bench_sglang.py
+++ b/benchmark/mmmu/bench_sglang.py
@@ -11,6 +11,7 @@ The eval output will be logged

 import argparse
 import asyncio
+import re
 import sys
 import time
 import traceback
@@ -145,7 +146,17 @@ async def eval_mmmu(args) -> None:
            _, response = await process_sample(
                client, sample, sampling_params, lora_path
            )
-            process_result(response, sample, answer_dict, out_samples)
+            answer = (
+                re.search(args.response_answer_regex, response)
+                if response is not None
+                else None
+            )
+            process_result(
+                answer.group(1) if answer else response,
+                sample,
+                answer_dict,
+                out_samples,
+            )
    else:
        semaphore = asyncio.Semaphore(args.concurrency)
        tasks = [
@@ -157,7 +168,17 @@ async def eval_mmmu(args) -> None:

        for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
            sample, response = await coro
-            process_result(response, sample, answer_dict, out_samples)
+            answer = (
+                re.search(args.response_answer_regex, response)
+                if response is not None
+                else None
+            )
+            process_result(
+                answer.group(1) if answer else response,
+                sample,
+                answer_dict,
+                out_samples,
+            )

    if args.profile:
        print("Stopping profiler...")
--- a/benchmark/mmmu/eval_utils.py
+++ b/benchmark/mmmu/eval_utils.py
@@ -35,6 +35,7 @@ class EvalArgs:
    profile: bool = False
    profile_number: int = 5
    concurrency: int = 1
+    response_answer_regex: str = "(.*)"
    lora_path: Optional[str] = None

    @staticmethod
@@ -92,6 +93,12 @@ class EvalArgs:
            default=EvalArgs.concurrency,
            help="Number of concurrent requests to make during evaluation. Default is 1, which means no concurrency.",
        )
+        parser.add_argument(
+            "--response-answer-regex",
+            type=str,
+            default=EvalArgs.response_answer_regex,
+            help="Specific regex to capture the answer from the response, string",
+        )
        parser.add_argument(
            "--lora-path",
            type=str,