Support glm4.1v and glm4.5v (#8798)

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: Xinyuan Tong <justinning0323@outlook.com>
Co-authored-by: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com>
Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Co-authored-by: Minglei Zhu <mingleizhu1122@gmail.com>
Co-authored-by: Chang Su <csu272@usc.edu>
This commit is contained in:
Binyao Jiang
2025-08-09 00:59:13 -07:00
committed by GitHub
parent faa25df1ae
commit f29aba8c6e
21 changed files with 1584 additions and 19 deletions

View File

@@ -27,6 +27,18 @@ python -m sglang.launch_server --model-path microsoft/Phi-4-multimodal-instruct
python benchmark/mmmu/bench_sglang.py --concurrency 8 --lora-path vision
```
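You can use `--response-answer-regex` to specify how to extract the answer from the response string. The regex is searched for in the response and its first capture group is taken as the answer; if the regex does not match, the whole response is used. For example: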
```
python3 -m sglang.launch_server --model-path zai-org/GLM-4.1V-9B-Thinking --reasoning-parser glm45
python3 bench_sglang.py --response-answer-regex "<\|begin_of_box\|>(.*)<\|end_of_box\|>" --concurrency 64
```
You can use `--extra-request-body` to specify additional OpenAI request parameters. E.g.,
```
python3 bench_sglang.py --extra-request-body '{"max_new_tokens": 128, "temperature": 0.01}'
```
### Evaluate hf
```

View File

@@ -11,6 +11,7 @@ The eval output will be logged
import argparse
import asyncio
import re
import sys
import time
import traceback
@@ -145,7 +146,17 @@ async def eval_mmmu(args) -> None:
_, response = await process_sample(
client, sample, sampling_params, lora_path
)
process_result(response, sample, answer_dict, out_samples)
answer = (
re.search(args.response_answer_regex, response)
if response is not None
else None
)
process_result(
answer.group(1) if answer else response,
sample,
answer_dict,
out_samples,
)
else:
semaphore = asyncio.Semaphore(args.concurrency)
tasks = [
@@ -157,7 +168,17 @@ async def eval_mmmu(args) -> None:
for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
sample, response = await coro
process_result(response, sample, answer_dict, out_samples)
answer = (
re.search(args.response_answer_regex, response)
if response is not None
else None
)
process_result(
answer.group(1) if answer else response,
sample,
answer_dict,
out_samples,
)
if args.profile:
print("Stopping profiler...")

View File

@@ -35,6 +35,7 @@ class EvalArgs:
profile: bool = False
profile_number: int = 5
concurrency: int = 1
response_answer_regex: str = "(.*)"
lora_path: Optional[str] = None
@staticmethod
@@ -92,6 +93,12 @@ class EvalArgs:
default=EvalArgs.concurrency,
help="Number of concurrent requests to make during evaluation. Default is 1, which means no concurrency.",
)
parser.add_argument(
"--response-answer-regex",
type=str,
default=EvalArgs.response_answer_regex,
help="Specific regex to capture the answer from the response, string",
)
parser.add_argument(
"--lora-path",
type=str,