model: support dots.vlm1 model (#8778)

Co-authored-by: weishi <bushou@xiaohongshu.com> Co-authored-by: Ezra-Yu <1105212286@qq.com> Co-authored-by: Jianfei Wang <905787410@qq.com> Co-authored-by: qianwu <wangjianfei@xiaohongshu.com>
2025-09-12 17:38:38 +08:00
parent 6d40308905
commit 1b1701f1f7
11 changed files with 806 additions and 11 deletions
--- a/benchmark/mmmu/bench_sglang.py
+++ b/benchmark/mmmu/bench_sglang.py
@@ -124,7 +124,9 @@ async def eval_mmmu(args) -> None:
    answer_dict = {}
    out_samples = {}
    client = openai.AsyncOpenAI(
-        api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
+        api_key="sk",
+        base_url=f"http://127.0.0.1:{args.port}/v1",
+        timeout=20 * 60 * 60,
    )
    start = time.perf_counter()
    base_url = f"http://127.0.0.1:{args.port}"
@@ -146,13 +148,14 @@ async def eval_mmmu(args) -> None:
            _, response = await process_sample(
                client, sample, sampling_params, lora_path
            )
+            sample["original_response"] = response
            answer = (
                re.search(args.response_answer_regex, response)
                if response is not None
                else None
            )
            process_result(
-                answer.group(1) if answer else response,
+                answer.group(1).strip() if answer else response,
                sample,
                answer_dict,
                out_samples,
@@ -168,13 +171,14 @@ async def eval_mmmu(args) -> None:

        for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
            sample, response = await coro
+            sample["original_response"] = response
            answer = (
                re.search(args.response_answer_regex, response)
                if response is not None
                else None
            )
            process_result(
-                answer.group(1) if answer else response,
+                answer.group(1).strip() if answer else response,
                sample,
                answer_dict,
                out_samples,
--- a/benchmark/mmmu/eval_utils.py
+++ b/benchmark/mmmu/eval_utils.py
@@ -18,6 +18,7 @@ from data_utils import (
    construct_prompt,
    load_yaml,
    process_single_sample,
+    save_json,
 )
 from datasets import concatenate_datasets, load_dataset
 from tqdm import tqdm
@@ -28,7 +29,7 @@ class EvalArgs:
    seed: int = 42
    split: str = "validation"
    image_pixels_limit: int = -1
-    result_filename: str = ""
+    result_filename: str = f"./val_sglang.json"
    prompt_format_file: str = "prompt_format.yaml"
    dataset_path: str = "MMMU/MMMU"
    extra_request_body: Optional[str] = None
@@ -445,6 +446,18 @@ def eval_multi_choice(gold_i, pred_i):
    Evaluate a multiple choice instance.
    """
    correct = False
+    # Handle predictions such as "Answer: A", "Answer is A", "answer is A", "answer: A"
+    for _exp in ["Answer:", "Answer is ", "answer is ", "answer: "]:
+        if _exp in pred_i:
+            pred_i = pred_i.split(_exp)[1].strip()
+            break
+    # Handle parenthesized choices such as (A), (B), (C), (D), ...
+    if "(" in pred_i and ")" in pred_i:
+        try:
+            pred_i = re.search(r"\(([A-Z])\)", pred_i).group(1)
+        except:
+            print(f"Error to extract answer from: {pred_i}")
+            pass
    # only they are exactly the same, we consider it as correct
    if isinstance(gold_i, list):
        for answer in gold_i:
@@ -535,7 +548,12 @@ def process_result(response, sample, answer_dict, out_samples):
    else:  # open question
        pred_ans = response

-    out_samples[sample["id"]] = pred_ans
+    out_samples[sample["id"]] = {
+        "pred_ans": pred_ans,
+        "original_response": sample["original_response"],
+        "ground_truth": sample["answer"],
+        "question_type": sample["question_type"],
+    }

    # set ground truth answer
    answer_dict[sample["id"]] = {
@@ -554,6 +572,12 @@ def eval_result(model_answer_path, answer_dict, eval_output_path=None):
    # group by category
    output_dict_w_cat = {}
    for data_id, parsed_pred in output_dict.items():
+        if isinstance(parsed_pred, str):
+            parsed_pred = parsed_pred
+        elif isinstance(parsed_pred, dict):
+            parsed_pred = parsed_pred["pred_ans"]
+        else:
+            raise ValueError(f"Unknown type of parsed_pred: {type(parsed_pred)}")
        category = "_".join(data_id.split("_")[1:-1])
        if category not in output_dict_w_cat:
            output_dict_w_cat.update({category: {}})
@@ -600,9 +624,12 @@ def eval_result(model_answer_path, answer_dict, eval_output_path=None):

        judge_dict, metric_dict = evaluate(exampels_to_eval)
        metric_dict.update({"num_example": len(exampels_to_eval)})
+        for key, value in judge_dict.items():
+            output_dict[key]["judge"] = value

        evaluation_result[category] = metric_dict

+    save_json(model_answer_path, output_dict)
    printable_results = {}
    # pdb.set_trace()
    # add domain Subject