model: support dots.vlm1 model (#8778)

Co-authored-by: weishi <bushou@xiaohongshu.com>
Co-authored-by: Ezra-Yu <1105212286@qq.com>
Co-authored-by: Jianfei Wang <905787410@qq.com>
Co-authored-by: qianwu <wangjianfei@xiaohongshu.com>
This commit is contained in:
chenge@xiaohongshu.com
2025-09-12 17:38:38 +08:00
committed by GitHub
parent 6d40308905
commit 1b1701f1f7
11 changed files with 806 additions and 11 deletions

View File

@@ -124,7 +124,9 @@ async def eval_mmmu(args) -> None:
answer_dict = {}
out_samples = {}
client = openai.AsyncOpenAI(
api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
api_key="sk",
base_url=f"http://127.0.0.1:{args.port}/v1",
timeout=20 * 60 * 60,
)
start = time.perf_counter()
base_url = f"http://127.0.0.1:{args.port}"
@@ -146,13 +148,14 @@ async def eval_mmmu(args) -> None:
_, response = await process_sample(
client, sample, sampling_params, lora_path
)
sample["original_response"] = response
answer = (
re.search(args.response_answer_regex, response)
if response is not None
else None
)
process_result(
answer.group(1) if answer else response,
answer.group(1).strip() if answer else response,
sample,
answer_dict,
out_samples,
@@ -168,13 +171,14 @@ async def eval_mmmu(args) -> None:
for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
sample, response = await coro
sample["original_response"] = response
answer = (
re.search(args.response_answer_regex, response)
if response is not None
else None
)
process_result(
answer.group(1) if answer else response,
answer.group(1).strip() if answer else response,
sample,
answer_dict,
out_samples,

View File

@@ -18,6 +18,7 @@ from data_utils import (
construct_prompt,
load_yaml,
process_single_sample,
save_json,
)
from datasets import concatenate_datasets, load_dataset
from tqdm import tqdm
@@ -28,7 +29,7 @@ class EvalArgs:
seed: int = 42
split: str = "validation"
image_pixels_limit: int = -1
result_filename: str = ""
result_filename: str = f"./val_sglang.json"
prompt_format_file: str = "prompt_format.yaml"
dataset_path: str = "MMMU/MMMU"
extra_request_body: Optional[str] = None
@@ -445,6 +446,18 @@ def eval_multi_choice(gold_i, pred_i):
Evaluate a multiple choice instance.
"""
correct = False
# Handle responses such as "Answer: A", "Answer is A", "answer is A", or "answer: A".
for _exp in ["Answer:", "Answer is ", "answer is ", "answer: "]:
if _exp in pred_i:
pred_i = pred_i.split(_exp)[1].strip()
break
# Handle parenthesized choices such as (A), (B), (C), (D), ...
if "(" in pred_i and ")" in pred_i:
try:
pred_i = re.search(r"\(([A-Z])\)", pred_i).group(1)
except:
print(f"Error to extract answer from: {pred_i}")
pass
# The prediction is considered correct only when the two are exactly the same.
if isinstance(gold_i, list):
for answer in gold_i:
@@ -535,7 +548,12 @@ def process_result(response, sample, answer_dict, out_samples):
else: # open question
pred_ans = response
out_samples[sample["id"]] = pred_ans
out_samples[sample["id"]] = {
"pred_ans": pred_ans,
"original_response": sample["original_response"],
"ground_truth": sample["answer"],
"question_type": sample["question_type"],
}
# set ground truth answer
answer_dict[sample["id"]] = {
@@ -554,6 +572,12 @@ def eval_result(model_answer_path, answer_dict, eval_output_path=None):
# group by category
output_dict_w_cat = {}
for data_id, parsed_pred in output_dict.items():
if isinstance(parsed_pred, str):
parsed_pred = parsed_pred
elif isinstance(parsed_pred, dict):
parsed_pred = parsed_pred["pred_ans"]
else:
raise ValueError(f"Unknown type of parsed_pred: {type(parsed_pred)}")
category = "_".join(data_id.split("_")[1:-1])
if category not in output_dict_w_cat:
output_dict_w_cat.update({category: {}})
@@ -600,9 +624,12 @@ def eval_result(model_answer_path, answer_dict, eval_output_path=None):
judge_dict, metric_dict = evaluate(exampels_to_eval)
metric_dict.update({"num_example": len(exampels_to_eval)})
for key, value in judge_dict.items():
output_dict[key]["judge"] = value
evaluation_result[category] = metric_dict
save_json(model_answer_path, output_dict)
printable_results = {}
# pdb.set_trace()
# add domain Subject