model: support dots.vlm1 model (#8778)
Co-authored-by: weishi <bushou@xiaohongshu.com> Co-authored-by: Ezra-Yu <1105212286@qq.com> Co-authored-by: Jianfei Wang <905787410@qq.com> Co-authored-by: qianwu <wangjianfei@xiaohongshu.com>
This commit is contained in:
committed by
GitHub
parent
6d40308905
commit
1b1701f1f7
@@ -124,7 +124,9 @@ async def eval_mmmu(args) -> None:
|
||||
answer_dict = {}
|
||||
out_samples = {}
|
||||
client = openai.AsyncOpenAI(
|
||||
api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
|
||||
api_key="sk",
|
||||
base_url=f"http://127.0.0.1:{args.port}/v1",
|
||||
timeout=20 * 60 * 60,
|
||||
)
|
||||
start = time.perf_counter()
|
||||
base_url = f"http://127.0.0.1:{args.port}"
|
||||
@@ -146,13 +148,14 @@ async def eval_mmmu(args) -> None:
|
||||
_, response = await process_sample(
|
||||
client, sample, sampling_params, lora_path
|
||||
)
|
||||
sample["original_response"] = response
|
||||
answer = (
|
||||
re.search(args.response_answer_regex, response)
|
||||
if response is not None
|
||||
else None
|
||||
)
|
||||
process_result(
|
||||
answer.group(1) if answer else response,
|
||||
answer.group(1).strip() if answer else response,
|
||||
sample,
|
||||
answer_dict,
|
||||
out_samples,
|
||||
@@ -168,13 +171,14 @@ async def eval_mmmu(args) -> None:
|
||||
|
||||
for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
|
||||
sample, response = await coro
|
||||
sample["original_response"] = response
|
||||
answer = (
|
||||
re.search(args.response_answer_regex, response)
|
||||
if response is not None
|
||||
else None
|
||||
)
|
||||
process_result(
|
||||
answer.group(1) if answer else response,
|
||||
answer.group(1).strip() if answer else response,
|
||||
sample,
|
||||
answer_dict,
|
||||
out_samples,
|
||||
|
||||
@@ -18,6 +18,7 @@ from data_utils import (
|
||||
construct_prompt,
|
||||
load_yaml,
|
||||
process_single_sample,
|
||||
save_json,
|
||||
)
|
||||
from datasets import concatenate_datasets, load_dataset
|
||||
from tqdm import tqdm
|
||||
@@ -28,7 +29,7 @@ class EvalArgs:
|
||||
seed: int = 42
|
||||
split: str = "validation"
|
||||
image_pixels_limit: int = -1
|
||||
result_filename: str = ""
|
||||
result_filename: str = f"./val_sglang.json"
|
||||
prompt_format_file: str = "prompt_format.yaml"
|
||||
dataset_path: str = "MMMU/MMMU"
|
||||
extra_request_body: Optional[str] = None
|
||||
@@ -445,6 +446,18 @@ def eval_multi_choice(gold_i, pred_i):
|
||||
Evaluate a multiple choice instance.
|
||||
"""
|
||||
correct = False
|
||||
# for case like Answer: A, Answer is A, answer is A, answer: A
|
||||
for _exp in ["Answer:", "Answer is ", "answer is ", "answer: "]:
|
||||
if _exp in pred_i:
|
||||
pred_i = pred_i.split(_exp)[1].strip()
|
||||
break
|
||||
# for case like (A), (B), (C), (D) ......
|
||||
if "(" in pred_i and ")" in pred_i:
|
||||
try:
|
||||
pred_i = re.search(r"\(([A-Z])\)", pred_i).group(1)
|
||||
except:
|
||||
print(f"Error to extract answer from: {pred_i}")
|
||||
pass
|
||||
# only they are exactly the same, we consider it as correct
|
||||
if isinstance(gold_i, list):
|
||||
for answer in gold_i:
|
||||
@@ -535,7 +548,12 @@ def process_result(response, sample, answer_dict, out_samples):
|
||||
else: # open question
|
||||
pred_ans = response
|
||||
|
||||
out_samples[sample["id"]] = pred_ans
|
||||
out_samples[sample["id"]] = {
|
||||
"pred_ans": pred_ans,
|
||||
"original_response": sample["original_response"],
|
||||
"ground_truth": sample["answer"],
|
||||
"question_type": sample["question_type"],
|
||||
}
|
||||
|
||||
# set ground truth answer
|
||||
answer_dict[sample["id"]] = {
|
||||
@@ -554,6 +572,12 @@ def eval_result(model_answer_path, answer_dict, eval_output_path=None):
|
||||
# group by category
|
||||
output_dict_w_cat = {}
|
||||
for data_id, parsed_pred in output_dict.items():
|
||||
if isinstance(parsed_pred, str):
|
||||
parsed_pred = parsed_pred
|
||||
elif isinstance(parsed_pred, dict):
|
||||
parsed_pred = parsed_pred["pred_ans"]
|
||||
else:
|
||||
raise ValueError(f"Unknown type of parsed_pred: {type(parsed_pred)}")
|
||||
category = "_".join(data_id.split("_")[1:-1])
|
||||
if category not in output_dict_w_cat:
|
||||
output_dict_w_cat.update({category: {}})
|
||||
@@ -600,9 +624,12 @@ def eval_result(model_answer_path, answer_dict, eval_output_path=None):
|
||||
|
||||
judge_dict, metric_dict = evaluate(exampels_to_eval)
|
||||
metric_dict.update({"num_example": len(exampels_to_eval)})
|
||||
for key, value in judge_dict.items():
|
||||
output_dict[key]["judge"] = value
|
||||
|
||||
evaluation_result[category] = metric_dict
|
||||
|
||||
save_json(model_answer_path, output_dict)
|
||||
printable_results = {}
|
||||
# pdb.set_trace()
|
||||
# add domain Subject
|
||||
|
||||
Reference in New Issue
Block a user