Add CI for gpt-oss model on hopper (#8851)

This commit is contained in:
fzyzcjy
2025-08-09 15:34:23 +08:00
committed by GitHub
parent de8b8b6e5c
commit 442534aa44
7 changed files with 187 additions and 2 deletions

View File

@@ -65,9 +65,10 @@ def run_eval(args):
sampler = ChatCompletionSampler(
model=args.model,
max_tokens=2048,
max_tokens=getattr(args, "max_tokens", 2048),
base_url=base_url,
temperature=getattr(args, "temperature", 0.0),
reasoning_effort=getattr(args, "reasoning_effort", None),
)
# Run eval
@@ -120,7 +121,9 @@ if __name__ == "__main__":
parser.add_argument("--eval-name", type=str, default="mmlu")
parser.add_argument("--num-examples", type=int)
parser.add_argument("--num-threads", type=int, default=512)
parser.add_argument("--max-tokens", type=int, default=2048)
parser.add_argument("--temperature", type=float, default=0.0)
parser.add_argument("--reasoning-effort", type=str)
args = parser.parse_args()
run_eval(args)

View File

@@ -91,6 +91,7 @@ class ChatCompletionSampler(SamplerBase):
model: Optional[str] = None,
system_message: Optional[str] = None,
temperature: float = 0.0,
reasoning_effort: Optional[str] = None,
max_tokens: int = 2048,
):
self.client = OpenAI(base_url=base_url, http_client=LargerHttpxClient())
@@ -102,7 +103,11 @@ class ChatCompletionSampler(SamplerBase):
self.system_message = system_message
self.temperature = temperature
self.max_tokens = max_tokens
self.reasoning_effort = reasoning_effort
self.image_format = "url"
print(
f"ChatCompletionSampler initialized with {self.system_message=} {self.temperature=} {self.max_tokens=} {self.reasoning_effort=}"
)
def _handle_image(
self,
@@ -138,6 +143,7 @@ class ChatCompletionSampler(SamplerBase):
messages=message_list,
temperature=self.temperature,
max_tokens=self.max_tokens,
reasoning_effort=self.reasoning_effort,
)
return response.choices[0].message.content
# NOTE: BadRequestError is triggered once for MMMU; please uncomment if you are rerunning MMMU.

View File

@@ -71,6 +71,8 @@ class GPQAEval(Eval):
)
]
response_text = sampler(prompt_messages)
if response_text is None:
response_text = ""
match = re.search(ANSWER_PATTERN_MULTICHOICE, response_text)
extracted_answer = match.group(1) if match else None
score = 1.0 if extracted_answer == correct_answer else 0.0