sglang/test/srt/test_gpt_oss_common.py

import os
from concurrent.futures import ThreadPoolExecutor
from types import SimpleNamespace
from typing import Dict, List, Literal, Optional

from sglang.srt.utils import is_hip, kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    is_in_ci,
    popen_launch_server,
    write_github_step_summary,
)

_base_url = DEFAULT_URL_FOR_TEST
_is_hip = is_hip()


class BaseTestGptOss(CustomTestCase):
    def run_test(
        self,
        model_variant: Literal["20b", "120b"],
        quantization: Literal["mxfp4", "bf16"],
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: Optional[List[str]] = None,
    ):
        if other_args is None:
            other_args = []

        # Resolve the checkpoint from the (variant, quantization) pair.
        model = {
            ("20b", "bf16"): "lmsys/gpt-oss-20b-bf16",
            ("120b", "bf16"): "lmsys/gpt-oss-120b-bf16",
            ("20b", "mxfp4"): "openai/gpt-oss-20b",
            ("120b", "mxfp4"): "openai/gpt-oss-120b",
        }[(model_variant, quantization)]

        if model_variant == "20b":
            other_args += ["--cuda-graph-max-bs", "600"]

        if _is_hip:
            os.environ["SGLANG_USE_AITER"] = "0"

        self._run_test_raw(
            model=model,
            expected_score_of_reasoning_effort=expected_score_of_reasoning_effort,
            other_args=other_args,
        )

    def _run_test_raw(
        self,
        model: str,
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: List[str],
    ):
        process = popen_launch_server(
            model,
            _base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )
        try:
            # Run the evals in parallel: wall time is dominated by the longest
            # generated sequence, not by the number of questions.
            with ThreadPoolExecutor(max_workers=4) as executor:
                list(
                    executor.map(
                        lambda d: self._run_one_eval(**d),
                        [
                            dict(
                                model=model,
                                reasoning_effort=reasoning_effort,
                                expected_score=expected_score,
                            )
                            for reasoning_effort, expected_score in expected_score_of_reasoning_effort.items()
                        ],
                    )
                )
        finally:
            kill_process_tree(process.pid)

    def _run_one_eval(self, model, reasoning_effort, expected_score):
        args = SimpleNamespace(
            base_url=_base_url,
            model=model,
            eval_name="gpqa",
            num_examples=198,
            # Use enough threads so all examples run in parallel.
            num_threads=198,
            # TODO: 4k tokens is still not enough; we would need e.g. 64k, but that
            # is very slow. With a budget this small, many questions go unanswered.
            max_tokens=4096,
            # simple-evals defaults to temperature 0.5, which scores better than 0.0,
            # but we use 0.1 here for reproducibility.
            temperature=0.1,
            reasoning_effort=reasoning_effort,
        )

        setup = f"model={model} reasoning_effort={reasoning_effort} expected_score={expected_score}"
        print(f"Evaluation start: {setup}")
        metrics = run_eval(args)
        print(f"Evaluation end: {setup} {metrics=}")
        self.assertGreaterEqual(metrics["score"], expected_score)

        if is_in_ci():
            write_github_step_summary(
                f"### test_gpt_oss_common\n"
                f"Setup: {setup}\n"
                f"Score: {metrics['score']:.2f}\n"
            )