sglang/test/srt/test_gpt_oss_common.py

import os
from concurrent.futures import ThreadPoolExecutor
from types import SimpleNamespace
from typing import Dict, List, Literal, Optional

from sglang.srt.utils import is_hip, kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    is_in_ci,
    popen_launch_server,
    write_github_step_summary,
)

_base_url = DEFAULT_URL_FOR_TEST
_is_hip = is_hip()


class BaseTestGptOss(CustomTestCase):
    def run_test(
        self,
        model_variant: Literal["20b", "120b"],
        quantization: Literal["mxfp4", "bf16"],
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: Optional[List[str]] = None,
    ):
        if other_args is None:
            other_args = []

        # Resolve the checkpoint from the (variant, quantization) pair.
        model = {
            ("20b", "bf16"): "lmsys/gpt-oss-20b-bf16",
            ("120b", "bf16"): "lmsys/gpt-oss-120b-bf16",
            ("20b", "mxfp4"): "openai/gpt-oss-20b",
            ("120b", "mxfp4"): "openai/gpt-oss-120b",
        }[(model_variant, quantization)]

        if model_variant == "20b":
            other_args += ["--cuda-graph-max-bs", "600"]

        if _is_hip:
            os.environ["SGLANG_USE_AITER"] = "0"

        self._run_test_raw(
            model=model,
            expected_score_of_reasoning_effort=expected_score_of_reasoning_effort,
            other_args=other_args,
        )

    def _run_test_raw(
        self,
        model: str,
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: List[str],
    ):
        process = popen_launch_server(
            model,
            _base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )
        try:
            # Run the evals in parallel: wall time is dominated by the longest
            # generated sequence, not by the number of questions.
            with ThreadPoolExecutor(max_workers=4) as executor:
                list(
                    executor.map(
                        lambda d: self._run_one_eval(**d),
                        [
                            dict(
                                model=model,
                                reasoning_effort=reasoning_effort,
                                expected_score=expected_score,
                            )
                            for reasoning_effort, expected_score in expected_score_of_reasoning_effort.items()
                        ],
                    )
                )
        finally:
            kill_process_tree(process.pid)

    def _run_one_eval(self, model, reasoning_effort, expected_score):
        args = SimpleNamespace(
            base_url=_base_url,
            model=model,
            eval_name="gpqa",
            num_examples=198,
            # Use enough threads so all examples run in parallel.
            num_threads=198,
            # TODO: 4k tokens is still not enough; we would need e.g. 64k, but that
            # is very slow. With a budget this small, many questions go unanswered.
            max_tokens=4096,
            # simple-evals defaults to temperature 0.5, which scores better than 0.0,
            # but we use 0.1 here for reproducibility.
            temperature=0.1,
            reasoning_effort=reasoning_effort,
        )

        setup = f"model={model} reasoning_effort={reasoning_effort} expected_score={expected_score}"
        print(f"Evaluation start: {setup}")
        metrics = run_eval(args)
        print(f"Evaluation end: {setup} {metrics=}")
        self.assertGreaterEqual(metrics["score"], expected_score)

        if is_in_ci():
            write_github_step_summary(
                f"### test_gpt_oss_common\n"
                f"Setup: {setup}\n"
                f"Score: {metrics['score']:.2f}\n"
            )