Add CI for gpt-oss model on hopper (#8851)

2025-08-09 15:34:23 +08:00
parent de8b8b6e5c
commit 442534aa44
7 changed files with 187 additions and 2 deletions
--- a/test/srt/test_gpt_oss_common.py
+++ b/test/srt/test_gpt_oss_common.py
@@ -0,0 +1,99 @@
+from concurrent.futures import ThreadPoolExecutor
+from types import SimpleNamespace
+from typing import Dict, List, Literal, Optional
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+# Base URL shared by the whole module: the server is launched on it and every
+# evaluation request is sent to it.
+_base_url = DEFAULT_URL_FOR_TEST
+
+
+class BaseTestGptOss(CustomTestCase):
+    """Shared driver for gpt-oss accuracy tests.
+
+    Callers invoke :meth:`run_test` with a model variant, a quantization and
+    the minimum "gpqa" score expected for each reasoning effort. One server
+    process is started per call, all evaluations are run against it
+    concurrently, and the process tree is killed afterwards even when an
+    evaluation fails.
+    """
+
+    def run_test(
+        self,
+        model_variant: Literal["20b", "120b"],
+        quantization: Literal["mxfp4", "bf16"],
+        expected_score_of_reasoning_effort: Dict[str, float],
+        other_args: Optional[List[str]] = None,
+    ):
+        """Resolve the model name and run the evaluations against it.
+
+        Args:
+            model_variant: Model size, "20b" or "120b".
+            quantization: Weight format, "mxfp4" or "bf16".
+            expected_score_of_reasoning_effort: Maps each reasoning effort to
+                the minimum score its evaluation must reach.
+            other_args: Extra server launch arguments. The list is copied and
+                never modified.
+
+        Raises:
+            KeyError: If the (model_variant, quantization) pair is unknown.
+        """
+        # Copy before extending: an in-place `+=` on the caller's list would
+        # leak the 20b-only flags into every later use of that same list.
+        other_args = [] if other_args is None else list(other_args)
+
+        model = {
+            ("20b", "bf16"): "lmsys/gpt-oss-20b-bf16",
+            ("120b", "bf16"): "lmsys/gpt-oss-120b-bf16",
+            ("20b", "mxfp4"): "openai/gpt-oss-20b",
+            ("120b", "mxfp4"): "openai/gpt-oss-120b",
+        }[(model_variant, quantization)]
+
+        if model_variant == "20b":
+            other_args += ["--cuda-graph-max-bs", "600"]
+
+        self._run_test_raw(
+            model=model,
+            expected_score_of_reasoning_effort=expected_score_of_reasoning_effort,
+            other_args=other_args,
+        )
+
+    def _run_test_raw(
+        self,
+        model: str,
+        expected_score_of_reasoning_effort: Dict[str, float],
+        other_args: List[str],
+    ):
+        """Start a server for ``model`` and run one evaluation per effort.
+
+        Args:
+            model: Model name passed to the server launcher and the evaluator.
+            expected_score_of_reasoning_effort: Maps each reasoning effort to
+                the minimum score its evaluation must reach.
+            other_args: Extra server launch arguments.
+        """
+        process = popen_launch_server(
+            model,
+            _base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+        try:
+            # run multiple tests in parallel since we are mostly bound by the longest generate sequence
+            # instead of the number of questions
+            with ThreadPoolExecutor(max_workers=4) as executor:
+                # list() drains the iterator so that an exception raised in a
+                # worker (e.g. a failed assertion) is re-raised in this thread.
+                list(
+                    executor.map(
+                        lambda d: self._run_one_eval(**d),
+                        [
+                            dict(
+                                model=model,
+                                reasoning_effort=reasoning_effort,
+                                expected_score=expected_score,
+                            )
+                            for reasoning_effort, expected_score in expected_score_of_reasoning_effort.items()
+                        ],
+                    )
+                )
+        finally:
+            # Always tear the server down, whether the evaluations passed or not.
+            kill_process_tree(process.pid)
+
+    def _run_one_eval(self, model, reasoning_effort, expected_score):
+        """Run one "gpqa" evaluation and assert its score meets the threshold.
+
+        Args:
+            model: Model name forwarded to the evaluator.
+            reasoning_effort: Reasoning effort forwarded to the evaluator.
+            expected_score: Minimum acceptable value of ``metrics["score"]``.
+        """
+        args = SimpleNamespace(
+            base_url=_base_url,
+            model=model,
+            eval_name="gpqa",
+            num_examples=198,
+            # use enough threads to allow parallelism
+            num_threads=198,
+            # TODO: 4k tokens is still not enough; something like 64k is needed,
+            # but that is super slow. With too few tokens a lot of questions
+            # are left unanswered.
+            max_tokens=4096,
+            # simple-evals by default use 0.5 and is better than 0.0 temperature
+            # but here for reproducibility, we use 0.1
+            temperature=0.1,
+            reasoning_effort=reasoning_effort,
+        )
+
+        print(f"Evaluation start: {model=} {reasoning_effort=} {expected_score=}")
+        metrics = run_eval(args)
+        print(
+            f"Evaluation end: {model=} {reasoning_effort=} {expected_score=} {metrics=}"
+        )
+        # Evaluations run concurrently, so name the failing configuration in
+        # the assertion message instead of reporting two bare numbers.
+        self.assertGreaterEqual(
+            metrics["score"],
+            expected_score,
+            msg=f"Score below threshold: {model=} {reasoning_effort=}",
+        )