Add CI for gpt-oss model on hopper (#8851)
This commit is contained in:
@@ -63,6 +63,7 @@ suites = {
|
||||
TestFile("test_fp8_kernel.py", 8),
|
||||
TestFile("test_function_call_parser.py", 10),
|
||||
TestFile("test_fused_moe.py", 30),
|
||||
TestFile("test_gpt_oss_1gpu.py", 600),
|
||||
TestFile("test_hicache.py", 116),
|
||||
TestFile("test_hicache_mla.py", 127),
|
||||
TestFile("test_hicache_storage.py", 127),
|
||||
@@ -104,7 +105,7 @@ suites = {
|
||||
TestFile("test_utils_update_weights.py", 48),
|
||||
TestFile("test_vision_chunked_prefill.py", 175),
|
||||
TestFile("test_vlm_input_format.py", 300),
|
||||
TestFile("test_vision_openai_server_a.py", 584),
|
||||
TestFile("test_vision_openai_server_a.py", 989),
|
||||
TestFile("test_vision_openai_server_b.py", 620),
|
||||
TestFile("test_w8a8_quantization.py", 46),
|
||||
TestFile("test_reasoning_parser.py", 5),
|
||||
@@ -176,6 +177,7 @@ suites = {
|
||||
TestFile("test_update_weights_from_distributed.py", 103),
|
||||
],
|
||||
"per-commit-4-gpu": [
|
||||
TestFile("test_gpt_oss_4gpu.py", 600),
|
||||
TestFile("test_local_attn.py", 250),
|
||||
TestFile("test_pp_single_node.py", 372),
|
||||
TestFile("test_multi_instance_release_memory_occupation.py", 64),
|
||||
|
||||
31
test/srt/test_gpt_oss_1gpu.py
Normal file
31
test/srt/test_gpt_oss_1gpu.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import unittest
|
||||
|
||||
from test_gpt_oss_common import BaseTestGptOss
|
||||
|
||||
|
||||
class TestGptOss1Gpu(BaseTestGptOss):
    """Single-GPU CI coverage for the gpt-oss 20b model.

    Runs the shared GPQA harness (see BaseTestGptOss.run_test) once per
    quantization. Both quantizations are held to the same score floors,
    so the expected-score dict lives in one private helper instead of
    being duplicated per test method.
    """

    def _run_20b(self, quantization: str) -> None:
        # One place for the shared 20b score floors; public test names
        # below are unchanged so CI suite references still resolve.
        self.run_test(
            model_variant="20b",
            quantization=quantization,
            expected_score_of_reasoning_effort={
                "low": 0.38,
                "medium": 0.38,
                "high": 0.29,  # TODO investigate
            },
        )

    def test_mxfp4_20b(self):
        self._run_20b("mxfp4")

    def test_bf16_20b(self):
        self._run_20b("bf16")


if __name__ == "__main__":
    unittest.main()
|
||||
42
test/srt/test_gpt_oss_4gpu.py
Normal file
42
test/srt/test_gpt_oss_4gpu.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import unittest
|
||||
|
||||
from test_gpt_oss_common import BaseTestGptOss
|
||||
|
||||
|
||||
class TestGptOss4Gpu(BaseTestGptOss):
    """4-GPU (``--tp 4``) CI coverage for the gpt-oss 120b model.

    Runs the shared GPQA harness (see BaseTestGptOss.run_test) once per
    quantization. The score floors are identical for both quantizations,
    so they live in one private helper; only the extra server arguments
    differ between the two tests.
    """

    def _run_120b(self, quantization: str, extra_args: list) -> None:
        # Shared 120b configuration; public test names below are
        # unchanged so CI suite references still resolve.
        self.run_test(
            model_variant="120b",
            quantization=quantization,
            expected_score_of_reasoning_effort={
                "low": 0.61,
                # remove to speed up
                # "medium": 0.61,
                # "high": 0.61,
            },
            other_args=["--tp", "4", "--cuda-graph-max-bs", "200"] + extra_args,
        )

    def test_bf16_120b(self):
        self._run_120b("bf16", [])

    def test_mxfp4_120b(self):
        # The mxfp4 run additionally pins --mem-fraction-static 0.93;
        # presumably a memory-headroom tweak — confirm against server docs.
        self._run_120b("mxfp4", ["--mem-fraction-static", "0.93"])


if __name__ == "__main__":
    unittest.main()
|
||||
99
test/srt/test_gpt_oss_common.py
Normal file
99
test/srt/test_gpt_oss_common.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from types import SimpleNamespace
|
||||
from typing import Dict, List, Literal, Optional
|
||||
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.test.run_eval import run_eval
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
CustomTestCase,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
# Server endpoint shared by every eval in this harness.
_base_url = DEFAULT_URL_FOR_TEST


class BaseTestGptOss(CustomTestCase):
    """Shared harness for gpt-oss CI tests.

    Launches an sglang server for a given (model variant, quantization)
    checkpoint, runs the GPQA eval once per reasoning effort, and asserts
    each score meets its expected floor. Subclasses call :meth:`run_test`
    from their individual test methods.
    """

    def run_test(
        self,
        model_variant: Literal["20b", "120b"],
        quantization: Literal["mxfp4", "bf16"],
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: Optional[List[str]] = None,
    ):
        """Resolve the checkpoint for the variant/quantization pair and run evals.

        Args:
            model_variant: Which gpt-oss size to test.
            quantization: Checkpoint quantization; selects the HF repo below.
            expected_score_of_reasoning_effort: Maps reasoning effort
                (e.g. "low") to the minimum acceptable GPQA score.
            other_args: Extra CLI args for the server launch. The list is
                copied, so the caller's list is never mutated (the previous
                ``other_args += ...`` extended a caller-owned list in place).
        """
        other_args = list(other_args) if other_args is not None else []

        # Checkpoint lookup; raises KeyError on an unsupported combination,
        # which fails the test loudly rather than launching a wrong model.
        model = {
            ("20b", "bf16"): "lmsys/gpt-oss-20b-bf16",
            ("120b", "bf16"): "lmsys/gpt-oss-120b-bf16",
            ("20b", "mxfp4"): "openai/gpt-oss-20b",
            ("120b", "mxfp4"): "openai/gpt-oss-120b",
        }[(model_variant, quantization)]

        if model_variant == "20b":
            other_args += ["--cuda-graph-max-bs", "600"]

        self._run_test_raw(
            model=model,
            expected_score_of_reasoning_effort=expected_score_of_reasoning_effort,
            other_args=other_args,
        )

    def _run_test_raw(
        self,
        model: str,
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: List[str],
    ):
        """Launch the server, run one eval per reasoning effort, then kill it.

        The server process is always torn down via ``kill_process_tree``,
        even when an eval raises.
        """
        process = popen_launch_server(
            model,
            _base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

        try:
            # run multiple tests in parallel since we are mostly bound by the longest generate sequence
            # instead of the number of questions
            with ThreadPoolExecutor(max_workers=4) as executor:
                # list(...) drains the map so any exception from a worker
                # propagates here instead of being silently dropped.
                list(
                    executor.map(
                        lambda d: self._run_one_eval(**d),
                        [
                            dict(
                                model=model,
                                reasoning_effort=reasoning_effort,
                                expected_score=expected_score,
                            )
                            for reasoning_effort, expected_score in expected_score_of_reasoning_effort.items()
                        ],
                    )
                )
        finally:
            kill_process_tree(process.pid)

    def _run_one_eval(self, model: str, reasoning_effort: str, expected_score: float):
        """Run one GPQA eval at the given reasoning effort and assert its score."""
        args = SimpleNamespace(
            base_url=_base_url,
            model=model,
            eval_name="gpqa",
            num_examples=198,
            # use enough threads to allow parallelism
            num_threads=198,
            # TODO 4k is still not enough, we need e.g. 64k token, but that is super slow
            # otherwise a lot of questions are not answered
            max_tokens=4096,
            # simple-evals by default use 0.5 and is better than 0.0 temperature
            # but here for reproducibility, we use 0.1
            temperature=0.1,
            reasoning_effort=reasoning_effort,
        )

        print(f"Evaluation start: {model=} {reasoning_effort=} {expected_score=}")
        metrics = run_eval(args)
        print(
            f"Evaluation end: {model=} {reasoning_effort=} {expected_score=} {metrics=}"
        )
        self.assertGreaterEqual(metrics["score"], expected_score)
|
||||
Reference in New Issue
Block a user