Add CI for gpt-oss model on hopper (#8851)
This commit is contained in:
@@ -63,6 +63,7 @@ suites = {
|
||||
TestFile("test_fp8_kernel.py", 8),
|
||||
TestFile("test_function_call_parser.py", 10),
|
||||
TestFile("test_fused_moe.py", 30),
|
||||
TestFile("test_gpt_oss_1gpu.py", 600),
|
||||
TestFile("test_hicache.py", 116),
|
||||
TestFile("test_hicache_mla.py", 127),
|
||||
TestFile("test_hicache_storage.py", 127),
|
||||
@@ -104,7 +105,7 @@ suites = {
|
||||
TestFile("test_utils_update_weights.py", 48),
|
||||
TestFile("test_vision_chunked_prefill.py", 175),
|
||||
TestFile("test_vlm_input_format.py", 300),
|
||||
TestFile("test_vision_openai_server_a.py", 584),
|
||||
TestFile("test_vision_openai_server_a.py", 989),
|
||||
TestFile("test_vision_openai_server_b.py", 620),
|
||||
TestFile("test_w8a8_quantization.py", 46),
|
||||
TestFile("test_reasoning_parser.py", 5),
|
||||
@@ -176,6 +177,7 @@ suites = {
|
||||
TestFile("test_update_weights_from_distributed.py", 103),
|
||||
],
|
||||
"per-commit-4-gpu": [
|
||||
TestFile("test_gpt_oss_4gpu.py", 600),
|
||||
TestFile("test_local_attn.py", 250),
|
||||
TestFile("test_pp_single_node.py", 372),
|
||||
TestFile("test_multi_instance_release_memory_occupation.py", 64),
|
||||
|
||||
31
test/srt/test_gpt_oss_1gpu.py
Normal file
31
test/srt/test_gpt_oss_1gpu.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import unittest
|
||||
|
||||
from test_gpt_oss_common import BaseTestGptOss
|
||||
|
||||
|
||||
class TestGptOss1Gpu(BaseTestGptOss):
    """Single-GPU CI coverage for the gpt-oss 20b model.

    Runs the shared GPQA harness (see BaseTestGptOss.run_test) once per
    quantization. Both quantizations are held to the same score floors,
    so the expected-score dict lives in one private helper instead of
    being duplicated per test method.
    """

    def _run_20b(self, quantization: str) -> None:
        # One place for the shared 20b score floors; public test names
        # below are unchanged so CI suite references still resolve.
        self.run_test(
            model_variant="20b",
            quantization=quantization,
            expected_score_of_reasoning_effort={
                "low": 0.38,
                "medium": 0.38,
                "high": 0.29,  # TODO investigate
            },
        )

    def test_mxfp4_20b(self):
        self._run_20b("mxfp4")

    def test_bf16_20b(self):
        self._run_20b("bf16")


if __name__ == "__main__":
    unittest.main()
|
||||
42
test/srt/test_gpt_oss_4gpu.py
Normal file
42
test/srt/test_gpt_oss_4gpu.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import unittest
|
||||
|
||||
from test_gpt_oss_common import BaseTestGptOss
|
||||
|
||||
|
||||
class TestGptOss4Gpu(BaseTestGptOss):
    """4-GPU (``--tp 4``) CI coverage for the gpt-oss 120b model.

    Runs the shared GPQA harness (see BaseTestGptOss.run_test) once per
    quantization. The score floors are identical for both quantizations,
    so they live in one private helper; only the extra server arguments
    differ between the two tests.
    """

    def _run_120b(self, quantization: str, extra_args: list) -> None:
        # Shared 120b configuration; public test names below are
        # unchanged so CI suite references still resolve.
        self.run_test(
            model_variant="120b",
            quantization=quantization,
            expected_score_of_reasoning_effort={
                "low": 0.61,
                # remove to speed up
                # "medium": 0.61,
                # "high": 0.61,
            },
            other_args=["--tp", "4", "--cuda-graph-max-bs", "200"] + extra_args,
        )

    def test_bf16_120b(self):
        self._run_120b("bf16", [])

    def test_mxfp4_120b(self):
        # The mxfp4 run additionally pins --mem-fraction-static 0.93;
        # presumably a memory-headroom tweak — confirm against server docs.
        self._run_120b("mxfp4", ["--mem-fraction-static", "0.93"])


if __name__ == "__main__":
    unittest.main()
|
||||
99
test/srt/test_gpt_oss_common.py
Normal file
99
test/srt/test_gpt_oss_common.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from types import SimpleNamespace
|
||||
from typing import Dict, List, Literal, Optional
|
||||
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.test.run_eval import run_eval
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
CustomTestCase,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
# Server endpoint shared by every eval in this harness.
_base_url = DEFAULT_URL_FOR_TEST


class BaseTestGptOss(CustomTestCase):
    """Shared harness for gpt-oss CI tests.

    Launches an sglang server for a given (model variant, quantization)
    checkpoint, runs the GPQA eval once per reasoning effort, and asserts
    each score meets its expected floor. Subclasses call :meth:`run_test`
    from their individual test methods.
    """

    def run_test(
        self,
        model_variant: Literal["20b", "120b"],
        quantization: Literal["mxfp4", "bf16"],
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: Optional[List[str]] = None,
    ):
        """Resolve the checkpoint for the variant/quantization pair and run evals.

        Args:
            model_variant: Which gpt-oss size to test.
            quantization: Checkpoint quantization; selects the HF repo below.
            expected_score_of_reasoning_effort: Maps reasoning effort
                (e.g. "low") to the minimum acceptable GPQA score.
            other_args: Extra CLI args for the server launch. The list is
                copied, so the caller's list is never mutated (the previous
                ``other_args += ...`` extended a caller-owned list in place).
        """
        other_args = list(other_args) if other_args is not None else []

        # Checkpoint lookup; raises KeyError on an unsupported combination,
        # which fails the test loudly rather than launching a wrong model.
        model = {
            ("20b", "bf16"): "lmsys/gpt-oss-20b-bf16",
            ("120b", "bf16"): "lmsys/gpt-oss-120b-bf16",
            ("20b", "mxfp4"): "openai/gpt-oss-20b",
            ("120b", "mxfp4"): "openai/gpt-oss-120b",
        }[(model_variant, quantization)]

        if model_variant == "20b":
            other_args += ["--cuda-graph-max-bs", "600"]

        self._run_test_raw(
            model=model,
            expected_score_of_reasoning_effort=expected_score_of_reasoning_effort,
            other_args=other_args,
        )

    def _run_test_raw(
        self,
        model: str,
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: List[str],
    ):
        """Launch the server, run one eval per reasoning effort, then kill it.

        The server process is always torn down via ``kill_process_tree``,
        even when an eval raises.
        """
        process = popen_launch_server(
            model,
            _base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

        try:
            # run multiple tests in parallel since we are mostly bound by the longest generate sequence
            # instead of the number of questions
            with ThreadPoolExecutor(max_workers=4) as executor:
                # list(...) drains the map so any exception from a worker
                # propagates here instead of being silently dropped.
                list(
                    executor.map(
                        lambda d: self._run_one_eval(**d),
                        [
                            dict(
                                model=model,
                                reasoning_effort=reasoning_effort,
                                expected_score=expected_score,
                            )
                            for reasoning_effort, expected_score in expected_score_of_reasoning_effort.items()
                        ],
                    )
                )
        finally:
            kill_process_tree(process.pid)

    def _run_one_eval(self, model: str, reasoning_effort: str, expected_score: float):
        """Run one GPQA eval at the given reasoning effort and assert its score."""
        args = SimpleNamespace(
            base_url=_base_url,
            model=model,
            eval_name="gpqa",
            num_examples=198,
            # use enough threads to allow parallelism
            num_threads=198,
            # TODO 4k is still not enough, we need e.g. 64k token, but that is super slow
            # otherwise a lot of questions are not answered
            max_tokens=4096,
            # simple-evals by default use 0.5 and is better than 0.0 temperature
            # but here for reproducibility, we use 0.1
            temperature=0.1,
            reasoning_effort=reasoning_effort,
        )

        print(f"Evaluation start: {model=} {reasoning_effort=} {expected_score=}")
        metrics = run_eval(args)
        print(
            f"Evaluation end: {model=} {reasoning_effort=} {expected_score=} {metrics=}"
        )
        self.assertGreaterEqual(metrics["score"], expected_score)
|
||||
Reference in New Issue
Block a user