[FEAT] Add transformers backend support (#5929)
This commit is contained in:
181
test/srt/models/test_transformers_models.py
Normal file
181
test/srt/models/test_transformers_models.py
Normal file
@@ -0,0 +1,181 @@
|
||||
import dataclasses
import multiprocessing as mp
import unittest
from types import SimpleNamespace
from typing import List, Optional

import torch

from sglang.srt.utils import kill_process_tree
from sglang.test.runners import DEFAULT_PROMPTS, SRTRunner, check_close_model_outputs
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    is_in_ci,
    popen_launch_server,
)
|
||||
|
||||
|
||||
class TestTransformersFallbackEndpoint(CustomTestCase):
    """End-to-end accuracy checks against a server launched with the
    transformers fallback backend (``--impl transformers``).

    One server process is shared by all tests in the class; it is started in
    ``setUpClass`` and killed in ``tearDownClass``.
    """

    @classmethod
    def setUpClass(cls):
        # Launch a single shared server with the transformers implementation.
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=["--impl", "transformers"],
        )
        # Minimum acceptable scores; subclasses may override alongside their
        # own server configuration.
        cls.mmlu_lower_bound = 0.65
        cls.gsm8k_lower_bound = 0.65

    @classmethod
    def tearDownClass(cls):
        # Kill the server and any children it spawned.
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        """Run a small MMLU eval against the server and check the score."""
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=64,
            num_threads=32,
        )
        from sglang.test.run_eval import run_eval

        metrics = run_eval(args)
        self.assertGreaterEqual(metrics["score"], self.mmlu_lower_bound)

    def test_gsm8k(self):
        """Run a few-shot GSM8K eval against the server and check accuracy."""
        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            # The eval client takes host/port separately, so extract the port
            # from the base URL.
            port=int(self.base_url.split(":")[-1]),
        )
        from sglang.test.few_shot_gsm8k import run_eval

        metrics = run_eval(args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], self.gsm8k_lower_bound)
|
||||
|
||||
|
||||
class TestTransformersFallbackTorchAO(TestTransformersFallbackEndpoint):
    """Same accuracy checks as the parent class, but with torchao int4
    weight-only quantization enabled on top of the transformers backend.

    Only ``setUpClass`` is overridden (to add ``--torchao-config``); the
    eval tests are inherited unchanged.
    """

    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--impl",
                "transformers",
                "--torchao-config",
                "int4wo-128",
            ],
        )
        # Same bounds as the unquantized run; int4wo-128 is expected to stay
        # above them.
        cls.mmlu_lower_bound = 0.65
        cls.gsm8k_lower_bound = 0.65
|
||||
|
||||
|
||||
@dataclasses.dataclass
class ModelCase:
    """Configuration for one model to compare across backends.

    Tolerances are consumed by ``check_close_model_outputs`` when comparing
    the transformers-backend outputs against the native implementation.
    """

    model_path: str  # HF hub id or local path of the model under test
    tp_size: int = 1  # tensor-parallel degree for both runners
    prefill_tolerance: float = 5e-2  # max allowed prefill logprob deviation
    decode_tolerance: float = 5e-2  # max allowed decode logprob deviation
    rouge_l_tolerance: float = 1.0  # max allowed ROUGE-L distance of outputs
    skip_long_prompt: bool = False  # drop prompts >= 1000 chars (short context)
    trust_remote_code: bool = False  # forwarded to both runners
    # Fix: was annotated `str` with a `None` default; `Optional[str]` is the
    # correct type for an optional torchao quantization config string.
    torchao_config: Optional[str] = None
    torch_dtype: torch.dtype = torch.float16  # dtype used by both runners
|
||||
|
||||
|
||||
# Popular models that run on the CI
CI_MODELS = [
    ModelCase(DEFAULT_MODEL_NAME_FOR_TEST),
]

# Additional cases (e.g. tensor parallelism) that are too expensive for CI;
# exercised only by test_others when not running in CI.
ALL_OTHER_MODELS = [
    ModelCase(DEFAULT_MODEL_NAME_FOR_TEST, tp_size=2),
]
|
||||
|
||||
|
||||
class TestTransformersFallbackEngine(CustomTestCase):
    """Offline (engine-level) comparison of the transformers backend against
    the native implementation: run both via SRTRunner on the same prompts and
    assert logits and output strings are close.
    """

    @classmethod
    def setUpClass(cls):
        # SRTRunner spawns subprocesses; "spawn" avoids fork-related CUDA issues.
        mp.set_start_method("spawn", force=True)

    def assert_close_logits_and_output_strs(
        self,
        prompts: List[str],
        model_case: ModelCase,
    ) -> None:
        """Run `prompts` through both backends and compare the outputs.

        Raises an assertion error (via check_close_model_outputs) when the
        two backends diverge beyond the tolerances in `model_case`.
        """
        model_path = model_case.model_path
        max_new_tokens = 32
        # First pass: force the transformers implementation.
        with SRTRunner(
            model_path,
            tp_size=model_case.tp_size,
            torch_dtype=model_case.torch_dtype,
            model_type="generation",
            impl="transformers",
            trust_remote_code=model_case.trust_remote_code,
            torchao_config=model_case.torchao_config,
        ) as srt_runner:
            srt_outputs = srt_runner.forward(prompts, max_new_tokens=max_new_tokens)

        # Second pass: same configuration, default (native) implementation.
        with SRTRunner(
            model_path,
            tp_size=model_case.tp_size,
            torch_dtype=model_case.torch_dtype,
            model_type="generation",
            trust_remote_code=model_case.trust_remote_code,
            torchao_config=model_case.torchao_config,
        ) as srt_runner:
            srt_transformers_outputs = srt_runner.forward(
                prompts, max_new_tokens=max_new_tokens
            )

        # The native outputs serve as the reference ("hf_outputs") here.
        check_close_model_outputs(
            hf_outputs=srt_transformers_outputs,
            srt_outputs=srt_outputs,
            prefill_tolerance=model_case.prefill_tolerance,
            decode_tolerance=model_case.decode_tolerance,
            rouge_l_tolerance=model_case.rouge_l_tolerance,
            debug_text=f"model_path={model_path} prompts={prompts}",
        )

    def test_ci_models(self):
        """Compare backends for the small set of models that run on CI."""
        for model_case in CI_MODELS:
            # Skip long prompts for models that do not have a long context
            prompts = DEFAULT_PROMPTS
            if model_case.skip_long_prompt:
                prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
            # Assert the logits and output strs are close
            self.assert_close_logits_and_output_strs(prompts, model_case)

    def test_others(self):
        """Compare backends for the larger model set (skipped on CI)."""
        if is_in_ci():
            return

        for model_case in ALL_OTHER_MODELS:
            # Bug fix: reset the prompt list for every model case. Previously
            # `prompts` was initialized once before the loop, so a case with
            # skip_long_prompt=True permanently filtered the list for all
            # subsequent cases.
            prompts = DEFAULT_PROMPTS
            # Skip long prompts for models that do not have a long context
            if model_case.skip_long_prompt:
                prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]

            # Assert the logits and output strs are close
            self.assert_close_logits_and_output_strs(prompts, model_case)
|
||||
|
||||
|
||||
# Allow running this file directly: `python test_transformers_models.py`.
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -26,6 +26,7 @@ suites = {
|
||||
TestFile("models/test_qwen_models.py", 82),
|
||||
TestFile("models/test_reward_models.py", 132),
|
||||
TestFile("models/test_vlm_models.py", 437),
|
||||
TestFile("models/test_transformers_models.py", 320),
|
||||
TestFile("test_abort.py", 51),
|
||||
TestFile("test_block_int8.py", 22),
|
||||
TestFile("test_create_kvindices.py", 2),
|
||||
|
||||
Reference in New Issue
Block a user