sglang v0.5.2 & support Qwen3-Next-80B-A3B-Instruct

This commit is contained in:
maxiao1
2025-09-13 17:00:20 +08:00
commit 118f1fc726
2037 changed files with 515371 additions and 0 deletions

View File

@@ -0,0 +1,52 @@
"""
used for debug using tensor comparison
dump {name: tensor} into "log_hf.jsonl" and "log_srt.jsonl"
use the same name for two tensors that supposed to be close
recommend name like: "layer 2 after mlp"
"""
import json
import sys
import torch
if len(sys.argv) > 1:
assert sys.argv[1] == "base"
hf_log = "base_log_hf.jsonl"
srt_log = "base_log_srt.jsonl"
else:
hf_log = "log_hf.jsonl"
srt_log = "log_srt.jsonl"
def load_data(filepath):
    """Load a JSONL dump of {name: nested-list} entries into {name: torch.Tensor}.

    Each line is one JSON object mapping tensor names to (nested) lists.
    If a name appears on several lines, the last occurrence wins.
    """
    tensors = {}
    with open(filepath, "r") as f:
        # Stream the file line by line instead of materializing every line
        # with readlines() — dumps of large tensors can be big.
        for line in f:
            data = json.loads(line)
            for k, v in data.items():
                tensors[k] = torch.tensor(v)
    return tensors
# Load both dumps up front; the comparison loop below iterates the SRT keys,
# so every name dumped by SRT must also exist in the HF dump.
hf_tensors = load_data(hf_log)
srt_tensors = load_data(srt_log)
def get_diff(t1, t2):
    """Return (l2_distance, max_abs_diff) between two tensors.

    t1 is reshaped to t2's shape so that dumps with different layouts
    (e.g. flattened vs. batched) can still be compared elementwise.
    """
    t1 = t1.reshape(t2.shape)
    # t1 already has t2's shape here; the original reshaped it a second
    # time redundantly inside the max() call.
    max_diff = torch.max(abs(t1 - t2))
    l2_dis = torch.dist(t1, t2, p=2)
    return l2_dis, max_diff
# Compare every tensor dumped by SRT against its HF counterpart by name.
# Iterate keys directly instead of `.items()` with a discarded value (PERF102).
for k in srt_tensors:
    l2_dis, max_diff = get_diff(hf_tensors[k], srt_tensors[k])
    print(f"{k} {l2_dis=} {max_diff=}")
    # Ad-hoc debugging hooks for specific layers of interest; harmless if the
    # names are absent from the dump.
    if k == "layer 1 attn":
        print(hf_tensors[k])
        print(srt_tensors[k])
    if k == "layer 0 prefill k":
        print(srt_tensors[k].shape)
        print(hf_tensors[k].shape)

View File

@@ -0,0 +1,80 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import multiprocessing as mp
import unittest
import torch
from transformers import AutoProcessor
from sglang.srt.utils import load_image
from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
from sglang.test.test_utils import get_similarities
# Paired caption/image inputs (llava-bench-in-the-wild sample 023) used to
# compare text and image embeddings below.
TEXTS = "two Subway Series sandwiches with meats, cheese, lettuce, tomatoes, and onions on a black background, accompanied by the Subway Series logo, highlighting a new sandwich series."
IMAGES = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/023.jpg"
# (model_path, tolerance on |cosine_similarity - 1|) cases to exercise.
MODELS = [
    ("openai/clip-vit-large-patch14-336", 1e-5),
]
TORCH_DTYPES = [torch.float16]
class TestClipModels(unittest.TestCase):
    """Compare CLIP text/image embeddings between the HF and SRT runners."""

    @classmethod
    def setUpClass(cls):
        # The runners launch worker processes; force the 'spawn' start method.
        mp.set_start_method("spawn", force=True)

    def assert_close_embeddings(self, model, prefill_tolerance, torch_dtype):
        # Reference embeddings from the HuggingFace implementation.
        with HFRunner(
            model,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as hf_runner:
            hf_text_embeds = hf_runner.forward(prompts=TEXTS)
            hf_image_embeds = hf_runner.forward(image_data=IMAGES)

        # Embeddings produced by the SGLang runtime.
        with SRTRunner(
            model,
            tp_size=1,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as srt_runner:
            text_embeds = srt_runner.forward(prompts=TEXTS)
            image_embeds = srt_runner.forward(prompts="padding", image_data=IMAGES)

        sim_text = get_similarities(
            text_embeds.embed_logits[0], hf_text_embeds.embed_logits[0]
        )
        sim_image = get_similarities(
            image_embeds.embed_logits[0], hf_image_embeds.embed_logits[0]
        )
        print("text similarity diff", abs(sim_text - 1))
        print("image similarity diff", abs(sim_image - 1))
        assert torch.all(
            abs(sim_text - 1) < prefill_tolerance
        ), "embeddings are not all close"
        assert torch.all(
            abs(sim_image - 1) < prefill_tolerance
        ), "embeddings are not all close"

    def test_accuracy(self):
        for model, prefill_tolerance in MODELS:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_embeddings(model, prefill_tolerance, torch_dtype)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,46 @@
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
class TestCompressedTensorsLlama3FP8(CustomTestCase):
    """GSM8K accuracy smoke test for an FP8 compressed-tensors Llama-3.1 model."""

    @classmethod
    def setUpClass(cls):
        cls.model = "RedHatAI/Meta-Llama-3.1-8B-FP8"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # One shared server for the whole class; torn down in tearDownClass.
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        # few_shot_gsm8k expects host and port split out of the base URL.
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreaterEqual(metrics["accuracy"], 0.45)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,91 @@
import multiprocessing as mp
import random
import unittest
import torch
from transformers import AutoConfig, AutoTokenizer
from sglang.test.runners import TEST_RERANK_QUERY_DOCS, HFRunner, SRTRunner
from sglang.test.test_utils import CustomTestCase, is_in_ci
# (model_path, tp_size, score tolerance) cross-encoder cases to exercise.
MODELS = [
    ("cross-encoder/ms-marco-MiniLM-L6-v2", 1, 1e-2),
    ("BAAI/bge-reranker-v2-m3", 1, 1e-2),
]
# Every model is tested against each attention backend.
ATTENTION_BACKEND = ["torch_native", "triton"]
TORCH_DTYPES = [torch.float32]
class TestCrossEncoderModels(CustomTestCase):
    """Check that SRT cross-encoder rerank scores match the HF reference."""

    @classmethod
    def setUpClass(cls):
        # The runners launch worker processes; force the 'spawn' start method.
        mp.set_start_method("spawn", force=True)

    def assert_close_prefill_logits(
        self,
        prompts,
        model_path,
        tp_size,
        torch_dtype,
        score_tolerance,
        attention_backend,
    ) -> None:
        # Reference scores from the HuggingFace implementation.
        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="cross_encoder",
        ) as hf_runner:
            hf_scores = hf_runner.forward(prompts).scores

        # Scores from the SGLang runtime with the backend under test.
        with SRTRunner(
            model_path,
            tp_size=tp_size,
            torch_dtype=torch_dtype,
            model_type="cross_encoder",
            attention_backend=attention_backend,
            chunked_prefill_size=-1,
            disable_radix_cache=True,
        ) as srt_runner:
            srt_scores = srt_runner.forward(prompts).scores

        for i, srt_score in enumerate(srt_scores):
            delta = abs(hf_scores[i] - srt_score)
            assert delta < score_tolerance, "cross encoder scores are not all close"

    def preprocess_prompts(self, prompt):
        """Expand one {query, documents} record into [query, document] pairs."""
        query = prompt["query"]
        return [[query, document] for document in prompt["documents"]]

    def test_prefill_logits(self):
        # CI runs a single randomly chosen model to keep runtime bounded.
        models_to_test = [random.choice(MODELS)] if is_in_ci() else MODELS
        for model, tp_size, prefill_tolerance in models_to_test:
            for attention_backend in ATTENTION_BACKEND:
                for query_docs in TEST_RERANK_QUERY_DOCS:
                    prompts = self.preprocess_prompts(query_docs)
                    for torch_dtype in TORCH_DTYPES:
                        self.assert_close_prefill_logits(
                            prompts,
                            model,
                            tp_size,
                            torch_dtype,
                            prefill_tolerance,
                            attention_backend,
                        )


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,34 @@
import unittest
from sglang.test.test_utils import CustomTestCase, is_in_ci, run_bench_one_batch
class TestDummyGrok1(CustomTestCase):
    """Bench one batch of a 2-layer, dummy-weight, fp8 Grok-1 configuration."""

    def test_dummy_grok_1(self):
        bench_args = [
            "--model",
            "/dummy-grok",
            "--tokenizer-path",
            "Xenova/grok-1-tokenizer",
            "--batch-size",
            "2",
            "--tp",
            "2",
            "--quantization",
            "fp8",
            "--load-format",
            "dummy",
            "--json-model-override-args",
            '{"num_hidden_layers": 2}',
        ]
        _, output_throughput, _ = run_bench_one_batch(None, bench_args)
        # Dummy weights: only require that the benchmark produced tokens.
        if is_in_ci():
            self.assertGreater(output_throughput, 0)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,111 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import multiprocessing as mp
import random
import unittest
import torch
from transformers import AutoConfig, AutoTokenizer
from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
from sglang.test.test_utils import CustomTestCase, get_similarities, is_in_ci
# (model_path, tp_size, prefill tolerance) embedding cases to exercise.
MODELS = [
    ("Alibaba-NLP/gte-Qwen2-1.5B-instruct", 1, 1e-5),
    ("intfloat/e5-mistral-7b-instruct", 1, 1e-5),
    ("marco/mcdse-2b-v1", 1, 1e-5),
    ("Qwen/Qwen3-Embedding-8B", 1, 1e-5),
    # Temporarily disable before this model is fixed
    # ("jason9693/Qwen2.5-1.5B-apeach", 1, 1e-5),
]
TORCH_DTYPES = [torch.float16]
class TestEmbeddingModels(CustomTestCase):
    """Compare SRT embedding outputs against the HuggingFace reference."""

    @classmethod
    def setUpClass(cls):
        # The runners launch worker processes; force the 'spawn' start method.
        mp.set_start_method("spawn", force=True)

    def _truncate_prompts(self, prompts, model_path):
        """Clip any prompt longer than the model's max position embeddings."""
        config = AutoConfig.from_pretrained(model_path)
        max_length = getattr(config, "max_position_embeddings", 2048)
        tokenizer = AutoTokenizer.from_pretrained(model_path)

        def clip(prompt):
            ids = tokenizer(prompt, return_tensors="pt", truncation=False).input_ids[0]
            if len(ids) <= max_length:
                return prompt
            # Keep one token of headroom below the limit.
            return tokenizer.decode(ids[: max_length - 1], skip_special_tokens=True)

        return [clip(p) for p in prompts]

    def assert_close_prefill_logits(
        self,
        prompts,
        model_path,
        tp_size,
        torch_dtype,
        prefill_tolerance,
    ) -> None:
        truncated_prompts = self._truncate_prompts(prompts, model_path)

        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as hf_runner:
            hf_outputs = hf_runner.forward(truncated_prompts)

        with SRTRunner(
            model_path,
            tp_size=tp_size,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as srt_runner:
            srt_outputs = srt_runner.forward(truncated_prompts)

        for i, prompt in enumerate(prompts):
            hf_embedding = torch.Tensor(hf_outputs.embed_logits[i])
            srt_embedding = torch.Tensor(srt_outputs.embed_logits[i])
            similarity = torch.tensor(get_similarities(hf_embedding, srt_embedding))
            print("similarity diff", abs(similarity - 1))
            # Only short prompts are gated on the tolerance, as upstream did.
            if len(prompt) <= 1000:
                assert torch.all(
                    abs(similarity - 1) < prefill_tolerance
                ), "embeddings are not all close"

    def test_prefill_logits(self):
        # CI runs a single randomly chosen model to keep runtime bounded.
        models_to_test = [random.choice(MODELS)] if is_in_ci() else MODELS
        for model, tp_size, prefill_tolerance in models_to_test:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_prefill_logits(
                    DEFAULT_PROMPTS, model, tp_size, torch_dtype, prefill_tolerance
                )


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,162 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# python -m unittest test_encoder_embedding_models.TestEncoderEmbeddingModels.test_prefill_logits
import multiprocessing as mp
import random
import time
import unittest
import torch
from transformers import AutoConfig, AutoTokenizer
from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
from sglang.test.test_utils import CustomTestCase, get_similarities, is_in_ci
# (model_path, tp_size, prefill tolerance) encoder-embedding cases.
MODELS = [("BAAI/bge-small-en", 1, 1e-5), ("BAAI/bge-m3", 1, 1e-5)]
ATTENTION_BACKEND = ["torch_native", "triton", "flashinfer"]
BATCH_SIZE = [1, 2]
TORCH_DTYPES = [torch.float32, torch.float16]
# Accumulates sgl_time / transformers_time ratios across runs; printed as a
# coarse performance summary at the end of test_prefill_logits.
sgl_to_st_ratio = []
class TestEncoderEmbeddingModels(CustomTestCase):
    """Compare encoder embedding outputs between HF and SRT, and record timing.

    Fix: the performance summary printed "bacth size: " — typo corrected to
    "batch size: ".
    """

    @classmethod
    def setUpClass(cls):
        # The runners launch worker processes; force the 'spawn' start method.
        mp.set_start_method("spawn", force=True)

    def _truncate_prompts(self, prompts, model_path):
        """Clip prompts to the model's max positions minus a 20-token margin."""
        config = AutoConfig.from_pretrained(model_path)
        max_length = getattr(config, "max_position_embeddings", 512) - 20
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        truncated_prompts = []
        for prompt in prompts:
            tokens = tokenizer(prompt, return_tensors="pt", truncation=False)
            if len(tokens.input_ids[0]) > max_length:
                truncated_text = tokenizer.decode(
                    tokens.input_ids[0][: max_length - 1], skip_special_tokens=True
                )
                truncated_prompts.append(truncated_text)
            else:
                truncated_prompts.append(prompt)
        return truncated_prompts

    def assert_close_prefill_logits(
        self,
        prompts,
        model_path,
        tp_size,
        torch_dtype,
        prefill_tolerance,
        attention_backend,
        batch_size,
    ) -> None:
        """Check HF/SRT embedding closeness and append one timing ratio."""
        truncated_prompts = self._truncate_prompts(prompts, model_path)
        truncated_prompts = truncated_prompts * batch_size
        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as hf_runner:
            # warm up, then time a second forward pass
            hf_outputs = hf_runner.forward(truncated_prompts)
            st_start_time = time.perf_counter()
            hf_outputs = hf_runner.forward(truncated_prompts)
            st_end_time = time.perf_counter()
        with SRTRunner(
            model_path,
            tp_size=tp_size,
            torch_dtype=torch_dtype,
            model_type="embedding",
            attention_backend=attention_backend,
            chunked_prefill_size=-1,
            disable_radix_cache=True,
        ) as srt_runner:
            # warm up, then time a second forward pass
            srt_outputs = srt_runner.forward(truncated_prompts)
            sgl_start_time = time.perf_counter()
            srt_outputs = srt_runner.forward(truncated_prompts)
            sgl_end_time = time.perf_counter()
        transformer_time = st_end_time - st_start_time
        sgl_time = sgl_end_time - sgl_start_time
        sgl_to_st_ratio.append(sgl_time / transformer_time)
        for i in range(len(truncated_prompts)):
            hf_logits = torch.Tensor(hf_outputs.embed_logits[i])
            srt_logits = torch.Tensor(srt_outputs.embed_logits[i])
            similarity = torch.tensor(get_similarities(hf_logits, srt_logits))
            # If something is wrong, uncomment this to observe similarity.
            # print("similarity diff", abs(similarity - 1))
            if len(truncated_prompts[i]) <= 1000:
                assert torch.all(
                    abs(similarity - 1) < prefill_tolerance
                ), "embeddings are not all close"

    def test_prefill_logits(self):
        models_to_test = MODELS
        if is_in_ci():
            models_to_test = [random.choice(MODELS)]
        for model, tp_size, prefill_tolerance in models_to_test:
            for attention_backend in ATTENTION_BACKEND:
                for batch_size in BATCH_SIZE:
                    for torch_dtype in TORCH_DTYPES:
                        # NOTE: FlashInfer currently has limitations with head_dim = 32 or
                        # other dimensions.
                        # The FlashInfer head_dim limitation itself is tracked here:
                        # https://github.com/flashinfer-ai/flashinfer/issues/1048
                        #
                        # Flashinfer does not support torch.float32 for dtype_q, so skip it
                        if attention_backend == "flashinfer":
                            if (
                                model == "BAAI/bge-small-en"
                                or torch_dtype == torch.float32
                            ):
                                continue
                        self.assert_close_prefill_logits(
                            DEFAULT_PROMPTS,
                            model,
                            tp_size,
                            torch_dtype,
                            prefill_tolerance,
                            attention_backend,
                            batch_size,
                        )
        # NOTE(review): sgl_to_st_ratio receives one entry per
        # (model, backend, batch_size, dtype) combination, but only the first
        # len(BATCH_SIZE) entries are printed, labeled BATCH_SIZE[i] * 5
        # (presumably 5 == number of default prompts) — confirm intent.
        for i in range(len(BATCH_SIZE)):
            print(
                "batch size: ",
                BATCH_SIZE[i] * 5,
                "sgl_time/st_time",
                round(sgl_to_st_ratio[i], 3),
            )


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,181 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Usage:
To test a specific model locally:
1. Add it to ALL_MODELS, for example, `ModelCase("Qwen/Qwen2-1.5B")`
2. Run `ONLY_RUN=Qwen/Qwen2-1.5B python3 -m unittest test_generation_models.TestGenerationModels`
"""
import dataclasses
import multiprocessing as mp
import os
import random
import unittest
from typing import List
import torch
from sglang.test.runners import (
DEFAULT_PROMPTS,
HFRunner,
SRTRunner,
check_close_model_outputs,
)
from sglang.test.test_utils import CustomTestCase, is_in_ci
@dataclasses.dataclass
class ModelCase:
    # Model under test plus per-model comparison settings.
    model_path: str
    tp_size: int = 1
    prefill_tolerance: float = 5e-2
    decode_tolerance: float = 6e-2  # Increased to fix numerical error in issue #8614.
    rouge_l_tolerance: float = 1
    # When True, prompts of >= 1000 chars are filtered out (short-context models).
    skip_long_prompt: bool = False
    trust_remote_code: bool = False
# Popular models that run on the CI
CI_MODELS = [
    ModelCase("meta-llama/Llama-3.1-8B-Instruct"),
    ModelCase("google/gemma-2-2b"),
]
# the complete set of models to test sglang's generation model
# (includes CI_MODELS plus a wider local-only matrix)
ALL_MODELS = [
    *CI_MODELS,
    ModelCase("Qwen/Qwen2-1.5B"),
    ModelCase("Qwen/Qwen2.5-14B-Instruct"),
    ModelCase("HuggingFaceTB/SmolLM-135M-Instruct", skip_long_prompt=True),
    ModelCase("allenai/OLMo-1B-0724-hf", decode_tolerance=8e-2, skip_long_prompt=True),
    ModelCase(
        "THUDM/glm-4-9b-chat", tp_size=2, trust_remote_code=True, skip_long_prompt=True
    ),
    ModelCase("openai-community/gpt2"),
    ModelCase("microsoft/phi-1_5", trust_remote_code=True),
    ModelCase("adept/persimmon-8b-chat"),
    ModelCase("inclusionAI/Ling-lite", trust_remote_code=True),
    ModelCase("microsoft/Phi-3-small-8k-instruct", trust_remote_code=True),
    ModelCase("allenai/OLMo-2-1124-7B-Instruct", skip_long_prompt=True),
    ModelCase("ibm-granite/granite-3.0-2b-instruct", skip_long_prompt=True),
    ModelCase(
        "microsoft/Phi-3.5-MoE-instruct",
        tp_size=2,
        trust_remote_code=True,
        skip_long_prompt=True,
    ),
    ModelCase("facebook/opt-125m", skip_long_prompt=True),
    ModelCase(
        "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
        tp_size=2,
        trust_remote_code=True,
        skip_long_prompt=True,
    ),
    ModelCase(
        "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
        tp_size=8,
        trust_remote_code=True,
        skip_long_prompt=True,
    ),
]
TORCH_DTYPES = [torch.float16]
class TestGenerationModels(CustomTestCase):
    """Compare generation models' logits and output strings between HF and SRT."""

    @classmethod
    def setUpClass(cls):
        # The runners launch worker processes; force the 'spawn' start method.
        mp.set_start_method("spawn", force=True)

    def assert_close_logits_and_output_strs(
        self,
        prompts: List[str],
        model_case: ModelCase,
        torch_dtype: torch.dtype,
    ) -> None:
        """Run HF and SRT on `prompts` and check outputs within the case tolerances.

        Fix: removed a dead unpacking of prefill/decode/rouge tolerances into
        locals that were never read — check_close_model_outputs reads them
        from model_case directly.
        """
        model_path = model_case.model_path
        max_new_tokens = 32

        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="generation",
            trust_remote_code=model_case.trust_remote_code,
        ) as hf_runner:
            hf_outputs = hf_runner.forward(prompts, max_new_tokens=max_new_tokens)

        with SRTRunner(
            model_path,
            tp_size=model_case.tp_size,
            torch_dtype=torch_dtype,
            model_type="generation",
            trust_remote_code=model_case.trust_remote_code,
        ) as srt_runner:
            srt_outputs = srt_runner.forward(prompts, max_new_tokens=max_new_tokens)

        check_close_model_outputs(
            hf_outputs=hf_outputs,
            srt_outputs=srt_outputs,
            prefill_tolerance=model_case.prefill_tolerance,
            decode_tolerance=model_case.decode_tolerance,
            rouge_l_tolerance=model_case.rouge_l_tolerance,
            debug_text=f"model_path={model_path} prompts={prompts}",
        )

    @unittest.skipIf(not is_in_ci(), "Local test should run all models")
    def test_ci_models(self):
        for model_case in CI_MODELS:
            for torch_dtype in TORCH_DTYPES:
                prompts = DEFAULT_PROMPTS
                # Skip long prompts for models that do not have a long context
                if model_case.skip_long_prompt:
                    prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
                # Assert the logits and output strs are close
                self.assert_close_logits_and_output_strs(
                    prompts, model_case, torch_dtype
                )

    @unittest.skipIf(is_in_ci(), "CI only runs selected models for simplicity")
    def test_all_models(self):
        for model_case in ALL_MODELS:
            for torch_dtype in TORCH_DTYPES:
                # ONLY_RUN narrows a local run down to one model path.
                if (
                    "ONLY_RUN" in os.environ
                    and os.environ["ONLY_RUN"] != model_case.model_path
                ):
                    continue
                # Skip long prompts for models that do not have a long context
                prompts = DEFAULT_PROMPTS
                if model_case.skip_long_prompt:
                    prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
                # Assert the logits and output strs are close
                self.assert_close_logits_and_output_strs(
                    prompts, model_case, torch_dtype
                )


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,85 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import multiprocessing as mp
import unittest
import torch
from sglang.test.runners import HFRunner, SRTRunner
from sglang.test.test_utils import CustomTestCase, get_similarities
# Paired caption/image inputs (llava-bench-in-the-wild sample 023).
TEXTS = "two Subway Series sandwiches with meats, cheese, lettuce, tomatoes, and onions on a black background, accompanied by the Subway Series logo, highlighting a new sandwich series."
IMAGES = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/023.jpg"
# (model_path, tolerance on |similarity - 1|) cases to exercise.
MODELS = [
    ("Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", 1e-3),
]
TORCH_DTYPES = [torch.float16]
class TestQmeQwenModels(CustomTestCase):
    """Compare GME-Qwen2-VL text and image embeddings between HF and SRT."""

    @classmethod
    def setUpClass(cls):
        # The runners launch worker processes; force the 'spawn' start method.
        mp.set_start_method("spawn", force=True)

    def assert_close_embeddings(self, model, prefill_tolerance, torch_dtype):
        # Chat-template-formatted prompts: one plain text, one with an image slot.
        prompts_no_image = f"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{TEXTS}<|im_end|>\n<|im_start|>assistant\n<|endoftext|>"
        prompts_with_image = "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>assistant\n<|endoftext|>"

        with HFRunner(
            model,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as hf_runner:
            hf_text_embeddings = hf_runner.forward(prompts=[prompts_no_image])
            hf_image_embeddings = hf_runner.forward(
                prompts=[prompts_with_image], image_data=[IMAGES]
            )

        with SRTRunner(
            model,
            tp_size=1,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as srt_runner:
            srt_text_embeddings = srt_runner.forward(prompts=prompts_no_image)
            srt_image_embeddings = srt_runner.forward(
                prompts=prompts_with_image, image_data=IMAGES
            )

        # Compare text first, then image — same order as the failure messages.
        for label, hf_embeds, srt_embeds in (
            ("texts", hf_text_embeddings, srt_text_embeddings),
            ("images", hf_image_embeddings, srt_image_embeddings),
        ):
            similarity = get_similarities(
                hf_embeds.embed_logits[0], srt_embeds.embed_logits[0]
            )
            print(f"{label} similarity diff", abs(similarity - 1))
            assert torch.all(
                abs(similarity - 1) < prefill_tolerance
            ), "embeddings are not all close"

    def test_accuracy(self):
        for model, prefill_tolerance in MODELS:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_embeddings(model, prefill_tolerance, torch_dtype)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,53 @@
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
class TestGrok(CustomTestCase):
    """Throughput smoke test for Grok-1 served with dummy weights."""

    @classmethod
    def setUpClass(cls):
        cls.model = "lmzheng/grok-1"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # Dummy weights and a 2-layer override keep the launch cheap.
        launch_args = [
            "--load-format",
            "dummy",
            "--json-model-override-args",
            '{"num_hidden_layers": 2}',
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_args,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=64,
            max_new_tokens=256,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        # It is dummy weights so we only assert the output throughput instead of accuracy.
        self.assertGreater(metrics["output_throughput"], 1000)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,74 @@
import random
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
# Each entry: served model path, GSM8K accuracy lower bound, tensor-parallel size.
MODELS = [
    SimpleNamespace(
        model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
        accuracy=0.9,
        tp_size=4,
    ),
]
class TestLlama4(CustomTestCase):
    """GSM8K accuracy checks for Llama-4 models, one server launch per model."""

    @classmethod
    def setUpClass(cls):
        cls.base_url = DEFAULT_URL_FOR_TEST

    def test_gsm8k(self):
        for model in MODELS:
            # Fix: bind `process` before the try block. If popen_launch_server
            # itself raised, the finally clause referenced an unbound local and
            # the resulting UnboundLocalError masked the real failure.
            process = None
            try:
                process = popen_launch_server(
                    model.model,
                    self.base_url,
                    timeout=3 * DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
                    other_args=[
                        "--chat-template",
                        "llama-4",
                        "--tp-size",
                        str(model.tp_size),
                        "--mem-fraction-static",
                        "0.8",
                        "--context-length",
                        "8192",
                    ],
                )
                args = SimpleNamespace(
                    num_shots=5,
                    data_path=None,
                    num_questions=200,
                    max_new_tokens=512,
                    parallel=128,
                    host="http://127.0.0.1",
                    port=int(self.base_url.split(":")[-1]),
                )
                metrics = run_eval(args)
                print(f"{metrics=}")
                self.assertGreaterEqual(metrics["accuracy"], model.accuracy)
            except Exception as e:
                print(f"Error testing {model.model}: {e}")
                self.fail(f"Test failed for {model.model}: {e}")
            finally:
                # Ensure process cleanup happens regardless of success/failure
                if process is not None and process.poll() is None:
                    print(f"Cleaning up process {process.pid}")
                    try:
                        kill_process_tree(process.pid)
                    except Exception as e:
                        print(f"Error killing process: {e}")


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,58 @@
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
class TestMiMoMTP(CustomTestCase):
    """GSM8K accuracy check for MiMo-7B-RL with EAGLE speculative decoding."""

    @classmethod
    def setUpClass(cls):
        cls.model = "XiaomiMiMo/MiMo-7B-RL"
        cls.base_url = DEFAULT_URL_FOR_TEST
        speculative_args = [
            "--trust-remote-code",
            "--speculative-algorithm",
            "EAGLE",
            "--speculative-num-steps",
            "1",
            "--speculative-eagle-topk",
            "1",
            "--speculative-num-draft-tokens",
            "2",
            "--mem-fraction-static",
            "0.5",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=speculative_args,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], 0.7)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,77 @@
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
class TestQwen2(CustomTestCase):
    """GSM8K accuracy check for Qwen2-7B-Instruct."""

    @classmethod
    def setUpClass(cls):
        cls.model = "Qwen/Qwen2-7B-Instruct"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # One shared server per class; killed in tearDownClass.
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], 0.78)
class TestQwen2FP8(CustomTestCase):
    """GSM8K accuracy check for the FP8-quantized Qwen2-7B-Instruct checkpoint."""

    @classmethod
    def setUpClass(cls):
        cls.model = "neuralmagic/Qwen2-7B-Instruct-FP8"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # One shared server per class; killed in tearDownClass.
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], 0.78)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,92 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import multiprocessing as mp
import unittest
import torch
from sglang.test.runners import HFRunner, SRTRunner
from sglang.test.test_utils import CustomTestCase
# (model_path, tp_size, score tolerance) reward-model cases to exercise.
MODELS = [
    ("LxzGordon/URM-LLaMa-3.1-8B", 1, 4e-2),
    ("Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", 1, 4e-2),
]
TORCH_DTYPES = [torch.float16]
# PROMPT = "Jane has 12 apples. She gives 4 apples to her friend Mark, then buys 1 more apple, and finally splits all her apples equally among herself and her 2 siblings. How many apples does each person get?"
# RESPONSE1 = "1. Jane starts with 12 apples and gives 4 to Mark. 12 - 4 = 8. Jane now has 8 apples.\n2. Jane buys 1 more apple. 8 + 1 = 9. Jane now has 9 apples.\n3. Jane splits the 9 apples equally among herself and her 2 siblings (3 people in total). 9 ÷ 3 = 3 apples each. Each person gets 3 apples."
# RESPONSE2 = "1. Jane starts with 12 apples and gives 4 to Mark. 12 - 4 = 8. Jane now has 8 apples.\n2. Jane buys 1 more apple. 8 + 1 = 9. Jane now has 9 apples.\n3. Jane splits the 9 apples equally among her 2 siblings (2 people in total). 9 ÷ 2 = 4.5 apples each. Each person gets 4 apples."
# Two candidate answers to the same question; RESPONSE2 is the factually
# correct one (a sigmoid's output lies in (0, 1)).
PROMPT = (
    "What is the range of the numeric output of a sigmoid node in a neural network?"
)
RESPONSE1 = "The output of a sigmoid node is bounded between -1 and 1."
RESPONSE2 = "The output of a sigmoid node is bounded between 0 and 1."
# One single-turn conversation per candidate answer, in chat-message format.
CONVS = [
    [{"role": "user", "content": PROMPT}, {"role": "assistant", "content": RESPONSE1}],
    [{"role": "user", "content": PROMPT}, {"role": "assistant", "content": RESPONSE2}],
]
class TestRewardModels(CustomTestCase):
    """Compare reward-model scores between the HF and SRT implementations."""

    @classmethod
    def setUpClass(cls):
        # The runners launch worker processes; force the 'spawn' start method.
        mp.set_start_method("spawn", force=True)

    def assert_close_reward_scores(
        self,
        convs,
        model_path,
        tp_size,
        torch_dtype,
        tolerance,
    ) -> None:
        """Score `convs` with both runners and assert elementwise closeness."""
        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="reward",
        ) as hf_runner:
            hf_outputs = hf_runner.forward(convs)

        with SRTRunner(
            model_path,
            tp_size=tp_size,  # Fix: tp_size was accepted but never forwarded.
            torch_dtype=torch_dtype,
            model_type="reward",
        ) as srt_runner:
            # SRT scores raw prompts, so apply the chat template ourselves.
            prompts = srt_runner.tokenizer.apply_chat_template(convs, tokenize=False)
            srt_outputs = srt_runner.forward(prompts)

        hf_scores = torch.tensor(hf_outputs.scores)
        srt_scores = torch.tensor(srt_outputs.scores)
        print(f"{hf_scores=}")
        print(f"{srt_scores=}")
        assert torch.all(
            abs(hf_scores - srt_scores) < tolerance
        ), "reward scores are not all close"

    def test_reward_scores(self):
        for model, tp_size, tolerance in MODELS:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_reward_scores(
                    CONVS, model, tp_size, torch_dtype, tolerance
                )


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,181 @@
import dataclasses
import multiprocessing as mp
import unittest
from types import SimpleNamespace
from typing import List
import torch
from sglang.srt.utils import kill_process_tree
from sglang.test.runners import DEFAULT_PROMPTS, SRTRunner, check_close_model_outputs
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
)
class TestTransformersFallbackEndpoint(CustomTestCase):
    """MMLU / GSM8K accuracy checks against a server launched with the
    HuggingFace ``transformers`` model implementation (``--model-impl``)."""

    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=["--model-impl", "transformers"],
        )
        # Minimum scores the fallback implementation must reach.
        cls.mmlu_lower_bound = 0.65
        cls.gsm8k_lower_bound = 0.65

    @classmethod
    def tearDownClass(cls):
        # Tear down the server and any worker processes it spawned.
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        from sglang.test.run_eval import run_eval

        eval_args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=64,
            num_threads=32,
        )
        metrics = run_eval(eval_args)
        self.assertGreaterEqual(metrics["score"], self.mmlu_lower_bound)

    def test_gsm8k(self):
        from sglang.test.few_shot_gsm8k import run_eval

        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], self.gsm8k_lower_bound)
class TestTransformersFallbackTorchAO(TestTransformersFallbackEndpoint):
    """Same checks as the parent class, with TorchAO int4 weight-only
    quantization enabled on top of the transformers fallback."""

    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        launch_flags = [
            "--model-impl",
            "transformers",
            "--torchao-config",
            "int4wo-128",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )
        # Same accuracy floors as the unquantized run.
        cls.mmlu_lower_bound = 0.65
        cls.gsm8k_lower_bound = 0.65
@dataclasses.dataclass
class ModelCase:
    """Per-model configuration for the transformers-fallback comparison tests."""

    # Checkpoint path or HF hub id under test.
    model_path: str
    # Tensor-parallel degree used for both runs.
    tp_size: int = 1
    # Tolerances forwarded to check_close_model_outputs — presumably max
    # allowed deviation for prefill logits, decode logits, and ROUGE-L
    # respectively; confirm against the helper's definition.
    prefill_tolerance: float = 5e-2
    decode_tolerance: float = 5e-2
    rouge_l_tolerance: float = 1
    # When True, prompts of 1000+ characters are filtered out (short-context models).
    skip_long_prompt: bool = False
    trust_remote_code: bool = False
    # TorchAO quantization config string (e.g. "int4wo-128"); None disables it.
    torchao_config: str | None = None
    torch_dtype: torch.dtype = torch.float16
# Popular models that run on the CI
CI_MODELS = [
    ModelCase(DEFAULT_MODEL_NAME_FOR_TEST),
]
# Extra configurations (e.g. tensor parallelism) exercised only outside CI.
ALL_OTHER_MODELS = [
    ModelCase(DEFAULT_MODEL_NAME_FOR_TEST, tp_size=2),
]
class TestTransformersFallbackEngine(CustomTestCase):
    """Compare the ``transformers`` fallback implementation against the native
    SGLang implementation of the same model, in-process via ``SRTRunner``."""

    @classmethod
    def setUpClass(cls):
        # Runner subprocesses require the "spawn" start method (CUDA-safe).
        mp.set_start_method("spawn", force=True)

    def assert_close_logits_and_output_strs(
        self,
        prompts: List[str],
        model_case: ModelCase,
    ) -> None:
        """Run `prompts` through both implementations and require logits and
        decoded strings to match within the case's tolerances."""
        model_path = model_case.model_path
        max_new_tokens = 32

        # Forced transformers implementation: the implementation under test.
        # (Previously this result was confusingly named `srt_outputs` and the
        # native result `srt_transformers_outputs` — names were swapped.)
        with SRTRunner(
            model_path,
            tp_size=model_case.tp_size,
            torch_dtype=model_case.torch_dtype,
            model_type="generation",
            model_impl="transformers",
            trust_remote_code=model_case.trust_remote_code,
            torchao_config=model_case.torchao_config,
        ) as srt_runner:
            transformers_outputs = srt_runner.forward(
                prompts, max_new_tokens=max_new_tokens
            )

        # Native SGLang implementation: the reference.
        with SRTRunner(
            model_path,
            tp_size=model_case.tp_size,
            torch_dtype=model_case.torch_dtype,
            model_type="generation",
            trust_remote_code=model_case.trust_remote_code,
            torchao_config=model_case.torchao_config,
        ) as srt_runner:
            native_outputs = srt_runner.forward(
                prompts, max_new_tokens=max_new_tokens
            )

        check_close_model_outputs(
            hf_outputs=native_outputs,
            srt_outputs=transformers_outputs,
            prefill_tolerance=model_case.prefill_tolerance,
            decode_tolerance=model_case.decode_tolerance,
            rouge_l_tolerance=model_case.rouge_l_tolerance,
            debug_text=f"model_path={model_path} prompts={prompts}",
        )

    def test_ci_models(self):
        for model_case in CI_MODELS:
            # Skip long prompts for models that do not have a long context.
            prompts = DEFAULT_PROMPTS
            if model_case.skip_long_prompt:
                prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
            # Assert the logits and output strs are close.
            self.assert_close_logits_and_output_strs(prompts, model_case)

    def test_others(self):
        if is_in_ci():
            return
        for model_case in ALL_OTHER_MODELS:
            # Reset per model case.  The previous version assigned `prompts`
            # once before the loop, so a single skip_long_prompt case would
            # also shrink the prompt set for every subsequent case.
            prompts = DEFAULT_PROMPTS
            if model_case.skip_long_prompt:
                prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
            self.assert_close_logits_and_output_strs(prompts, model_case)
# Allow running this test module directly with `python3 <file>`.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,213 @@
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
class TestUnslothPhi4(CustomTestCase):
    """GSM8K accuracy check for the full-precision unsloth/phi-4 checkpoint."""

    @classmethod
    def setUpClass(cls):
        # Launch an SGLang server serving the checkpoint under test.
        cls.model = "unsloth/phi-4"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[],
        )

    @classmethod
    def tearDownClass(cls):
        # Shut down the server and any workers it spawned.
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], 0.78)
class TestUnslothPhi4Bnb4bit(CustomTestCase):
    """GSM8K accuracy check for the 4-bit bitsandbytes phi-4 checkpoint."""

    @classmethod
    def setUpClass(cls):
        cls.model = "unsloth/phi-4-bnb-4bit"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # Quantized checkpoints need the bitsandbytes weight loader.
        launch_flags = ["--load-format", "bitsandbytes"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        # Slightly lower floor than full precision to allow quantization loss.
        self.assertGreater(metrics["accuracy"], 0.75)
class TestUnslothPhi4UnslothBnb4bit(CustomTestCase):
    """GSM8K accuracy check for the unsloth-dynamic 4-bit phi-4 checkpoint."""

    @classmethod
    def setUpClass(cls):
        cls.model = "unsloth/phi-4-unsloth-bnb-4bit"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # Quantized checkpoints need the bitsandbytes weight loader.
        launch_flags = ["--load-format", "bitsandbytes"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        # Slightly lower floor than full precision to allow quantization loss.
        self.assertGreater(metrics["accuracy"], 0.75)
class TestUnslothPhi4MiniInstruct(CustomTestCase):
    """GSM8K accuracy check for the full-precision Phi-4-mini-instruct model."""

    @classmethod
    def setUpClass(cls):
        cls.model = "unsloth/Phi-4-mini-instruct"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        # Lower floor than phi-4: the mini model is weaker on GSM8K.
        self.assertGreater(metrics["accuracy"], 0.65)
class TestUnslothPhi4MiniBnb4bit(CustomTestCase):
    """GSM8K accuracy check for the 4-bit bitsandbytes Phi-4-mini checkpoint."""

    @classmethod
    def setUpClass(cls):
        cls.model = "unsloth/Phi-4-mini-instruct-bnb-4bit"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # Quantized checkpoints need the bitsandbytes weight loader.
        launch_flags = ["--load-format", "bitsandbytes"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        # Mini model + quantization: lowest accuracy floor in this file.
        self.assertGreater(metrics["accuracy"], 0.6)
class TestUnslothPhi4MiniUnslothBnb4bit(CustomTestCase):
    """GSM8K accuracy check for the unsloth-dynamic 4-bit Phi-4-mini model."""

    @classmethod
    def setUpClass(cls):
        cls.model = "unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # Quantized checkpoints need the bitsandbytes weight loader.
        launch_flags = ["--load-format", "bitsandbytes"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        # Mini model + quantization: lowest accuracy floor in this file.
        self.assertGreater(metrics["accuracy"], 0.6)
# Allow running this test module directly with `python3 <file>`.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,315 @@
import argparse
import glob
import json
import os
import random
import subprocess
import sys
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
)
# VLM models for testing: (checkpoint, minimum acceptable MMMU val accuracy).
_VLM_MODEL_SPECS = [
    ("google/gemma-3-27b-it", 0.45),
    ("Qwen/Qwen2.5-VL-3B-Instruct", 0.4),
    ("openbmb/MiniCPM-V-2_6", 0.4),
]
MODELS = [
    SimpleNamespace(model=checkpoint, mmmu_accuracy=threshold)
    for checkpoint, threshold in _VLM_MODEL_SPECS
]
class TestVLMModels(CustomTestCase):
    """MMMU-benchmark regression tests for VLM models served by SGLang.

    Each test launches a server via ``popen_launch_server`` and drives the
    MMMU validation split through ``lmms_eval``'s OpenAI-compatible client.
    """

    # Populated from the command line under ``__main__``; stays None when the
    # module is collected by a runner such as pytest (see fallback below).
    parsed_args = None  # Class variable to store args

    @classmethod
    def setUpClass(cls):
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.time_out = DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
        # Set OpenAI API key and base URL environment variables. Needed for lmm-evals to work.
        os.environ["OPENAI_API_KEY"] = cls.api_key
        os.environ["OPENAI_API_BASE"] = f"{cls.base_url}/v1"

    def _detect_eviction_in_logs(self, log_output):
        """Detect if eviction events occurred in the log output.

        Returns:
            Tuple of (eviction_detected, eviction_count).
        """
        eviction_keywords = ["Cache eviction: evicted"]
        eviction_detected = False
        eviction_count = 0
        for line in log_output.split("\n"):
            if any(keyword in line for keyword in eviction_keywords):
                eviction_detected = True
                eviction_count += 1
                print(f"Eviction detected: {line.strip()}")
        return eviction_detected, eviction_count

    def run_mmmu_eval(
        self,
        model_version: str,
        output_path: str,
        *,
        env: dict | None = None,
    ):
        """
        Evaluate a VLM on the MMMU validation set with lmmseval.
        Only `model_version` (checkpoint) and `chat_template` vary;
        We are focusing only on the validation set due to resource constraints.

        Args:
            model_version: Checkpoint id passed to the lmms_eval client.
            output_path: Directory where lmms_eval writes its JSON results.
            env: Optional environment for the subprocess; None inherits ours.
        """
        # -------- fixed settings --------
        model = "openai_compatible"
        tp = 1
        tasks = "mmmu_val"
        batch_size = 2
        log_suffix = "openai_compatible"
        os.makedirs(output_path, exist_ok=True)
        # -------- compose --model_args --------
        model_args = f'model_version="{model_version}",' f"tp={tp}"
        # -------- build command list --------
        cmd = [
            "python3",
            "-m",
            "lmms_eval",
            "--model",
            model,
            "--model_args",
            model_args,
            "--tasks",
            tasks,
            "--batch_size",
            str(batch_size),
            "--log_samples",
            "--log_samples_suffix",
            log_suffix,
            "--output_path",
            str(output_path),
        ]
        # Forward the caller-supplied environment; previously the `env`
        # parameter was accepted but silently ignored (env=None inherits ours).
        subprocess.run(
            cmd,
            check=True,
            timeout=3600,
            env=env,
        )

    def _run_vlm_mmmu_test(
        self,
        model,
        output_path,
        test_name="",
        custom_env=None,
        log_level="info",
        capture_output=False,
    ):
        """
        Common method to run VLM MMMU benchmark test.
        Args:
            model: Namespace with `.model` (checkpoint) and `.mmmu_accuracy` (floor)
            output_path: Path for output logs
            test_name: Optional test name for logging
            custom_env: Optional custom environment variables
            log_level: Log level for server (default: "info")
            capture_output: Whether to capture server stdout/stderr
        Returns:
            Captured server output (empty string unless capture_output=True).
        """
        print(f"\nTesting model: {model.model}{test_name}")
        process = None
        mmmu_accuracy = 0  # Initialize to handle potential exceptions
        server_output = ""
        # Initialize before the try block so the finally clause can always
        # reference these names, even if an early statement raises.
        stdout_file = None
        stderr_file = None
        try:
            # Prepare environment variables
            process_env = os.environ.copy()
            if custom_env:
                process_env.update(custom_env)
            # Prepare stdout/stderr redirection if needed
            if capture_output:
                stdout_file = open("/tmp/server_stdout.log", "w")
                stderr_file = open("/tmp/server_stderr.log", "w")
            # Fall back to the argparse default (0.8) when the module runs
            # without __main__ (e.g. under pytest) and parsed_args is None;
            # previously this raised AttributeError.
            mem_fraction_static = (
                self.parsed_args.mem_fraction_static
                if self.parsed_args is not None
                else 0.8
            )
            # Launch server for testing
            process = popen_launch_server(
                model.model,
                base_url=self.base_url,
                timeout=self.time_out,
                api_key=self.api_key,
                other_args=[
                    "--trust-remote-code",
                    "--cuda-graph-max-bs",
                    "32",
                    "--enable-multimodal",
                    "--mem-fraction-static",
                    str(mem_fraction_static),
                    "--log-level",
                    log_level,
                ],
                env=process_env,
                return_stdout_stderr=(
                    (stdout_file, stderr_file) if capture_output else None
                ),
            )
            # Run evaluation
            self.run_mmmu_eval(model.model, output_path)
            # Get the result file
            result_file_path = glob.glob(f"{output_path}/*.json")[0]
            with open(result_file_path, "r") as f:
                result = json.load(f)
            print(f"Result{test_name}\n: {result}")
            # Process the result
            mmmu_accuracy = result["results"]["mmmu_val"]["mmmu_acc,none"]
            print(
                f"Model {model.model} achieved accuracy{test_name}: {mmmu_accuracy:.4f}"
            )
            # Capture server output if requested
            if capture_output and process:
                server_output = self._read_output_from_files()
            # Assert performance meets expected threshold
            self.assertGreaterEqual(
                mmmu_accuracy,
                model.mmmu_accuracy,
                f"Model {model.model} accuracy ({mmmu_accuracy:.4f}) below expected threshold ({model.mmmu_accuracy:.4f}){test_name}",
            )
            return server_output
        except Exception as e:
            print(f"Error testing {model.model}{test_name}: {e}")
            self.fail(f"Test failed for {model.model}{test_name}: {e}")
        finally:
            # Ensure process cleanup happens regardless of success/failure
            if process is not None and process.poll() is None:
                print(f"Cleaning up process {process.pid}")
                try:
                    kill_process_tree(process.pid)
                except Exception as e:
                    print(f"Error killing process: {e}")
            # clean up temporary files
            if capture_output:
                if stdout_file:
                    stdout_file.close()
                if stderr_file:
                    stderr_file.close()
                for filename in ["/tmp/server_stdout.log", "/tmp/server_stderr.log"]:
                    try:
                        if os.path.exists(filename):
                            os.remove(filename)
                    except Exception as e:
                        # Fixed: this message previously printed the literal
                        # "(unknown)" instead of the offending file name.
                        print(f"Error removing {filename}: {e}")

    def _read_output_from_files(self):
        """Read captured server stdout/stderr logs, tagging each line."""
        output_lines = []
        log_files = [
            ("/tmp/server_stdout.log", "[STDOUT]"),
            ("/tmp/server_stderr.log", "[STDERR]"),
        ]
        for filename, tag in log_files:
            try:
                if os.path.exists(filename):
                    with open(filename, "r") as f:
                        for line in f:
                            output_lines.append(f"{tag} {line.rstrip()}")
            except Exception as e:
                print(f"Error reading {tag.lower()} file: {e}")
        return "\n".join(output_lines)

    def test_vlm_mmmu_benchmark(self):
        """Test VLM models against MMMU benchmark."""
        models_to_test = MODELS
        if is_in_ci():
            # Keep CI cheap: exercise one randomly chosen model.
            models_to_test = [random.choice(MODELS)]
        for model in models_to_test:
            self._run_vlm_mmmu_test(model, "./logs")

    def test_vlm_mmmu_benchmark_with_small_cache(self):
        """Test VLM models against MMMU benchmark with a small embedding cache to force eviction."""
        models_to_test = MODELS
        if is_in_ci():
            models_to_test = [random.choice(MODELS)]
        for model in models_to_test:
            custom_env = {"SGLANG_VLM_CACHE_SIZE_MB": "5"}
            # Run the test with output capture
            server_output = self._run_vlm_mmmu_test(
                model,
                "./logs_small_cache",
                test_name=" with small embedding cache (evict test)",
                custom_env=custom_env,
                log_level="debug",  # Enable debug logging for eviction detection
                capture_output=True,  # Capture server output
            )
            # Print server output for debugging
            print("Server output:\n", server_output)
            # Analyze server output for eviction events
            eviction_detected, eviction_count = self._detect_eviction_in_logs(
                server_output
            )
            # Assert that eviction was detected (since we're using small cache)
            self.assertTrue(
                eviction_detected,
                f"Expected eviction events to be detected with small cache (5MB), but none found. "
                f"Cache size may be too large for the workload or eviction logic may not be working. "
                f"Total log content length: {len(server_output)} characters",
            )
            print(
                f"Eviction detection summary: {eviction_count} eviction events detected"
            )
            # Additional assertion: if eviction was detected, the test passed
            if eviction_detected:
                print("✅ Eviction logic successfully triggered and detected!")
if __name__ == "__main__":
    # Define and parse custom arguments here, before handing off to unittest.
    parser = argparse.ArgumentParser(description="Test VLM models")
    parser.add_argument(
        "--mem-fraction-static",
        type=float,
        help="Static memory fraction for the model",
        default=0.8,
    )
    # Parse our custom flags (unittest never sees them).
    args = parser.parse_args()
    # Store the parsed args object on the class
    TestVLMModels.parsed_args = args
    # Pass a stripped argv (program name only) so unittest does not try to
    # interpret the custom flags above.
    unittest.main(argv=[sys.argv[0]])