sglang v0.5.2 & support Qwen3-Next-80B-A3B-Instruct

This commit is contained in:
maxiao1
2025-09-13 17:00:20 +08:00
commit 118f1fc726
2037 changed files with 515371 additions and 0 deletions

View File

@@ -0,0 +1,52 @@
"""
used for debug using tensor comparison
dump {name: tensor} into "log_hf.jsonl" and "log_srt.jsonl"
use the same name for two tensors that supposed to be close
recommend name like: "layer 2 after mlp"
"""
import json
import sys
import torch
if len(sys.argv) > 1:
assert sys.argv[1] == "base"
hf_log = "base_log_hf.jsonl"
srt_log = "base_log_srt.jsonl"
else:
hf_log = "log_hf.jsonl"
srt_log = "log_srt.jsonl"
def load_data(filepath):
    """Load a JSONL dump of {name: nested-list} entries into {name: torch.Tensor}.

    Each line is one JSON object mapping tensor names to (nested) lists.
    If a name appears on several lines, the last occurrence wins.
    """
    tensors = {}
    with open(filepath, "r") as f:
        # Stream the file line by line instead of materializing every line
        # with readlines() — dumps of large tensors can be big.
        for line in f:
            data = json.loads(line)
            for k, v in data.items():
                tensors[k] = torch.tensor(v)
    return tensors
# Load both dumps up front; the comparison loop below iterates the SRT keys,
# so every name dumped by SRT must also exist in the HF dump.
hf_tensors = load_data(hf_log)
srt_tensors = load_data(srt_log)
def get_diff(t1, t2):
    """Return (l2_distance, max_abs_diff) between two tensors.

    t1 is reshaped to t2's shape so that dumps with different layouts
    (e.g. flattened vs. batched) can still be compared elementwise.
    """
    t1 = t1.reshape(t2.shape)
    # t1 already has t2's shape here; the original reshaped it a second
    # time redundantly inside the max() call.
    max_diff = torch.max(abs(t1 - t2))
    l2_dis = torch.dist(t1, t2, p=2)
    return l2_dis, max_diff
# Compare every tensor dumped by SRT against its HF counterpart by name.
# Iterate keys directly instead of `.items()` with a discarded value (PERF102).
for k in srt_tensors:
    l2_dis, max_diff = get_diff(hf_tensors[k], srt_tensors[k])
    print(f"{k} {l2_dis=} {max_diff=}")
    # Ad-hoc debugging hooks for specific layers of interest; harmless if the
    # names are absent from the dump.
    if k == "layer 1 attn":
        print(hf_tensors[k])
        print(srt_tensors[k])
    if k == "layer 0 prefill k":
        print(srt_tensors[k].shape)
        print(hf_tensors[k].shape)

View File

@@ -0,0 +1,80 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import multiprocessing as mp
import unittest
import torch
from transformers import AutoProcessor
from sglang.srt.utils import load_image
from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
from sglang.test.test_utils import get_similarities
# Paired caption/image inputs (llava-bench-in-the-wild sample 023) used to
# compare text and image embeddings below.
TEXTS = "two Subway Series sandwiches with meats, cheese, lettuce, tomatoes, and onions on a black background, accompanied by the Subway Series logo, highlighting a new sandwich series."
IMAGES = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/023.jpg"
# (model_path, tolerance on |cosine_similarity - 1|) cases to exercise.
MODELS = [
    ("openai/clip-vit-large-patch14-336", 1e-5),
]
TORCH_DTYPES = [torch.float16]
class TestClipModels(unittest.TestCase):
    """Compare CLIP text/image embeddings between the HF and SRT runners."""

    @classmethod
    def setUpClass(cls):
        # The runners launch worker processes; force the 'spawn' start method.
        mp.set_start_method("spawn", force=True)

    def assert_close_embeddings(self, model, prefill_tolerance, torch_dtype):
        # Reference embeddings from the HuggingFace implementation.
        with HFRunner(
            model,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as hf_runner:
            hf_text_embeds = hf_runner.forward(prompts=TEXTS)
            hf_image_embeds = hf_runner.forward(image_data=IMAGES)

        # Embeddings produced by the SGLang runtime.
        with SRTRunner(
            model,
            tp_size=1,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as srt_runner:
            text_embeds = srt_runner.forward(prompts=TEXTS)
            image_embeds = srt_runner.forward(prompts="padding", image_data=IMAGES)

        sim_text = get_similarities(
            text_embeds.embed_logits[0], hf_text_embeds.embed_logits[0]
        )
        sim_image = get_similarities(
            image_embeds.embed_logits[0], hf_image_embeds.embed_logits[0]
        )
        print("text similarity diff", abs(sim_text - 1))
        print("image similarity diff", abs(sim_image - 1))
        assert torch.all(
            abs(sim_text - 1) < prefill_tolerance
        ), "embeddings are not all close"
        assert torch.all(
            abs(sim_image - 1) < prefill_tolerance
        ), "embeddings are not all close"

    def test_accuracy(self):
        for model, prefill_tolerance in MODELS:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_embeddings(model, prefill_tolerance, torch_dtype)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,46 @@
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
class TestCompressedTensorsLlama3FP8(CustomTestCase):
    """GSM8K accuracy smoke test for an FP8 compressed-tensors Llama-3.1 model."""

    @classmethod
    def setUpClass(cls):
        cls.model = "RedHatAI/Meta-Llama-3.1-8B-FP8"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # One shared server for the whole class; torn down in tearDownClass.
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        # few_shot_gsm8k expects host and port split out of the base URL.
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreaterEqual(metrics["accuracy"], 0.45)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,91 @@
import multiprocessing as mp
import random
import unittest
import torch
from transformers import AutoConfig, AutoTokenizer
from sglang.test.runners import TEST_RERANK_QUERY_DOCS, HFRunner, SRTRunner
from sglang.test.test_utils import CustomTestCase, is_in_ci
# (model_path, tp_size, score tolerance) cross-encoder cases to exercise.
MODELS = [
    ("cross-encoder/ms-marco-MiniLM-L6-v2", 1, 1e-2),
    ("BAAI/bge-reranker-v2-m3", 1, 1e-2),
]
# Every model is tested against each attention backend.
ATTENTION_BACKEND = ["torch_native", "triton"]
TORCH_DTYPES = [torch.float32]
class TestCrossEncoderModels(CustomTestCase):
    """Check that SRT cross-encoder rerank scores match the HF reference."""

    @classmethod
    def setUpClass(cls):
        # The runners launch worker processes; force the 'spawn' start method.
        mp.set_start_method("spawn", force=True)

    def assert_close_prefill_logits(
        self,
        prompts,
        model_path,
        tp_size,
        torch_dtype,
        score_tolerance,
        attention_backend,
    ) -> None:
        # Reference scores from the HuggingFace implementation.
        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="cross_encoder",
        ) as hf_runner:
            hf_scores = hf_runner.forward(prompts).scores

        # Scores from the SGLang runtime with the backend under test.
        with SRTRunner(
            model_path,
            tp_size=tp_size,
            torch_dtype=torch_dtype,
            model_type="cross_encoder",
            attention_backend=attention_backend,
            chunked_prefill_size=-1,
            disable_radix_cache=True,
        ) as srt_runner:
            srt_scores = srt_runner.forward(prompts).scores

        for i, srt_score in enumerate(srt_scores):
            delta = abs(hf_scores[i] - srt_score)
            assert delta < score_tolerance, "cross encoder scores are not all close"

    def preprocess_prompts(self, prompt):
        """Expand one {query, documents} record into [query, document] pairs."""
        query = prompt["query"]
        return [[query, document] for document in prompt["documents"]]

    def test_prefill_logits(self):
        # CI runs a single randomly chosen model to keep runtime bounded.
        models_to_test = [random.choice(MODELS)] if is_in_ci() else MODELS
        for model, tp_size, prefill_tolerance in models_to_test:
            for attention_backend in ATTENTION_BACKEND:
                for query_docs in TEST_RERANK_QUERY_DOCS:
                    prompts = self.preprocess_prompts(query_docs)
                    for torch_dtype in TORCH_DTYPES:
                        self.assert_close_prefill_logits(
                            prompts,
                            model,
                            tp_size,
                            torch_dtype,
                            prefill_tolerance,
                            attention_backend,
                        )


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,34 @@
import unittest
from sglang.test.test_utils import CustomTestCase, is_in_ci, run_bench_one_batch
class TestDummyGrok1(CustomTestCase):
    """Bench one batch of a 2-layer, dummy-weight, fp8 Grok-1 configuration."""

    def test_dummy_grok_1(self):
        bench_args = [
            "--model",
            "/dummy-grok",
            "--tokenizer-path",
            "Xenova/grok-1-tokenizer",
            "--batch-size",
            "2",
            "--tp",
            "2",
            "--quantization",
            "fp8",
            "--load-format",
            "dummy",
            "--json-model-override-args",
            '{"num_hidden_layers": 2}',
        ]
        _, output_throughput, _ = run_bench_one_batch(None, bench_args)
        # Dummy weights: only require that the benchmark produced tokens.
        if is_in_ci():
            self.assertGreater(output_throughput, 0)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,111 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import multiprocessing as mp
import random
import unittest
import torch
from transformers import AutoConfig, AutoTokenizer
from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
from sglang.test.test_utils import CustomTestCase, get_similarities, is_in_ci
# (model_path, tp_size, prefill tolerance) embedding cases to exercise.
MODELS = [
    ("Alibaba-NLP/gte-Qwen2-1.5B-instruct", 1, 1e-5),
    ("intfloat/e5-mistral-7b-instruct", 1, 1e-5),
    ("marco/mcdse-2b-v1", 1, 1e-5),
    ("Qwen/Qwen3-Embedding-8B", 1, 1e-5),
    # Temporarily disable before this model is fixed
    # ("jason9693/Qwen2.5-1.5B-apeach", 1, 1e-5),
]
TORCH_DTYPES = [torch.float16]
class TestEmbeddingModels(CustomTestCase):
    """Compare SRT embedding outputs against the HuggingFace reference."""

    @classmethod
    def setUpClass(cls):
        # The runners launch worker processes; force the 'spawn' start method.
        mp.set_start_method("spawn", force=True)

    def _truncate_prompts(self, prompts, model_path):
        """Clip any prompt longer than the model's max position embeddings."""
        config = AutoConfig.from_pretrained(model_path)
        max_length = getattr(config, "max_position_embeddings", 2048)
        tokenizer = AutoTokenizer.from_pretrained(model_path)

        def clip(prompt):
            ids = tokenizer(prompt, return_tensors="pt", truncation=False).input_ids[0]
            if len(ids) <= max_length:
                return prompt
            # Keep one token of headroom below the limit.
            return tokenizer.decode(ids[: max_length - 1], skip_special_tokens=True)

        return [clip(p) for p in prompts]

    def assert_close_prefill_logits(
        self,
        prompts,
        model_path,
        tp_size,
        torch_dtype,
        prefill_tolerance,
    ) -> None:
        truncated_prompts = self._truncate_prompts(prompts, model_path)

        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as hf_runner:
            hf_outputs = hf_runner.forward(truncated_prompts)

        with SRTRunner(
            model_path,
            tp_size=tp_size,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as srt_runner:
            srt_outputs = srt_runner.forward(truncated_prompts)

        for i, prompt in enumerate(prompts):
            hf_embedding = torch.Tensor(hf_outputs.embed_logits[i])
            srt_embedding = torch.Tensor(srt_outputs.embed_logits[i])
            similarity = torch.tensor(get_similarities(hf_embedding, srt_embedding))
            print("similarity diff", abs(similarity - 1))
            # Only short prompts are gated on the tolerance, as upstream did.
            if len(prompt) <= 1000:
                assert torch.all(
                    abs(similarity - 1) < prefill_tolerance
                ), "embeddings are not all close"

    def test_prefill_logits(self):
        # CI runs a single randomly chosen model to keep runtime bounded.
        models_to_test = [random.choice(MODELS)] if is_in_ci() else MODELS
        for model, tp_size, prefill_tolerance in models_to_test:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_prefill_logits(
                    DEFAULT_PROMPTS, model, tp_size, torch_dtype, prefill_tolerance
                )


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,162 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# python -m unittest test_encoder_embedding_models.TestEncoderEmbeddingModels.test_prefill_logits
import multiprocessing as mp
import random
import time
import unittest
import torch
from transformers import AutoConfig, AutoTokenizer
from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
from sglang.test.test_utils import CustomTestCase, get_similarities, is_in_ci
# (model_path, tp_size, prefill tolerance) encoder-embedding cases.
MODELS = [("BAAI/bge-small-en", 1, 1e-5), ("BAAI/bge-m3", 1, 1e-5)]
ATTENTION_BACKEND = ["torch_native", "triton", "flashinfer"]
BATCH_SIZE = [1, 2]
TORCH_DTYPES = [torch.float32, torch.float16]
# Accumulates sgl_time / transformers_time ratios across runs; printed as a
# coarse performance summary at the end of test_prefill_logits.
sgl_to_st_ratio = []
class TestEncoderEmbeddingModels(CustomTestCase):
    """Compare encoder embedding outputs between HF and SRT, and record timing.

    Fix: the performance summary printed "bacth size: " — typo corrected to
    "batch size: ".
    """

    @classmethod
    def setUpClass(cls):
        # The runners launch worker processes; force the 'spawn' start method.
        mp.set_start_method("spawn", force=True)

    def _truncate_prompts(self, prompts, model_path):
        """Clip prompts to the model's max positions minus a 20-token margin."""
        config = AutoConfig.from_pretrained(model_path)
        max_length = getattr(config, "max_position_embeddings", 512) - 20
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        truncated_prompts = []
        for prompt in prompts:
            tokens = tokenizer(prompt, return_tensors="pt", truncation=False)
            if len(tokens.input_ids[0]) > max_length:
                truncated_text = tokenizer.decode(
                    tokens.input_ids[0][: max_length - 1], skip_special_tokens=True
                )
                truncated_prompts.append(truncated_text)
            else:
                truncated_prompts.append(prompt)
        return truncated_prompts

    def assert_close_prefill_logits(
        self,
        prompts,
        model_path,
        tp_size,
        torch_dtype,
        prefill_tolerance,
        attention_backend,
        batch_size,
    ) -> None:
        """Check HF/SRT embedding closeness and append one timing ratio."""
        truncated_prompts = self._truncate_prompts(prompts, model_path)
        truncated_prompts = truncated_prompts * batch_size
        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as hf_runner:
            # warm up, then time a second forward pass
            hf_outputs = hf_runner.forward(truncated_prompts)
            st_start_time = time.perf_counter()
            hf_outputs = hf_runner.forward(truncated_prompts)
            st_end_time = time.perf_counter()
        with SRTRunner(
            model_path,
            tp_size=tp_size,
            torch_dtype=torch_dtype,
            model_type="embedding",
            attention_backend=attention_backend,
            chunked_prefill_size=-1,
            disable_radix_cache=True,
        ) as srt_runner:
            # warm up, then time a second forward pass
            srt_outputs = srt_runner.forward(truncated_prompts)
            sgl_start_time = time.perf_counter()
            srt_outputs = srt_runner.forward(truncated_prompts)
            sgl_end_time = time.perf_counter()
        transformer_time = st_end_time - st_start_time
        sgl_time = sgl_end_time - sgl_start_time
        sgl_to_st_ratio.append(sgl_time / transformer_time)
        for i in range(len(truncated_prompts)):
            hf_logits = torch.Tensor(hf_outputs.embed_logits[i])
            srt_logits = torch.Tensor(srt_outputs.embed_logits[i])
            similarity = torch.tensor(get_similarities(hf_logits, srt_logits))
            # If something is wrong, uncomment this to observe similarity.
            # print("similarity diff", abs(similarity - 1))
            if len(truncated_prompts[i]) <= 1000:
                assert torch.all(
                    abs(similarity - 1) < prefill_tolerance
                ), "embeddings are not all close"

    def test_prefill_logits(self):
        models_to_test = MODELS
        if is_in_ci():
            models_to_test = [random.choice(MODELS)]
        for model, tp_size, prefill_tolerance in models_to_test:
            for attention_backend in ATTENTION_BACKEND:
                for batch_size in BATCH_SIZE:
                    for torch_dtype in TORCH_DTYPES:
                        # NOTE: FlashInfer currently has limitations with head_dim = 32 or
                        # other dimensions.
                        # The FlashInfer head_dim limitation itself is tracked here:
                        # https://github.com/flashinfer-ai/flashinfer/issues/1048
                        #
                        # Flashinfer does not support torch.float32 for dtype_q, so skip it
                        if attention_backend == "flashinfer":
                            if (
                                model == "BAAI/bge-small-en"
                                or torch_dtype == torch.float32
                            ):
                                continue
                        self.assert_close_prefill_logits(
                            DEFAULT_PROMPTS,
                            model,
                            tp_size,
                            torch_dtype,
                            prefill_tolerance,
                            attention_backend,
                            batch_size,
                        )
        # NOTE(review): sgl_to_st_ratio receives one entry per
        # (model, backend, batch_size, dtype) combination, but only the first
        # len(BATCH_SIZE) entries are printed, labeled BATCH_SIZE[i] * 5
        # (presumably 5 == number of default prompts) — confirm intent.
        for i in range(len(BATCH_SIZE)):
            print(
                "batch size: ",
                BATCH_SIZE[i] * 5,
                "sgl_time/st_time",
                round(sgl_to_st_ratio[i], 3),
            )


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,181 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Usage:
To test a specific model locally:
1. Add it to ALL_MODELS, for example, `ModelCase("Qwen/Qwen2-1.5B")`
2. Run `ONLY_RUN=Qwen/Qwen2-1.5B python3 -m unittest test_generation_models.TestGenerationModels`
"""
import dataclasses
import multiprocessing as mp
import os
import random
import unittest
from typing import List
import torch
from sglang.test.runners import (
DEFAULT_PROMPTS,
HFRunner,
SRTRunner,
check_close_model_outputs,
)
from sglang.test.test_utils import CustomTestCase, is_in_ci
@dataclasses.dataclass
class ModelCase:
    # Model under test plus per-model comparison settings.
    model_path: str
    tp_size: int = 1
    prefill_tolerance: float = 5e-2
    decode_tolerance: float = 6e-2  # Increased to fix numerical error in issue #8614.
    rouge_l_tolerance: float = 1
    # When True, prompts of >= 1000 chars are filtered out (short-context models).
    skip_long_prompt: bool = False
    trust_remote_code: bool = False
# Popular models that run on the CI
CI_MODELS = [
    ModelCase("meta-llama/Llama-3.1-8B-Instruct"),
    ModelCase("google/gemma-2-2b"),
]
# the complete set of models to test sglang's generation model
# (includes CI_MODELS plus a wider local-only matrix)
ALL_MODELS = [
    *CI_MODELS,
    ModelCase("Qwen/Qwen2-1.5B"),
    ModelCase("Qwen/Qwen2.5-14B-Instruct"),
    ModelCase("HuggingFaceTB/SmolLM-135M-Instruct", skip_long_prompt=True),
    ModelCase("allenai/OLMo-1B-0724-hf", decode_tolerance=8e-2, skip_long_prompt=True),
    ModelCase(
        "THUDM/glm-4-9b-chat", tp_size=2, trust_remote_code=True, skip_long_prompt=True
    ),
    ModelCase("openai-community/gpt2"),
    ModelCase("microsoft/phi-1_5", trust_remote_code=True),
    ModelCase("adept/persimmon-8b-chat"),
    ModelCase("inclusionAI/Ling-lite", trust_remote_code=True),
    ModelCase("microsoft/Phi-3-small-8k-instruct", trust_remote_code=True),
    ModelCase("allenai/OLMo-2-1124-7B-Instruct", skip_long_prompt=True),
    ModelCase("ibm-granite/granite-3.0-2b-instruct", skip_long_prompt=True),
    ModelCase(
        "microsoft/Phi-3.5-MoE-instruct",
        tp_size=2,
        trust_remote_code=True,
        skip_long_prompt=True,
    ),
    ModelCase("facebook/opt-125m", skip_long_prompt=True),
    ModelCase(
        "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
        tp_size=2,
        trust_remote_code=True,
        skip_long_prompt=True,
    ),
    ModelCase(
        "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
        tp_size=8,
        trust_remote_code=True,
        skip_long_prompt=True,
    ),
]
TORCH_DTYPES = [torch.float16]
class TestGenerationModels(CustomTestCase):
    """Compare generation models' logits and output strings between HF and SRT."""

    @classmethod
    def setUpClass(cls):
        # The runners launch worker processes; force the 'spawn' start method.
        mp.set_start_method("spawn", force=True)

    def assert_close_logits_and_output_strs(
        self,
        prompts: List[str],
        model_case: ModelCase,
        torch_dtype: torch.dtype,
    ) -> None:
        """Run HF and SRT on `prompts` and check outputs within the case tolerances.

        Fix: removed a dead unpacking of prefill/decode/rouge tolerances into
        locals that were never read — check_close_model_outputs reads them
        from model_case directly.
        """
        model_path = model_case.model_path
        max_new_tokens = 32

        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="generation",
            trust_remote_code=model_case.trust_remote_code,
        ) as hf_runner:
            hf_outputs = hf_runner.forward(prompts, max_new_tokens=max_new_tokens)

        with SRTRunner(
            model_path,
            tp_size=model_case.tp_size,
            torch_dtype=torch_dtype,
            model_type="generation",
            trust_remote_code=model_case.trust_remote_code,
        ) as srt_runner:
            srt_outputs = srt_runner.forward(prompts, max_new_tokens=max_new_tokens)

        check_close_model_outputs(
            hf_outputs=hf_outputs,
            srt_outputs=srt_outputs,
            prefill_tolerance=model_case.prefill_tolerance,
            decode_tolerance=model_case.decode_tolerance,
            rouge_l_tolerance=model_case.rouge_l_tolerance,
            debug_text=f"model_path={model_path} prompts={prompts}",
        )

    @unittest.skipIf(not is_in_ci(), "Local test should run all models")
    def test_ci_models(self):
        for model_case in CI_MODELS:
            for torch_dtype in TORCH_DTYPES:
                prompts = DEFAULT_PROMPTS
                # Skip long prompts for models that do not have a long context
                if model_case.skip_long_prompt:
                    prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
                # Assert the logits and output strs are close
                self.assert_close_logits_and_output_strs(
                    prompts, model_case, torch_dtype
                )

    @unittest.skipIf(is_in_ci(), "CI only runs selected models for simplicity")
    def test_all_models(self):
        for model_case in ALL_MODELS:
            for torch_dtype in TORCH_DTYPES:
                # ONLY_RUN narrows a local run down to one model path.
                if (
                    "ONLY_RUN" in os.environ
                    and os.environ["ONLY_RUN"] != model_case.model_path
                ):
                    continue
                # Skip long prompts for models that do not have a long context
                prompts = DEFAULT_PROMPTS
                if model_case.skip_long_prompt:
                    prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
                # Assert the logits and output strs are close
                self.assert_close_logits_and_output_strs(
                    prompts, model_case, torch_dtype
                )


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,85 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import multiprocessing as mp
import unittest
import torch
from sglang.test.runners import HFRunner, SRTRunner
from sglang.test.test_utils import CustomTestCase, get_similarities
# Paired caption/image inputs (llava-bench-in-the-wild sample 023).
TEXTS = "two Subway Series sandwiches with meats, cheese, lettuce, tomatoes, and onions on a black background, accompanied by the Subway Series logo, highlighting a new sandwich series."
IMAGES = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/023.jpg"
# (model_path, tolerance on |similarity - 1|) cases to exercise.
MODELS = [
    ("Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", 1e-3),
]
TORCH_DTYPES = [torch.float16]
class TestQmeQwenModels(CustomTestCase):
    """Compare GME-Qwen2-VL text and image embeddings between HF and SRT."""

    @classmethod
    def setUpClass(cls):
        # The runners launch worker processes; force the 'spawn' start method.
        mp.set_start_method("spawn", force=True)

    def assert_close_embeddings(self, model, prefill_tolerance, torch_dtype):
        # Chat-template-formatted prompts: one plain text, one with an image slot.
        prompts_no_image = f"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{TEXTS}<|im_end|>\n<|im_start|>assistant\n<|endoftext|>"
        prompts_with_image = "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>assistant\n<|endoftext|>"

        with HFRunner(
            model,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as hf_runner:
            hf_text_embeddings = hf_runner.forward(prompts=[prompts_no_image])
            hf_image_embeddings = hf_runner.forward(
                prompts=[prompts_with_image], image_data=[IMAGES]
            )

        with SRTRunner(
            model,
            tp_size=1,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as srt_runner:
            srt_text_embeddings = srt_runner.forward(prompts=prompts_no_image)
            srt_image_embeddings = srt_runner.forward(
                prompts=prompts_with_image, image_data=IMAGES
            )

        # Compare text first, then image — same order as the failure messages.
        for label, hf_embeds, srt_embeds in (
            ("texts", hf_text_embeddings, srt_text_embeddings),
            ("images", hf_image_embeddings, srt_image_embeddings),
        ):
            similarity = get_similarities(
                hf_embeds.embed_logits[0], srt_embeds.embed_logits[0]
            )
            print(f"{label} similarity diff", abs(similarity - 1))
            assert torch.all(
                abs(similarity - 1) < prefill_tolerance
            ), "embeddings are not all close"

    def test_accuracy(self):
        for model, prefill_tolerance in MODELS:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_embeddings(model, prefill_tolerance, torch_dtype)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,53 @@
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
class TestGrok(CustomTestCase):
    """Throughput smoke test for Grok-1 served with dummy weights."""

    @classmethod
    def setUpClass(cls):
        cls.model = "lmzheng/grok-1"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # Dummy weights and a 2-layer override keep the launch cheap.
        launch_args = [
            "--load-format",
            "dummy",
            "--json-model-override-args",
            '{"num_hidden_layers": 2}',
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_args,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=64,
            max_new_tokens=256,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        # It is dummy weights so we only assert the output throughput instead of accuracy.
        self.assertGreater(metrics["output_throughput"], 1000)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,74 @@
import random
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
# Each entry: served model path, GSM8K accuracy lower bound, tensor-parallel size.
MODELS = [
    SimpleNamespace(
        model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
        accuracy=0.9,
        tp_size=4,
    ),
]
class TestLlama4(CustomTestCase):
    """GSM8K accuracy checks for Llama-4 models, one server launch per model."""

    @classmethod
    def setUpClass(cls):
        cls.base_url = DEFAULT_URL_FOR_TEST

    def test_gsm8k(self):
        for model in MODELS:
            # Fix: bind `process` before the try block. If popen_launch_server
            # itself raised, the finally clause referenced an unbound local and
            # the resulting UnboundLocalError masked the real failure.
            process = None
            try:
                process = popen_launch_server(
                    model.model,
                    self.base_url,
                    timeout=3 * DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
                    other_args=[
                        "--chat-template",
                        "llama-4",
                        "--tp-size",
                        str(model.tp_size),
                        "--mem-fraction-static",
                        "0.8",
                        "--context-length",
                        "8192",
                    ],
                )
                args = SimpleNamespace(
                    num_shots=5,
                    data_path=None,
                    num_questions=200,
                    max_new_tokens=512,
                    parallel=128,
                    host="http://127.0.0.1",
                    port=int(self.base_url.split(":")[-1]),
                )
                metrics = run_eval(args)
                print(f"{metrics=}")
                self.assertGreaterEqual(metrics["accuracy"], model.accuracy)
            except Exception as e:
                print(f"Error testing {model.model}: {e}")
                self.fail(f"Test failed for {model.model}: {e}")
            finally:
                # Ensure process cleanup happens regardless of success/failure
                if process is not None and process.poll() is None:
                    print(f"Cleaning up process {process.pid}")
                    try:
                        kill_process_tree(process.pid)
                    except Exception as e:
                        print(f"Error killing process: {e}")


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,58 @@
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
class TestMiMoMTP(CustomTestCase):
    """GSM8K accuracy check for MiMo-7B-RL with EAGLE speculative decoding."""

    @classmethod
    def setUpClass(cls):
        cls.model = "XiaomiMiMo/MiMo-7B-RL"
        cls.base_url = DEFAULT_URL_FOR_TEST
        speculative_args = [
            "--trust-remote-code",
            "--speculative-algorithm",
            "EAGLE",
            "--speculative-num-steps",
            "1",
            "--speculative-eagle-topk",
            "1",
            "--speculative-num-draft-tokens",
            "2",
            "--mem-fraction-static",
            "0.5",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=speculative_args,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], 0.7)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,77 @@
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
class TestQwen2(CustomTestCase):
    """GSM8K accuracy check for Qwen2-7B-Instruct."""

    @classmethod
    def setUpClass(cls):
        cls.model = "Qwen/Qwen2-7B-Instruct"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # One shared server per class; killed in tearDownClass.
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], 0.78)
class TestQwen2FP8(CustomTestCase):
    """GSM8K accuracy check for the FP8-quantized Qwen2-7B-Instruct checkpoint."""

    @classmethod
    def setUpClass(cls):
        cls.model = "neuralmagic/Qwen2-7B-Instruct-FP8"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # One shared server per class; killed in tearDownClass.
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], 0.78)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,92 @@
# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import multiprocessing as mp
import unittest
import torch
from sglang.test.runners import HFRunner, SRTRunner
from sglang.test.test_utils import CustomTestCase
# (model_path, tp_size, score tolerance) reward-model cases to exercise.
MODELS = [
    ("LxzGordon/URM-LLaMa-3.1-8B", 1, 4e-2),
    ("Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", 1, 4e-2),
]
TORCH_DTYPES = [torch.float16]
# PROMPT = "Jane has 12 apples. She gives 4 apples to her friend Mark, then buys 1 more apple, and finally splits all her apples equally among herself and her 2 siblings. How many apples does each person get?"
# RESPONSE1 = "1. Jane starts with 12 apples and gives 4 to Mark. 12 - 4 = 8. Jane now has 8 apples.\n2. Jane buys 1 more apple. 8 + 1 = 9. Jane now has 9 apples.\n3. Jane splits the 9 apples equally among herself and her 2 siblings (3 people in total). 9 ÷ 3 = 3 apples each. Each person gets 3 apples."
# RESPONSE2 = "1. Jane starts with 12 apples and gives 4 to Mark. 12 - 4 = 8. Jane now has 8 apples.\n2. Jane buys 1 more apple. 8 + 1 = 9. Jane now has 9 apples.\n3. Jane splits the 9 apples equally among her 2 siblings (2 people in total). 9 ÷ 2 = 4.5 apples each. Each person gets 4 apples."
# Two candidate answers to the same question; RESPONSE2 is the factually
# correct one (a sigmoid's output lies in (0, 1)).
PROMPT = (
    "What is the range of the numeric output of a sigmoid node in a neural network?"
)
RESPONSE1 = "The output of a sigmoid node is bounded between -1 and 1."
RESPONSE2 = "The output of a sigmoid node is bounded between 0 and 1."
# One single-turn conversation per candidate answer, in chat-message format.
CONVS = [
    [{"role": "user", "content": PROMPT}, {"role": "assistant", "content": RESPONSE1}],
    [{"role": "user", "content": PROMPT}, {"role": "assistant", "content": RESPONSE2}],
]
class TestRewardModels(CustomTestCase):
    """Compare reward-model scores between the HF and SRT implementations."""

    @classmethod
    def setUpClass(cls):
        # The runners launch worker processes; force the 'spawn' start method.
        mp.set_start_method("spawn", force=True)

    def assert_close_reward_scores(
        self,
        convs,
        model_path,
        tp_size,
        torch_dtype,
        tolerance,
    ) -> None:
        """Score `convs` with both runners and assert elementwise closeness."""
        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="reward",
        ) as hf_runner:
            hf_outputs = hf_runner.forward(convs)

        with SRTRunner(
            model_path,
            tp_size=tp_size,  # Fix: tp_size was accepted but never forwarded.
            torch_dtype=torch_dtype,
            model_type="reward",
        ) as srt_runner:
            # SRT scores raw prompts, so apply the chat template ourselves.
            prompts = srt_runner.tokenizer.apply_chat_template(convs, tokenize=False)
            srt_outputs = srt_runner.forward(prompts)

        hf_scores = torch.tensor(hf_outputs.scores)
        srt_scores = torch.tensor(srt_outputs.scores)
        print(f"{hf_scores=}")
        print(f"{srt_scores=}")
        assert torch.all(
            abs(hf_scores - srt_scores) < tolerance
        ), "reward scores are not all close"

    def test_reward_scores(self):
        for model, tp_size, tolerance in MODELS:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_reward_scores(
                    CONVS, model, tp_size, torch_dtype, tolerance
                )


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,181 @@
import dataclasses
import multiprocessing as mp
import unittest
from types import SimpleNamespace
from typing import List
import torch
from sglang.srt.utils import kill_process_tree
from sglang.test.runners import DEFAULT_PROMPTS, SRTRunner, check_close_model_outputs
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
)
class TestTransformersFallbackEndpoint(CustomTestCase):
    """MMLU / GSM8K accuracy checks against a server launched with the
    HuggingFace ``transformers`` model implementation (``--model-impl``)."""

    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=["--model-impl", "transformers"],
        )
        # Minimum scores the fallback implementation must reach.
        cls.mmlu_lower_bound = 0.65
        cls.gsm8k_lower_bound = 0.65

    @classmethod
    def tearDownClass(cls):
        # Tear down the server and any worker processes it spawned.
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        from sglang.test.run_eval import run_eval

        eval_args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=64,
            num_threads=32,
        )
        metrics = run_eval(eval_args)
        self.assertGreaterEqual(metrics["score"], self.mmlu_lower_bound)

    def test_gsm8k(self):
        from sglang.test.few_shot_gsm8k import run_eval

        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], self.gsm8k_lower_bound)
class TestTransformersFallbackTorchAO(TestTransformersFallbackEndpoint):
    """Same checks as the parent class, with TorchAO int4 weight-only
    quantization enabled on top of the transformers fallback."""

    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        launch_flags = [
            "--model-impl",
            "transformers",
            "--torchao-config",
            "int4wo-128",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )
        # Same accuracy floors as the unquantized run.
        cls.mmlu_lower_bound = 0.65
        cls.gsm8k_lower_bound = 0.65
@dataclasses.dataclass
class ModelCase:
    """Per-model configuration for the transformers-fallback comparison tests."""

    # Checkpoint path or HF hub id under test.
    model_path: str
    # Tensor-parallel degree used for both runs.
    tp_size: int = 1
    # Tolerances forwarded to check_close_model_outputs — presumably max
    # allowed deviation for prefill logits, decode logits, and ROUGE-L
    # respectively; confirm against the helper's definition.
    prefill_tolerance: float = 5e-2
    decode_tolerance: float = 5e-2
    rouge_l_tolerance: float = 1
    # When True, prompts of 1000+ characters are filtered out (short-context models).
    skip_long_prompt: bool = False
    trust_remote_code: bool = False
    # TorchAO quantization config string (e.g. "int4wo-128"); None disables it.
    torchao_config: str | None = None
    torch_dtype: torch.dtype = torch.float16
# Popular models that run on the CI
CI_MODELS = [
    ModelCase(DEFAULT_MODEL_NAME_FOR_TEST),
]
# Extra configurations (e.g. tensor parallelism) exercised only outside CI.
ALL_OTHER_MODELS = [
    ModelCase(DEFAULT_MODEL_NAME_FOR_TEST, tp_size=2),
]
class TestTransformersFallbackEngine(CustomTestCase):
    """Compare the ``transformers`` fallback implementation against the native
    SGLang implementation of the same model, in-process via ``SRTRunner``."""

    @classmethod
    def setUpClass(cls):
        # Runner subprocesses require the "spawn" start method (CUDA-safe).
        mp.set_start_method("spawn", force=True)

    def assert_close_logits_and_output_strs(
        self,
        prompts: List[str],
        model_case: ModelCase,
    ) -> None:
        """Run `prompts` through both implementations and require logits and
        decoded strings to match within the case's tolerances."""
        model_path = model_case.model_path
        max_new_tokens = 32

        # Forced transformers implementation: the implementation under test.
        # (Previously this result was confusingly named `srt_outputs` and the
        # native result `srt_transformers_outputs` — names were swapped.)
        with SRTRunner(
            model_path,
            tp_size=model_case.tp_size,
            torch_dtype=model_case.torch_dtype,
            model_type="generation",
            model_impl="transformers",
            trust_remote_code=model_case.trust_remote_code,
            torchao_config=model_case.torchao_config,
        ) as srt_runner:
            transformers_outputs = srt_runner.forward(
                prompts, max_new_tokens=max_new_tokens
            )

        # Native SGLang implementation: the reference.
        with SRTRunner(
            model_path,
            tp_size=model_case.tp_size,
            torch_dtype=model_case.torch_dtype,
            model_type="generation",
            trust_remote_code=model_case.trust_remote_code,
            torchao_config=model_case.torchao_config,
        ) as srt_runner:
            native_outputs = srt_runner.forward(
                prompts, max_new_tokens=max_new_tokens
            )

        check_close_model_outputs(
            hf_outputs=native_outputs,
            srt_outputs=transformers_outputs,
            prefill_tolerance=model_case.prefill_tolerance,
            decode_tolerance=model_case.decode_tolerance,
            rouge_l_tolerance=model_case.rouge_l_tolerance,
            debug_text=f"model_path={model_path} prompts={prompts}",
        )

    def test_ci_models(self):
        for model_case in CI_MODELS:
            # Skip long prompts for models that do not have a long context.
            prompts = DEFAULT_PROMPTS
            if model_case.skip_long_prompt:
                prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
            # Assert the logits and output strs are close.
            self.assert_close_logits_and_output_strs(prompts, model_case)

    def test_others(self):
        if is_in_ci():
            return
        for model_case in ALL_OTHER_MODELS:
            # Reset per model case.  The previous version assigned `prompts`
            # once before the loop, so a single skip_long_prompt case would
            # also shrink the prompt set for every subsequent case.
            prompts = DEFAULT_PROMPTS
            if model_case.skip_long_prompt:
                prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
            self.assert_close_logits_and_output_strs(prompts, model_case)
# Allow running this test module directly with `python3 <file>`.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,213 @@
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
class TestUnslothPhi4(CustomTestCase):
    """GSM8K accuracy check for the full-precision unsloth/phi-4 checkpoint."""

    @classmethod
    def setUpClass(cls):
        # Launch an SGLang server serving the checkpoint under test.
        cls.model = "unsloth/phi-4"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[],
        )

    @classmethod
    def tearDownClass(cls):
        # Shut down the server and any workers it spawned.
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], 0.78)
class TestUnslothPhi4Bnb4bit(CustomTestCase):
    """GSM8K accuracy check for the 4-bit bitsandbytes phi-4 checkpoint."""

    @classmethod
    def setUpClass(cls):
        cls.model = "unsloth/phi-4-bnb-4bit"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # Quantized checkpoints need the bitsandbytes weight loader.
        launch_flags = ["--load-format", "bitsandbytes"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        # Slightly lower floor than full precision to allow quantization loss.
        self.assertGreater(metrics["accuracy"], 0.75)
class TestUnslothPhi4UnslothBnb4bit(CustomTestCase):
    """GSM8K accuracy check for the unsloth-dynamic 4-bit phi-4 checkpoint."""

    @classmethod
    def setUpClass(cls):
        cls.model = "unsloth/phi-4-unsloth-bnb-4bit"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # Quantized checkpoints need the bitsandbytes weight loader.
        launch_flags = ["--load-format", "bitsandbytes"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        # Slightly lower floor than full precision to allow quantization loss.
        self.assertGreater(metrics["accuracy"], 0.75)
class TestUnslothPhi4MiniInstruct(CustomTestCase):
    """GSM8K accuracy check for the full-precision Phi-4-mini-instruct model."""

    @classmethod
    def setUpClass(cls):
        cls.model = "unsloth/Phi-4-mini-instruct"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        # Lower floor than phi-4: the mini model is weaker on GSM8K.
        self.assertGreater(metrics["accuracy"], 0.65)
class TestUnslothPhi4MiniBnb4bit(CustomTestCase):
    """GSM8K accuracy check for the 4-bit bitsandbytes Phi-4-mini checkpoint."""

    @classmethod
    def setUpClass(cls):
        cls.model = "unsloth/Phi-4-mini-instruct-bnb-4bit"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # Quantized checkpoints need the bitsandbytes weight loader.
        launch_flags = ["--load-format", "bitsandbytes"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        # Mini model + quantization: lowest accuracy floor in this file.
        self.assertGreater(metrics["accuracy"], 0.6)
class TestUnslothPhi4MiniUnslothBnb4bit(CustomTestCase):
    """GSM8K accuracy check for the unsloth-dynamic 4-bit Phi-4-mini model."""

    @classmethod
    def setUpClass(cls):
        cls.model = "unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # Quantized checkpoints need the bitsandbytes weight loader.
        launch_flags = ["--load-format", "bitsandbytes"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        server_port = int(self.base_url.split(":")[-1])
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=server_port,
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        # Mini model + quantization: lowest accuracy floor in this file.
        self.assertGreater(metrics["accuracy"], 0.6)
# Allow running this test module directly with `python3 <file>`.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,315 @@
import argparse
import glob
import json
import os
import random
import subprocess
import sys
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
)
# VLM models for testing: (checkpoint, minimum acceptable MMMU val accuracy).
_VLM_MODEL_SPECS = [
    ("google/gemma-3-27b-it", 0.45),
    ("Qwen/Qwen2.5-VL-3B-Instruct", 0.4),
    ("openbmb/MiniCPM-V-2_6", 0.4),
]
MODELS = [
    SimpleNamespace(model=checkpoint, mmmu_accuracy=threshold)
    for checkpoint, threshold in _VLM_MODEL_SPECS
]
class TestVLMModels(CustomTestCase):
    """MMMU-benchmark regression tests for VLM models served by SGLang.

    Each test launches a server via ``popen_launch_server`` and drives the
    MMMU validation split through ``lmms_eval``'s OpenAI-compatible client.
    """

    # Populated from the command line under ``__main__``; stays None when the
    # module is collected by a runner such as pytest (see fallback below).
    parsed_args = None  # Class variable to store args

    @classmethod
    def setUpClass(cls):
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.time_out = DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
        # Set OpenAI API key and base URL environment variables. Needed for lmm-evals to work.
        os.environ["OPENAI_API_KEY"] = cls.api_key
        os.environ["OPENAI_API_BASE"] = f"{cls.base_url}/v1"

    def _detect_eviction_in_logs(self, log_output):
        """Detect if eviction events occurred in the log output.

        Returns:
            Tuple of (eviction_detected, eviction_count).
        """
        eviction_keywords = ["Cache eviction: evicted"]
        eviction_detected = False
        eviction_count = 0
        for line in log_output.split("\n"):
            if any(keyword in line for keyword in eviction_keywords):
                eviction_detected = True
                eviction_count += 1
                print(f"Eviction detected: {line.strip()}")
        return eviction_detected, eviction_count

    def run_mmmu_eval(
        self,
        model_version: str,
        output_path: str,
        *,
        env: dict | None = None,
    ):
        """
        Evaluate a VLM on the MMMU validation set with lmmseval.
        Only `model_version` (checkpoint) and `chat_template` vary;
        We are focusing only on the validation set due to resource constraints.

        Args:
            model_version: Checkpoint id passed to the lmms_eval client.
            output_path: Directory where lmms_eval writes its JSON results.
            env: Optional environment for the subprocess; None inherits ours.
        """
        # -------- fixed settings --------
        model = "openai_compatible"
        tp = 1
        tasks = "mmmu_val"
        batch_size = 2
        log_suffix = "openai_compatible"
        os.makedirs(output_path, exist_ok=True)
        # -------- compose --model_args --------
        model_args = f'model_version="{model_version}",' f"tp={tp}"
        # -------- build command list --------
        cmd = [
            "python3",
            "-m",
            "lmms_eval",
            "--model",
            model,
            "--model_args",
            model_args,
            "--tasks",
            tasks,
            "--batch_size",
            str(batch_size),
            "--log_samples",
            "--log_samples_suffix",
            log_suffix,
            "--output_path",
            str(output_path),
        ]
        # Forward the caller-supplied environment; previously the `env`
        # parameter was accepted but silently ignored (env=None inherits ours).
        subprocess.run(
            cmd,
            check=True,
            timeout=3600,
            env=env,
        )

    def _run_vlm_mmmu_test(
        self,
        model,
        output_path,
        test_name="",
        custom_env=None,
        log_level="info",
        capture_output=False,
    ):
        """
        Common method to run VLM MMMU benchmark test.
        Args:
            model: Namespace with `.model` (checkpoint) and `.mmmu_accuracy` (floor)
            output_path: Path for output logs
            test_name: Optional test name for logging
            custom_env: Optional custom environment variables
            log_level: Log level for server (default: "info")
            capture_output: Whether to capture server stdout/stderr
        Returns:
            Captured server output (empty string unless capture_output=True).
        """
        print(f"\nTesting model: {model.model}{test_name}")
        process = None
        mmmu_accuracy = 0  # Initialize to handle potential exceptions
        server_output = ""
        # Initialize before the try block so the finally clause can always
        # reference these names, even if an early statement raises.
        stdout_file = None
        stderr_file = None
        try:
            # Prepare environment variables
            process_env = os.environ.copy()
            if custom_env:
                process_env.update(custom_env)
            # Prepare stdout/stderr redirection if needed
            if capture_output:
                stdout_file = open("/tmp/server_stdout.log", "w")
                stderr_file = open("/tmp/server_stderr.log", "w")
            # Fall back to the argparse default (0.8) when the module runs
            # without __main__ (e.g. under pytest) and parsed_args is None;
            # previously this raised AttributeError.
            mem_fraction_static = (
                self.parsed_args.mem_fraction_static
                if self.parsed_args is not None
                else 0.8
            )
            # Launch server for testing
            process = popen_launch_server(
                model.model,
                base_url=self.base_url,
                timeout=self.time_out,
                api_key=self.api_key,
                other_args=[
                    "--trust-remote-code",
                    "--cuda-graph-max-bs",
                    "32",
                    "--enable-multimodal",
                    "--mem-fraction-static",
                    str(mem_fraction_static),
                    "--log-level",
                    log_level,
                ],
                env=process_env,
                return_stdout_stderr=(
                    (stdout_file, stderr_file) if capture_output else None
                ),
            )
            # Run evaluation
            self.run_mmmu_eval(model.model, output_path)
            # Get the result file
            result_file_path = glob.glob(f"{output_path}/*.json")[0]
            with open(result_file_path, "r") as f:
                result = json.load(f)
            print(f"Result{test_name}\n: {result}")
            # Process the result
            mmmu_accuracy = result["results"]["mmmu_val"]["mmmu_acc,none"]
            print(
                f"Model {model.model} achieved accuracy{test_name}: {mmmu_accuracy:.4f}"
            )
            # Capture server output if requested
            if capture_output and process:
                server_output = self._read_output_from_files()
            # Assert performance meets expected threshold
            self.assertGreaterEqual(
                mmmu_accuracy,
                model.mmmu_accuracy,
                f"Model {model.model} accuracy ({mmmu_accuracy:.4f}) below expected threshold ({model.mmmu_accuracy:.4f}){test_name}",
            )
            return server_output
        except Exception as e:
            print(f"Error testing {model.model}{test_name}: {e}")
            self.fail(f"Test failed for {model.model}{test_name}: {e}")
        finally:
            # Ensure process cleanup happens regardless of success/failure
            if process is not None and process.poll() is None:
                print(f"Cleaning up process {process.pid}")
                try:
                    kill_process_tree(process.pid)
                except Exception as e:
                    print(f"Error killing process: {e}")
            # clean up temporary files
            if capture_output:
                if stdout_file:
                    stdout_file.close()
                if stderr_file:
                    stderr_file.close()
                for filename in ["/tmp/server_stdout.log", "/tmp/server_stderr.log"]:
                    try:
                        if os.path.exists(filename):
                            os.remove(filename)
                    except Exception as e:
                        # Fixed: this message previously printed the literal
                        # "(unknown)" instead of the offending file name.
                        print(f"Error removing {filename}: {e}")

    def _read_output_from_files(self):
        """Read captured server stdout/stderr logs, tagging each line."""
        output_lines = []
        log_files = [
            ("/tmp/server_stdout.log", "[STDOUT]"),
            ("/tmp/server_stderr.log", "[STDERR]"),
        ]
        for filename, tag in log_files:
            try:
                if os.path.exists(filename):
                    with open(filename, "r") as f:
                        for line in f:
                            output_lines.append(f"{tag} {line.rstrip()}")
            except Exception as e:
                print(f"Error reading {tag.lower()} file: {e}")
        return "\n".join(output_lines)

    def test_vlm_mmmu_benchmark(self):
        """Test VLM models against MMMU benchmark."""
        models_to_test = MODELS
        if is_in_ci():
            # Keep CI cheap: exercise one randomly chosen model.
            models_to_test = [random.choice(MODELS)]
        for model in models_to_test:
            self._run_vlm_mmmu_test(model, "./logs")

    def test_vlm_mmmu_benchmark_with_small_cache(self):
        """Test VLM models against MMMU benchmark with a small embedding cache to force eviction."""
        models_to_test = MODELS
        if is_in_ci():
            models_to_test = [random.choice(MODELS)]
        for model in models_to_test:
            custom_env = {"SGLANG_VLM_CACHE_SIZE_MB": "5"}
            # Run the test with output capture
            server_output = self._run_vlm_mmmu_test(
                model,
                "./logs_small_cache",
                test_name=" with small embedding cache (evict test)",
                custom_env=custom_env,
                log_level="debug",  # Enable debug logging for eviction detection
                capture_output=True,  # Capture server output
            )
            # Print server output for debugging
            print("Server output:\n", server_output)
            # Analyze server output for eviction events
            eviction_detected, eviction_count = self._detect_eviction_in_logs(
                server_output
            )
            # Assert that eviction was detected (since we're using small cache)
            self.assertTrue(
                eviction_detected,
                f"Expected eviction events to be detected with small cache (5MB), but none found. "
                f"Cache size may be too large for the workload or eviction logic may not be working. "
                f"Total log content length: {len(server_output)} characters",
            )
            print(
                f"Eviction detection summary: {eviction_count} eviction events detected"
            )
            # Additional assertion: if eviction was detected, the test passed
            if eviction_detected:
                print("✅ Eviction logic successfully triggered and detected!")
if __name__ == "__main__":
    # Define and parse custom arguments here, before handing off to unittest.
    parser = argparse.ArgumentParser(description="Test VLM models")
    parser.add_argument(
        "--mem-fraction-static",
        type=float,
        help="Static memory fraction for the model",
        default=0.8,
    )
    # Parse our custom flags (unittest never sees them).
    args = parser.parse_args()
    # Store the parsed args object on the class
    TestVLMModels.parsed_args = args
    # Pass a stripped argv (program name only) so unittest does not try to
    # interpret the custom flags above.
    unittest.main(argv=[sys.argv[0]])