sglangv0.5.2 & support Qwen3-Next-80B-A3B-Instruct
This commit is contained in:
52
test/srt/models/compare.py
Normal file
52
test/srt/models/compare.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
used for debug using tensor comparison
|
||||
dump {name: tensor} into "log_hf.jsonl" and "log_srt.jsonl"
|
||||
use the same name for two tensors that supposed to be close
|
||||
recommend name like: "layer 2 after mlp"
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
|
||||
import torch
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
assert sys.argv[1] == "base"
|
||||
hf_log = "base_log_hf.jsonl"
|
||||
srt_log = "base_log_srt.jsonl"
|
||||
else:
|
||||
hf_log = "log_hf.jsonl"
|
||||
srt_log = "log_srt.jsonl"
|
||||
|
||||
|
||||
def load_data(filepath):
    """Load a jsonl dump of ``{name: tensor-as-list}`` into ``{name: torch.Tensor}``.

    Each line is one JSON object; entries on later lines overwrite earlier
    entries that reuse the same name.
    """
    tensors = {}
    with open(filepath, "r") as f:
        # Iterate the file lazily instead of materializing every line with
        # readlines() — same result, no whole-file buffer.
        for line in f:
            data = json.loads(line)
            for k, v in data.items():
                tensors[k] = torch.tensor(v)
    return tensors
|
||||
|
||||
|
||||
# Materialize both dumps once so all subsequent comparisons are in-memory.
hf_tensors = load_data(hf_log)
srt_tensors = load_data(srt_log)
|
||||
|
||||
|
||||
def get_diff(t1, t2):
    """Return ``(l2_distance, max_abs_diff)`` between two tensors.

    ``t1`` is reshaped to ``t2``'s shape first, so the tensors only need to
    agree in element count, not in layout.
    """
    t1 = t1.reshape(t2.shape)
    # Bug fix: the original reshaped t1 a second time inside the max()
    # computation even though t1 was already reshaped above.
    max_diff = torch.max(torch.abs(t1 - t2))
    l2_dis = torch.dist(t1, t2, p=2)
    return l2_dis, max_diff
|
||||
|
||||
|
||||
# Compare every tensor dumped by SRT against the HF dump with the same name.
# NOTE(review): only iterates keys present in the SRT dump; a missing key in
# hf_tensors raises KeyError. Values of .items() are unused (keys suffice).
for k, _ in srt_tensors.items():
    l2_dis, max_diff = get_diff(hf_tensors[k], srt_tensors[k])
    print(f"{k} {l2_dis=} {max_diff=}")
    # Ad-hoc debugging hooks: dump full tensors / shapes for specific layers.
    if k == "layer 1 attn":
        print(hf_tensors[k])
        print(srt_tensors[k])
    if k == "layer 0 prefill k":
        print(srt_tensors[k].shape)
        print(hf_tensors[k].shape)
|
||||
80
test/srt/models/test_clip_models.py
Normal file
80
test/srt/models/test_clip_models.py
Normal file
@@ -0,0 +1,80 @@
|
||||
# Copyright 2023-2024 SGLang Team
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
import multiprocessing as mp
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from sglang.srt.utils import load_image
|
||||
from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
|
||||
from sglang.test.test_utils import get_similarities
|
||||
|
||||
TEXTS = "two Subway Series sandwiches with meats, cheese, lettuce, tomatoes, and onions on a black background, accompanied by the Subway Series logo, highlighting a new sandwich series."
|
||||
IMAGES = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/023.jpg"
|
||||
MODELS = [
|
||||
("openai/clip-vit-large-patch14-336", 1e-5),
|
||||
]
|
||||
TORCH_DTYPES = [torch.float16]
|
||||
|
||||
|
||||
class TestClipModels(unittest.TestCase):
    """Check CLIP text/image embeddings from SGLang against HuggingFace."""

    @classmethod
    def setUpClass(cls):
        # Runner subprocesses require the "spawn" start method.
        mp.set_start_method("spawn", force=True)

    def assert_close_embeddings(self, model, prefill_tolerance, torch_dtype):
        """Embed TEXTS/IMAGES with both runtimes and assert similarity ~ 1."""
        with HFRunner(
            model, torch_dtype=torch_dtype, model_type="embedding"
        ) as hf_runner:
            hf_text = hf_runner.forward(prompts=TEXTS)
            hf_image = hf_runner.forward(image_data=IMAGES)

        with SRTRunner(
            model, tp_size=1, torch_dtype=torch_dtype, model_type="embedding"
        ) as srt_runner:
            srt_text = srt_runner.forward(prompts=TEXTS)
            srt_image = srt_runner.forward(prompts="padding", image_data=IMAGES)

        text_similarity = get_similarities(
            srt_text.embed_logits[0], hf_text.embed_logits[0]
        )
        image_similarity = get_similarities(
            srt_image.embed_logits[0], hf_image.embed_logits[0]
        )
        print("text similarity diff", abs(text_similarity - 1))
        print("image similarity diff", abs(image_similarity - 1))

        text_ok = abs(text_similarity - 1) < prefill_tolerance
        assert torch.all(text_ok), "embeddings are not all close"
        image_ok = abs(image_similarity - 1) < prefill_tolerance
        assert torch.all(image_ok), "embeddings are not all close"

    def test_accuracy(self):
        for model, prefill_tolerance in MODELS:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_embeddings(model, prefill_tolerance, torch_dtype)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
46
test/srt/models/test_compressed_tensors_models.py
Normal file
46
test/srt/models/test_compressed_tensors_models.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.test.few_shot_gsm8k import run_eval
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
CustomTestCase,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
|
||||
class TestCompressedTensorsLlama3FP8(CustomTestCase):
    """GSM8K accuracy smoke test for an FP8 compressed-tensors Llama-3.1 model."""

    @classmethod
    def setUpClass(cls):
        # Launch an SGLang server for the checkpoint under test.
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.model = "RedHatAI/Meta-Llama-3.1-8B-FP8"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[],
        )

    @classmethod
    def tearDownClass(cls):
        # Terminate the server and any children it spawned.
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        """Run 5-shot GSM8K against the server and require >= 0.45 accuracy."""
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreaterEqual(metrics["accuracy"], 0.45)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
91
test/srt/models/test_cross_encoder_models.py
Normal file
91
test/srt/models/test_cross_encoder_models.py
Normal file
@@ -0,0 +1,91 @@
|
||||
import multiprocessing as mp
|
||||
import random
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
from transformers import AutoConfig, AutoTokenizer
|
||||
|
||||
from sglang.test.runners import TEST_RERANK_QUERY_DOCS, HFRunner, SRTRunner
|
||||
from sglang.test.test_utils import CustomTestCase, is_in_ci
|
||||
|
||||
MODELS = [
|
||||
("cross-encoder/ms-marco-MiniLM-L6-v2", 1, 1e-2),
|
||||
("BAAI/bge-reranker-v2-m3", 1, 1e-2),
|
||||
]
|
||||
ATTENTION_BACKEND = ["torch_native", "triton"]
|
||||
|
||||
TORCH_DTYPES = [torch.float32]
|
||||
|
||||
|
||||
class TestCrossEncoderModels(CustomTestCase):
    """Check SGLang cross-encoder reranker scores against HuggingFace."""

    @classmethod
    def setUpClass(cls):
        # Runner subprocesses require the "spawn" start method.
        mp.set_start_method("spawn", force=True)

    def assert_close_prefill_logits(
        self,
        prompts,
        model_path,
        tp_size,
        torch_dtype,
        score_tolerance,
        attention_backend,
    ) -> None:
        """Assert every (query, document) score matches HF within tolerance."""
        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="cross_encoder",
        ) as hf_runner:
            reference_scores = hf_runner.forward(prompts).scores

        with SRTRunner(
            model_path,
            tp_size=tp_size,
            torch_dtype=torch_dtype,
            model_type="cross_encoder",
            attention_backend=attention_backend,
            chunked_prefill_size=-1,
            disable_radix_cache=True,
        ) as srt_runner:
            candidate_scores = srt_runner.forward(prompts).scores

        for idx, candidate in enumerate(candidate_scores):
            score_difference = abs(reference_scores[idx] - candidate)
            assert (
                score_difference < score_tolerance
            ), "cross encoder scores are not all close"

    def preprocess_prompts(self, prompt):
        """Expand a {query, documents} dict into [query, document] pairs."""
        return [[prompt["query"], document] for document in prompt["documents"]]

    def test_prefill_logits(self):
        models_to_test = MODELS
        if is_in_ci():
            # Keep CI fast: test a single randomly chosen model.
            models_to_test = [random.choice(MODELS)]

        for model, tp_size, prefill_tolerance in models_to_test:
            for attention_backend in ATTENTION_BACKEND:
                for query_docs in TEST_RERANK_QUERY_DOCS:
                    pairs = self.preprocess_prompts(query_docs)
                    for torch_dtype in TORCH_DTYPES:
                        self.assert_close_prefill_logits(
                            pairs,
                            model,
                            tp_size,
                            torch_dtype,
                            prefill_tolerance,
                            attention_backend,
                        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
34
test/srt/models/test_dummy_grok_models.py
Normal file
34
test/srt/models/test_dummy_grok_models.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import unittest
|
||||
|
||||
from sglang.test.test_utils import CustomTestCase, is_in_ci, run_bench_one_batch
|
||||
|
||||
|
||||
class TestDummyGrok1(CustomTestCase):
    """Throughput smoke test of a 2-layer dummy-weight Grok-1 via bench_one_batch."""

    def test_dummy_grok_1(self):
        # Dummy weights: only server mechanics and throughput are meaningful.
        cli_args = [
            "--model",
            "/dummy-grok",
            "--tokenizer-path",
            "Xenova/grok-1-tokenizer",
            "--batch-size",
            "2",
            "--tp",
            "2",
            "--quantization",
            "fp8",
            "--load-format",
            "dummy",
            "--json-model-override-args",
            '{"num_hidden_layers": 2}',
        ]
        _, output_throughput, _ = run_bench_one_batch(None, cli_args)

        if is_in_ci():
            self.assertGreater(output_throughput, 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
111
test/srt/models/test_embedding_models.py
Normal file
111
test/srt/models/test_embedding_models.py
Normal file
@@ -0,0 +1,111 @@
|
||||
# Copyright 2023-2024 SGLang Team
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
import multiprocessing as mp
|
||||
import random
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
from transformers import AutoConfig, AutoTokenizer
|
||||
|
||||
from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
|
||||
from sglang.test.test_utils import CustomTestCase, get_similarities, is_in_ci
|
||||
|
||||
MODELS = [
|
||||
("Alibaba-NLP/gte-Qwen2-1.5B-instruct", 1, 1e-5),
|
||||
("intfloat/e5-mistral-7b-instruct", 1, 1e-5),
|
||||
("marco/mcdse-2b-v1", 1, 1e-5),
|
||||
("Qwen/Qwen3-Embedding-8B", 1, 1e-5),
|
||||
# Temporarily disable before this model is fixed
|
||||
# ("jason9693/Qwen2.5-1.5B-apeach", 1, 1e-5),
|
||||
]
|
||||
TORCH_DTYPES = [torch.float16]
|
||||
|
||||
|
||||
class TestEmbeddingModels(CustomTestCase):
    # Compares sentence embeddings produced by the SGLang runtime against
    # HuggingFace reference outputs for the models listed in MODELS.

    @classmethod
    def setUpClass(cls):
        # Runner subprocesses require the "spawn" start method.
        mp.set_start_method("spawn", force=True)

    def _truncate_prompts(self, prompts, model_path):
        """Truncate each prompt to the model's positional limit.

        Prompts longer than the limit are re-decoded from their first
        ``max_length - 1`` tokens; shorter prompts pass through unchanged.
        """
        config = AutoConfig.from_pretrained(model_path)
        # Fall back to 2048 when the config declares no limit.
        max_length = getattr(config, "max_position_embeddings", 2048)

        tokenizer = AutoTokenizer.from_pretrained(model_path)

        truncated_prompts = []
        for prompt in prompts:
            tokens = tokenizer(prompt, return_tensors="pt", truncation=False)
            if len(tokens.input_ids[0]) > max_length:
                truncated_text = tokenizer.decode(
                    tokens.input_ids[0][: max_length - 1], skip_special_tokens=True
                )
                truncated_prompts.append(truncated_text)
            else:
                truncated_prompts.append(prompt)
        return truncated_prompts

    def assert_close_prefill_logits(
        self,
        prompts,
        model_path,
        tp_size,
        torch_dtype,
        prefill_tolerance,
    ) -> None:
        """Assert per-prompt HF/SRT embedding similarity is within tolerance."""
        truncated_prompts = self._truncate_prompts(prompts, model_path)

        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as hf_runner:
            hf_outputs = hf_runner.forward(truncated_prompts)

        with SRTRunner(
            model_path,
            tp_size=tp_size,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as srt_runner:
            srt_outputs = srt_runner.forward(truncated_prompts)

        for i in range(len(prompts)):
            hf_logits = torch.Tensor(hf_outputs.embed_logits[i])
            srt_logits = torch.Tensor(srt_outputs.embed_logits[i])

            similarity = torch.tensor(get_similarities(hf_logits, srt_logits))
            print("similarity diff", abs(similarity - 1))

            # Only prompts up to 1000 chars are strictly checked — presumably
            # because truncation differences dominate for longer ones; confirm.
            if len(prompts[i]) <= 1000:
                assert torch.all(
                    abs(similarity - 1) < prefill_tolerance
                ), "embeddings are not all close"

    def test_prefill_logits(self):
        models_to_test = MODELS

        # Keep CI runtime bounded by sampling a single model.
        if is_in_ci():
            models_to_test = [random.choice(MODELS)]

        for model, tp_size, prefill_tolerance in models_to_test:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_prefill_logits(
                    DEFAULT_PROMPTS, model, tp_size, torch_dtype, prefill_tolerance
                )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
162
test/srt/models/test_encoder_embedding_models.py
Normal file
162
test/srt/models/test_encoder_embedding_models.py
Normal file
@@ -0,0 +1,162 @@
|
||||
# Copyright 2023-2024 SGLang Team
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
# python -m unittest test_encoder_embedding_models.TestEncoderEmbeddingModels.test_prefill_logits
|
||||
|
||||
import multiprocessing as mp
|
||||
import random
|
||||
import time
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
from transformers import AutoConfig, AutoTokenizer
|
||||
|
||||
from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
|
||||
from sglang.test.test_utils import CustomTestCase, get_similarities, is_in_ci
|
||||
|
||||
MODELS = [("BAAI/bge-small-en", 1, 1e-5), ("BAAI/bge-m3", 1, 1e-5)]
|
||||
|
||||
ATTENTION_BACKEND = ["torch_native", "triton", "flashinfer"]
|
||||
BATCH_SIZE = [1, 2]
|
||||
TORCH_DTYPES = [torch.float32, torch.float16]
|
||||
sgl_to_st_ratio = []
|
||||
|
||||
|
||||
class TestEncoderEmbeddingModels(CustomTestCase):
    # Compares encoder-style (BERT-like) embedding models between HF and SRT
    # across attention backends, batch sizes, and dtypes, and records a rough
    # SGLang-vs-transformers timing ratio in module-level sgl_to_st_ratio.

    @classmethod
    def setUpClass(cls):
        # Runner subprocesses require the "spawn" start method.
        mp.set_start_method("spawn", force=True)

    def _truncate_prompts(self, prompts, model_path):
        """Truncate prompts to the model context minus a 20-token margin."""
        config = AutoConfig.from_pretrained(model_path)
        # The 20-token margin leaves room for special tokens.
        max_length = getattr(config, "max_position_embeddings", 512) - 20

        tokenizer = AutoTokenizer.from_pretrained(model_path)

        truncated_prompts = []
        for prompt in prompts:
            tokens = tokenizer(prompt, return_tensors="pt", truncation=False)
            if len(tokens.input_ids[0]) > max_length:
                truncated_text = tokenizer.decode(
                    tokens.input_ids[0][: max_length - 1], skip_special_tokens=True
                )
                truncated_prompts.append(truncated_text)
            else:
                truncated_prompts.append(prompt)

        return truncated_prompts

    def assert_close_prefill_logits(
        self,
        prompts,
        model_path,
        tp_size,
        torch_dtype,
        prefill_tolerance,
        attention_backend,
        batch_size,
    ) -> None:
        """Assert HF/SRT embedding similarity and record a timing ratio."""
        truncated_prompts = self._truncate_prompts(prompts, model_path)
        # Replicate the prompt list to emulate a larger batch.
        truncated_prompts = truncated_prompts * batch_size

        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as hf_runner:
            # warm up
            hf_outputs = hf_runner.forward(truncated_prompts)

            st_start_time = time.perf_counter()
            hf_outputs = hf_runner.forward(truncated_prompts)
            st_end_time = time.perf_counter()

        with SRTRunner(
            model_path,
            tp_size=tp_size,
            torch_dtype=torch_dtype,
            model_type="embedding",
            attention_backend=attention_backend,
            chunked_prefill_size=-1,
            disable_radix_cache=True,
        ) as srt_runner:
            # warm up
            srt_outputs = srt_runner.forward(truncated_prompts)

            sgl_start_time = time.perf_counter()
            srt_outputs = srt_runner.forward(truncated_prompts)
            sgl_end_time = time.perf_counter()

        transformer_time = st_end_time - st_start_time
        sgl_time = sgl_end_time - sgl_start_time
        # Module-level accumulator; read back at the end of test_prefill_logits.
        sgl_to_st_ratio.append(sgl_time / transformer_time)

        for i in range(len(truncated_prompts)):
            hf_logits = torch.Tensor(hf_outputs.embed_logits[i])
            srt_logits = torch.Tensor(srt_outputs.embed_logits[i])

            similarity = torch.tensor(get_similarities(hf_logits, srt_logits))
            # If something is wrong, uncomment this to observe similarity.
            # print("similarity diff", abs(similarity - 1))

            # Only short prompts are strictly checked.
            if len(truncated_prompts[i]) <= 1000:
                assert torch.all(
                    abs(similarity - 1) < prefill_tolerance
                ), "embeddings are not all close"

    def test_prefill_logits(self):
        models_to_test = MODELS

        # Keep CI runtime bounded by sampling a single model.
        if is_in_ci():
            models_to_test = [random.choice(MODELS)]

        for model, tp_size, prefill_tolerance in models_to_test:
            for attention_backend in ATTENTION_BACKEND:
                for batch_size in BATCH_SIZE:
                    for torch_dtype in TORCH_DTYPES:
                        # NOTE: FlashInfer currently has limitations with head_dim = 32 or
                        # other dimensions.
                        # The FlashInfer head_dim limitation itself is tracked here:
                        # https://github.com/flashinfer-ai/flashinfer/issues/1048
                        #
                        # Flashinfer does not support torch.float32 for dtype_q, so skip it
                        if attention_backend == "flashinfer":
                            if (
                                model == "BAAI/bge-small-en"
                                or torch_dtype == torch.float32
                            ):
                                continue

                        self.assert_close_prefill_logits(
                            DEFAULT_PROMPTS,
                            model,
                            tp_size,
                            torch_dtype,
                            prefill_tolerance,
                            attention_backend,
                            batch_size,
                        )

        # NOTE(review): "bacth" is a typo in the output label, and the "* 5"
        # assumes 5 base prompts; indexing sgl_to_st_ratio by BATCH_SIZE index
        # only lines up when exactly one (model, backend, dtype) combination
        # ran per batch size — confirm before relying on this report.
        for i in range(len(BATCH_SIZE)):
            print(
                "bacth size: ",
                BATCH_SIZE[i] * 5,
                "sgl_time/st_time",
                round(sgl_to_st_ratio[i], 3),
            )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
181
test/srt/models/test_generation_models.py
Normal file
181
test/srt/models/test_generation_models.py
Normal file
@@ -0,0 +1,181 @@
|
||||
# Copyright 2023-2024 SGLang Team
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
Usage:
|
||||
|
||||
To test a specific model locally:
|
||||
1. Add it to ALL_MODELS, for example, `ModelCase("Qwen/Qwen2-1.5B")`
|
||||
2. Run `ONLY_RUN=Qwen/Qwen2-1.5B python3 -m unittest test_generation_models.TestGenerationModels`
|
||||
"""
|
||||
|
||||
import dataclasses
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import random
|
||||
import unittest
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
|
||||
from sglang.test.runners import (
|
||||
DEFAULT_PROMPTS,
|
||||
HFRunner,
|
||||
SRTRunner,
|
||||
check_close_model_outputs,
|
||||
)
|
||||
from sglang.test.test_utils import CustomTestCase, is_in_ci
|
||||
|
||||
|
||||
@dataclasses.dataclass
class ModelCase:
    # One model configuration for the HF-vs-SRT generation comparison.
    model_path: str
    # Tensor-parallel world size used when launching the SRT runner.
    tp_size: int = 1
    # Max allowed prefill logprob deviation from the HF reference.
    prefill_tolerance: float = 5e-2
    decode_tolerance: float = 6e-2  # Increased to fix numerical error in issue #8614.
    # Max allowed ROUGE-L distance between generated strings.
    rouge_l_tolerance: float = 1
    # When True, prompts of 1000+ characters are filtered out for this model.
    skip_long_prompt: bool = False
    trust_remote_code: bool = False
|
||||
|
||||
|
||||
# Popular models that run on the CI
|
||||
CI_MODELS = [
|
||||
ModelCase("meta-llama/Llama-3.1-8B-Instruct"),
|
||||
ModelCase("google/gemma-2-2b"),
|
||||
]
|
||||
|
||||
# the complete set of models to test sglang's generation model
|
||||
ALL_MODELS = [
|
||||
*CI_MODELS,
|
||||
ModelCase("Qwen/Qwen2-1.5B"),
|
||||
ModelCase("Qwen/Qwen2.5-14B-Instruct"),
|
||||
ModelCase("HuggingFaceTB/SmolLM-135M-Instruct", skip_long_prompt=True),
|
||||
ModelCase("allenai/OLMo-1B-0724-hf", decode_tolerance=8e-2, skip_long_prompt=True),
|
||||
ModelCase(
|
||||
"THUDM/glm-4-9b-chat", tp_size=2, trust_remote_code=True, skip_long_prompt=True
|
||||
),
|
||||
ModelCase("openai-community/gpt2"),
|
||||
ModelCase("microsoft/phi-1_5", trust_remote_code=True),
|
||||
ModelCase("adept/persimmon-8b-chat"),
|
||||
ModelCase("inclusionAI/Ling-lite", trust_remote_code=True),
|
||||
ModelCase("microsoft/Phi-3-small-8k-instruct", trust_remote_code=True),
|
||||
ModelCase("allenai/OLMo-2-1124-7B-Instruct", skip_long_prompt=True),
|
||||
ModelCase("ibm-granite/granite-3.0-2b-instruct", skip_long_prompt=True),
|
||||
ModelCase(
|
||||
"microsoft/Phi-3.5-MoE-instruct",
|
||||
tp_size=2,
|
||||
trust_remote_code=True,
|
||||
skip_long_prompt=True,
|
||||
),
|
||||
ModelCase("facebook/opt-125m", skip_long_prompt=True),
|
||||
ModelCase(
|
||||
"nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
|
||||
tp_size=2,
|
||||
trust_remote_code=True,
|
||||
skip_long_prompt=True,
|
||||
),
|
||||
ModelCase(
|
||||
"nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
|
||||
tp_size=8,
|
||||
trust_remote_code=True,
|
||||
skip_long_prompt=True,
|
||||
),
|
||||
]
|
||||
|
||||
TORCH_DTYPES = [torch.float16]
|
||||
|
||||
|
||||
class TestGenerationModels(CustomTestCase):
    """Compare SGLang generation outputs (logits and strings) against HF."""

    @classmethod
    def setUpClass(cls):
        # Runner subprocesses require the "spawn" start method.
        mp.set_start_method("spawn", force=True)

    def assert_close_logits_and_output_strs(
        self,
        prompts: List[str],
        model_case: ModelCase,
        torch_dtype: torch.dtype,
    ) -> None:
        """Run HF and SRT on the same prompts and assert their closeness.

        Raises (via check_close_model_outputs) when prefill/decode logprobs
        or the ROUGE-L distance exceed the case's tolerances.
        """
        model_path = model_case.model_path
        max_new_tokens = 32

        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="generation",
            trust_remote_code=model_case.trust_remote_code,
        ) as hf_runner:
            hf_outputs = hf_runner.forward(prompts, max_new_tokens=max_new_tokens)

        with SRTRunner(
            model_path,
            tp_size=model_case.tp_size,
            torch_dtype=torch_dtype,
            model_type="generation",
            trust_remote_code=model_case.trust_remote_code,
        ) as srt_runner:
            srt_outputs = srt_runner.forward(prompts, max_new_tokens=max_new_tokens)

        check_close_model_outputs(
            hf_outputs=hf_outputs,
            srt_outputs=srt_outputs,
            prefill_tolerance=model_case.prefill_tolerance,
            decode_tolerance=model_case.decode_tolerance,
            rouge_l_tolerance=model_case.rouge_l_tolerance,
            debug_text=f"model_path={model_path} prompts={prompts}",
        )

    def _run_model_cases(self, model_cases: List[ModelCase]) -> None:
        """Shared driver: run every (case, dtype) combination.

        Factored out of test_ci_models/test_all_models, which previously
        duplicated the prompt-selection and assertion loop.
        """
        for model_case in model_cases:
            for torch_dtype in TORCH_DTYPES:
                prompts = DEFAULT_PROMPTS
                # Skip long prompts for models that do not have a long context
                if model_case.skip_long_prompt:
                    prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]

                # Assert the logits and output strs are close
                self.assert_close_logits_and_output_strs(
                    prompts, model_case, torch_dtype
                )

    @unittest.skipIf(not is_in_ci(), "Local test should run all models")
    def test_ci_models(self):
        self._run_model_cases(CI_MODELS)

    @unittest.skipIf(is_in_ci(), "CI only runs selected models for simplicity")
    def test_all_models(self):
        # ONLY_RUN=<model_path> restricts the sweep to a single model.
        only_run = os.environ.get("ONLY_RUN")
        cases = [
            case
            for case in ALL_MODELS
            if only_run is None or case.model_path == only_run
        ]
        self._run_model_cases(cases)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
85
test/srt/models/test_gme_qwen_models.py
Normal file
85
test/srt/models/test_gme_qwen_models.py
Normal file
@@ -0,0 +1,85 @@
|
||||
# Copyright 2023-2024 SGLang Team
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
import multiprocessing as mp
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
|
||||
from sglang.test.runners import HFRunner, SRTRunner
|
||||
from sglang.test.test_utils import CustomTestCase, get_similarities
|
||||
|
||||
TEXTS = "two Subway Series sandwiches with meats, cheese, lettuce, tomatoes, and onions on a black background, accompanied by the Subway Series logo, highlighting a new sandwich series."
|
||||
IMAGES = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/023.jpg"
|
||||
|
||||
|
||||
MODELS = [
|
||||
("Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", 1e-3),
|
||||
]
|
||||
TORCH_DTYPES = [torch.float16]
|
||||
|
||||
|
||||
class TestQmeQwenModels(CustomTestCase):
    # Compares GME Qwen2-VL multimodal embeddings (text and image) between
    # the HuggingFace and SGLang runtimes.

    @classmethod
    def setUpClass(cls):
        # Runner subprocesses require the "spawn" start method.
        mp.set_start_method("spawn", force=True)

    def assert_close_embeddings(self, model, prefill_tolerance, torch_dtype):
        """Assert HF/SRT text and image embedding similarities are within tolerance."""

        # Chat-formatted prompts; the image prompt carries the vision pad tokens.
        prompts_no_image = f"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{TEXTS}<|im_end|>\n<|im_start|>assistant\n<|endoftext|>"
        prompts_with_image = f"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>assistant\n<|endoftext|>"
        with HFRunner(
            model,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as hf_runner:
            hf_text_embeddings = hf_runner.forward(prompts=[prompts_no_image])
            hf_image_embeddings = hf_runner.forward(
                prompts=[prompts_with_image], image_data=[IMAGES]
            )
        with SRTRunner(
            model,
            tp_size=1,
            torch_dtype=torch_dtype,
            model_type="embedding",
        ) as srt_runner:
            # NOTE(review): SRT receives bare strings here while HF receives
            # lists — presumably both runners normalize to a batch; confirm.
            srt_text_embeddings = srt_runner.forward(prompts=prompts_no_image)
            srt_image_embeddings = srt_runner.forward(
                prompts=prompts_with_image, image_data=IMAGES
            )

        similarity = get_similarities(
            hf_text_embeddings.embed_logits[0], srt_text_embeddings.embed_logits[0]
        )
        print("texts similarity diff", abs(similarity - 1))
        assert torch.all(
            abs(similarity - 1) < prefill_tolerance
        ), "embeddings are not all close"
        similarity = get_similarities(
            hf_image_embeddings.embed_logits[0], srt_image_embeddings.embed_logits[0]
        )
        print("images similarity diff", abs(similarity - 1))
        assert torch.all(
            abs(similarity - 1) < prefill_tolerance
        ), "embeddings are not all close"

    def test_accuracy(self):
        for model, prefill_tolerance in MODELS:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_embeddings(model, prefill_tolerance, torch_dtype)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
53
test/srt/models/test_grok_models.py
Normal file
53
test/srt/models/test_grok_models.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.test.few_shot_gsm8k import run_eval
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
CustomTestCase,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
|
||||
class TestGrok(CustomTestCase):
    """Launch a 2-layer dummy-weight Grok-1 server and check GSM8K throughput."""

    @classmethod
    def setUpClass(cls):
        cls.model = "lmzheng/grok-1"
        cls.base_url = DEFAULT_URL_FOR_TEST
        # Dummy weights with only two layers: fast to launch, no real accuracy.
        launch_args = [
            "--load-format",
            "dummy",
            "--json-model-override-args",
            '{"num_hidden_layers": 2}',
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_args,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=64,
            max_new_tokens=256,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")

        # It is dummy weights so we only assert the output throughput instead of accuracy.
        self.assertGreater(metrics["output_throughput"], 1000)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
74
test/srt/models/test_llama4_models.py
Normal file
74
test/srt/models/test_llama4_models.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import random
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.test.few_shot_gsm8k import run_eval
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
CustomTestCase,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
MODELS = [
    SimpleNamespace(
        model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
        accuracy=0.9,
        tp_size=4,
    ),
]


class TestLlama4(CustomTestCase):
    """GSM8K accuracy tests for the Llama-4 checkpoints listed in MODELS.

    Each checkpoint gets its own server launch/teardown inside the loop so a
    failure for one model cannot leak a running server into the next run.
    """

    @classmethod
    def setUpClass(cls):
        cls.base_url = DEFAULT_URL_FOR_TEST

    def test_gsm8k(self):
        for model in MODELS:
            # Bug fix: initialize `process` before the try block.  If
            # popen_launch_server itself raised, the original code hit an
            # UnboundLocalError in the finally clause instead of surfacing
            # the real launch failure.
            process = None
            try:
                process = popen_launch_server(
                    model.model,
                    self.base_url,
                    timeout=3 * DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
                    other_args=[
                        "--chat-template",
                        "llama-4",
                        "--tp-size",
                        str(model.tp_size),
                        "--mem-fraction-static",
                        "0.8",
                        "--context-length",
                        "8192",
                    ],
                )
                args = SimpleNamespace(
                    num_shots=5,
                    data_path=None,
                    num_questions=200,
                    max_new_tokens=512,
                    parallel=128,
                    host="http://127.0.0.1",
                    port=int(self.base_url.split(":")[-1]),
                )
                metrics = run_eval(args)
                print(f"{metrics=}")
                self.assertGreaterEqual(metrics["accuracy"], model.accuracy)
            except Exception as e:
                print(f"Error testing {model.model}: {e}")
                self.fail(f"Test failed for {model.model}: {e}")

            finally:
                # Ensure process cleanup happens regardless of success/failure
                if process is not None and process.poll() is None:
                    print(f"Cleaning up process {process.pid}")
                    try:
                        kill_process_tree(process.pid)
                    except Exception as e:
                        print(f"Error killing process: {e}")


if __name__ == "__main__":
    unittest.main()
|
||||
58
test/srt/models/test_mtp_models.py
Normal file
58
test/srt/models/test_mtp_models.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.test.few_shot_gsm8k import run_eval
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
CustomTestCase,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
|
||||
class TestMiMoMTP(CustomTestCase):
    """GSM8K accuracy test for MiMo-7B-RL launched with EAGLE-style
    speculative decoding (one draft step, top-1, two draft tokens)."""

    @classmethod
    def setUpClass(cls):
        cls.model = "XiaomiMiMo/MiMo-7B-RL"
        cls.base_url = DEFAULT_URL_FOR_TEST
        spec_args = [
            "--trust-remote-code",
            "--speculative-algorithm",
            "EAGLE",
            "--speculative-num-steps",
            "1",
            "--speculative-eagle-topk",
            "1",
            "--speculative-num-draft-tokens",
            "2",
            "--mem-fraction-static",
            "0.5",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=spec_args,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], 0.7)


if __name__ == "__main__":
    unittest.main()
|
||||
77
test/srt/models/test_qwen_models.py
Normal file
77
test/srt/models/test_qwen_models.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.test.few_shot_gsm8k import run_eval
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
CustomTestCase,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
|
||||
class TestQwen2(CustomTestCase):
    """GSM8K accuracy test for the full-precision Qwen2-7B-Instruct checkpoint."""

    @classmethod
    def setUpClass(cls):
        cls.model = "Qwen/Qwen2-7B-Instruct"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], 0.78)
|
||||
|
||||
|
||||
class TestQwen2FP8(CustomTestCase):
    """GSM8K accuracy test for the FP8-quantized Qwen2-7B-Instruct checkpoint.

    Uses the same 0.78 threshold as the full-precision run.
    """

    @classmethod
    def setUpClass(cls):
        cls.model = "neuralmagic/Qwen2-7B-Instruct-FP8"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], 0.78)


if __name__ == "__main__":
    unittest.main()
|
||||
92
test/srt/models/test_reward_models.py
Normal file
92
test/srt/models/test_reward_models.py
Normal file
@@ -0,0 +1,92 @@
|
||||
# Copyright 2023-2024 SGLang Team
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
import multiprocessing as mp
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
|
||||
from sglang.test.runners import HFRunner, SRTRunner
|
||||
from sglang.test.test_utils import CustomTestCase
|
||||
|
||||
# (model_path, tp_size, tolerance) triples for the reward-score comparison.
MODELS = [
    ("LxzGordon/URM-LLaMa-3.1-8B", 1, 4e-2),
    ("Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", 1, 4e-2),
]
TORCH_DTYPES = [torch.float16]

# Alternative fixture kept for reference:
# PROMPT = "Jane has 12 apples. She gives 4 apples to her friend Mark, then buys 1 more apple, and finally splits all her apples equally among herself and her 2 siblings. How many apples does each person get?"
# RESPONSE1 = "1. Jane starts with 12 apples and gives 4 to Mark. 12 - 4 = 8. Jane now has 8 apples.\n2. Jane buys 1 more apple. 8 + 1 = 9. Jane now has 9 apples.\n3. Jane splits the 9 apples equally among herself and her 2 siblings (3 people in total). 9 ÷ 3 = 3 apples each. Each person gets 3 apples."
# RESPONSE2 = "1. Jane starts with 12 apples and gives 4 to Mark. 12 - 4 = 8. Jane now has 8 apples.\n2. Jane buys 1 more apple. 8 + 1 = 9. Jane now has 9 apples.\n3. Jane splits the 9 apples equally among her 2 siblings (2 people in total). 9 ÷ 2 = 4.5 apples each. Each person gets 4 apples."

# One correct and one incorrect answer to the same question; a good reward
# model should score RESPONSE2 higher than RESPONSE1.
PROMPT = (
    "What is the range of the numeric output of a sigmoid node in a neural network?"
)
RESPONSE1 = "The output of a sigmoid node is bounded between -1 and 1."
RESPONSE2 = "The output of a sigmoid node is bounded between 0 and 1."

CONVS = [
    [{"role": "user", "content": PROMPT}, {"role": "assistant", "content": RESPONSE1}],
    [{"role": "user", "content": PROMPT}, {"role": "assistant", "content": RESPONSE2}],
]


class TestRewardModels(CustomTestCase):
    """Checks that SRT reward scores closely match the HuggingFace reference."""

    @classmethod
    def setUpClass(cls):
        mp.set_start_method("spawn", force=True)

    def assert_close_reward_scores(
        self,
        convs,
        model_path,
        tp_size,
        torch_dtype,
        tolerance,
    ) -> None:
        """Score `convs` with both runners and require elementwise closeness.

        Raises a test failure if any |hf - srt| score difference reaches
        `tolerance`.
        """
        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="reward",
        ) as hf_runner:
            hf_outputs = hf_runner.forward(convs)

        with SRTRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="reward",
        ) as srt_runner:
            prompts = srt_runner.tokenizer.apply_chat_template(convs, tokenize=False)
            srt_outputs = srt_runner.forward(prompts)

        hf_scores = torch.tensor(hf_outputs.scores)
        srt_scores = torch.tensor(srt_outputs.scores)
        print(f"{hf_scores=}")
        print(f"{srt_scores=}")

        # Bug fix: use a unittest assertion instead of a bare `assert`, which
        # is stripped when Python runs with optimizations (-O) and would turn
        # this check into a no-op.
        self.assertTrue(
            bool(torch.all(abs(hf_scores - srt_scores) < tolerance)),
            "reward scores are not all close",
        )

    def test_reward_scores(self):
        for model, tp_size, tolerance in MODELS:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_reward_scores(
                    CONVS, model, tp_size, torch_dtype, tolerance
                )


if __name__ == "__main__":
    unittest.main()
|
||||
181
test/srt/models/test_transformers_models.py
Normal file
181
test/srt/models/test_transformers_models.py
Normal file
@@ -0,0 +1,181 @@
|
||||
import dataclasses
|
||||
import multiprocessing as mp
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.test.runners import DEFAULT_PROMPTS, SRTRunner, check_close_model_outputs
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
CustomTestCase,
|
||||
is_in_ci,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
|
||||
class TestTransformersFallbackEndpoint(CustomTestCase):
    """MMLU and GSM8K accuracy tests for a server launched with the
    HuggingFace-transformers model implementation (``--model-impl transformers``)."""

    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=["--model-impl", "transformers"],
        )
        # Thresholds overridden by subclasses with different launch configs.
        cls.mmlu_lower_bound = 0.65
        cls.gsm8k_lower_bound = 0.65

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        from sglang.test.run_eval import run_eval

        eval_args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=64,
            num_threads=32,
        )
        metrics = run_eval(eval_args)
        self.assertGreaterEqual(metrics["score"], self.mmlu_lower_bound)

    def test_gsm8k(self):
        from sglang.test.few_shot_gsm8k import run_eval

        eval_args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(eval_args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], self.gsm8k_lower_bound)
|
||||
|
||||
|
||||
class TestTransformersFallbackTorchAO(TestTransformersFallbackEndpoint):
    """Same MMLU/GSM8K checks as the parent, but with torchao int4
    weight-only quantization enabled on top of the transformers fallback."""

    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        launch_args = [
            "--model-impl",
            "transformers",
            "--torchao-config",
            "int4wo-128",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_args,
        )
        cls.mmlu_lower_bound = 0.65
        cls.gsm8k_lower_bound = 0.65
|
||||
|
||||
|
||||
@dataclasses.dataclass
class ModelCase:
    """A model checkpoint plus per-model tolerances for output comparison."""

    model_path: str
    tp_size: int = 1
    prefill_tolerance: float = 5e-2
    decode_tolerance: float = 5e-2
    rouge_l_tolerance: float = 1
    skip_long_prompt: bool = False
    trust_remote_code: bool = False
    # Annotation fixed: the default (and common) value is None, so the field
    # is optional.  The string form avoids needing a typing import.
    torchao_config: "str | None" = None
    torch_dtype: torch.dtype = torch.float16
|
||||
|
||||
|
||||
# Popular models that run on the CI
CI_MODELS = [
    ModelCase(DEFAULT_MODEL_NAME_FOR_TEST),
]

ALL_OTHER_MODELS = [
    ModelCase(DEFAULT_MODEL_NAME_FOR_TEST, tp_size=2),
]


class TestTransformersFallbackEngine(CustomTestCase):
    """Compares engine outputs of the transformers fallback implementation
    against the default implementation for the same checkpoint."""

    @classmethod
    def setUpClass(cls):
        mp.set_start_method("spawn", force=True)

    def assert_close_logits_and_output_strs(
        self,
        prompts: List[str],
        model_case: ModelCase,
    ) -> None:
        """Run the checkpoint with and without the transformers fallback and
        require close prefill/decode logits and generated strings."""
        model_path = model_case.model_path
        max_new_tokens = 32
        shared_kwargs = dict(
            tp_size=model_case.tp_size,
            torch_dtype=model_case.torch_dtype,
            model_type="generation",
            trust_remote_code=model_case.trust_remote_code,
            torchao_config=model_case.torchao_config,
        )

        # force to use transformers impl
        with SRTRunner(
            model_path,
            model_impl="transformers",
            **shared_kwargs,
        ) as srt_runner:
            srt_outputs = srt_runner.forward(prompts, max_new_tokens=max_new_tokens)

        with SRTRunner(
            model_path,
            **shared_kwargs,
        ) as srt_runner:
            srt_transformers_outputs = srt_runner.forward(
                prompts, max_new_tokens=max_new_tokens
            )

        check_close_model_outputs(
            hf_outputs=srt_transformers_outputs,
            srt_outputs=srt_outputs,
            prefill_tolerance=model_case.prefill_tolerance,
            decode_tolerance=model_case.decode_tolerance,
            rouge_l_tolerance=model_case.rouge_l_tolerance,
            debug_text=f"model_path={model_path} prompts={prompts}",
        )

    def test_ci_models(self):
        for model_case in CI_MODELS:
            # Skip long prompts for models that do not have a long context
            prompts = DEFAULT_PROMPTS
            if model_case.skip_long_prompt:
                prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]
            # Assert the logits and output strs are close
            self.assert_close_logits_and_output_strs(prompts, model_case)

    def test_others(self):
        if is_in_ci():
            return

        # Skip long prompts for models that do not have a long context
        prompts = DEFAULT_PROMPTS
        for model_case in ALL_OTHER_MODELS:
            if model_case.skip_long_prompt:
                prompts = [p for p in DEFAULT_PROMPTS if len(p) < 1000]

            # Assert the logits and output strs are close
            self.assert_close_logits_and_output_strs(prompts, model_case)


if __name__ == "__main__":
    unittest.main()
|
||||
213
test/srt/models/test_unsloth_models.py
Normal file
213
test/srt/models/test_unsloth_models.py
Normal file
@@ -0,0 +1,213 @@
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.test.few_shot_gsm8k import run_eval
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
CustomTestCase,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
|
||||
class _UnslothGSM8KBase:
    """Shared scaffolding for the unsloth GSM8K accuracy tests.

    This is a mixin: it deliberately does NOT inherit from CustomTestCase so
    unittest will not collect it as a test case itself.  Concrete subclasses
    (which also inherit CustomTestCase) only set:

      model_name         -- HuggingFace checkpoint to serve
      accuracy_threshold -- minimum GSM8K accuracy required to pass
      launch_args        -- extra server CLI args (e.g. bitsandbytes loading)

    The original file repeated the same setUpClass/tearDownClass/test_gsm8k
    body six times; this consolidates it while keeping every public class
    name and test method unchanged.
    """

    model_name = ""
    accuracy_threshold = 0.0
    launch_args = []

    @classmethod
    def setUpClass(cls):
        cls.model = cls.model_name
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=list(cls.launch_args),
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(args)
        print(f"{metrics=}")
        self.assertGreater(metrics["accuracy"], self.accuracy_threshold)


# CLI args shared by every bitsandbytes-quantized checkpoint below.
_BNB_LOAD_ARGS = ["--load-format", "bitsandbytes"]


class TestUnslothPhi4(_UnslothGSM8KBase, CustomTestCase):
    model_name = "unsloth/phi-4"
    accuracy_threshold = 0.78


class TestUnslothPhi4Bnb4bit(_UnslothGSM8KBase, CustomTestCase):
    model_name = "unsloth/phi-4-bnb-4bit"
    accuracy_threshold = 0.75
    launch_args = _BNB_LOAD_ARGS


class TestUnslothPhi4UnslothBnb4bit(_UnslothGSM8KBase, CustomTestCase):
    model_name = "unsloth/phi-4-unsloth-bnb-4bit"
    accuracy_threshold = 0.75
    launch_args = _BNB_LOAD_ARGS


class TestUnslothPhi4MiniInstruct(_UnslothGSM8KBase, CustomTestCase):
    model_name = "unsloth/Phi-4-mini-instruct"
    accuracy_threshold = 0.65


class TestUnslothPhi4MiniBnb4bit(_UnslothGSM8KBase, CustomTestCase):
    model_name = "unsloth/Phi-4-mini-instruct-bnb-4bit"
    accuracy_threshold = 0.6
    launch_args = _BNB_LOAD_ARGS


class TestUnslothPhi4MiniUnslothBnb4bit(_UnslothGSM8KBase, CustomTestCase):
    model_name = "unsloth/Phi-4-mini-instruct-unsloth-bnb-4bit"
    accuracy_threshold = 0.6
    launch_args = _BNB_LOAD_ARGS


if __name__ == "__main__":
    unittest.main()
|
||||
315
test/srt/models/test_vlm_models.py
Normal file
315
test/srt/models/test_vlm_models.py
Normal file
@@ -0,0 +1,315 @@
|
||||
import argparse
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import subprocess
|
||||
import sys
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
CustomTestCase,
|
||||
is_in_ci,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
# VLM models for testing
MODELS = [
    SimpleNamespace(model="google/gemma-3-27b-it", mmmu_accuracy=0.45),
    SimpleNamespace(
        model="Qwen/Qwen2.5-VL-3B-Instruct",
        mmmu_accuracy=0.4,
    ),
    SimpleNamespace(model="openbmb/MiniCPM-V-2_6", mmmu_accuracy=0.4),
]


class TestVLMModels(CustomTestCase):
    """Runs the MMMU validation benchmark (via lmms-eval) against VLMs served
    by the sglang server, optionally with a tiny multimodal embedding cache
    to force and detect cache evictions."""

    parsed_args = None  # Class variable to store args (set in __main__)

    @classmethod
    def setUpClass(cls):
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.time_out = DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH

        # Set OpenAI API key and base URL environment variables. Needed for lmm-evals to work.
        os.environ["OPENAI_API_KEY"] = cls.api_key
        os.environ["OPENAI_API_BASE"] = f"{cls.base_url}/v1"

    def _detect_eviction_in_logs(self, log_output):
        """Return (detected, count) of cache-eviction events in `log_output`."""
        eviction_keywords = ["Cache eviction: evicted"]

        eviction_detected = False
        eviction_count = 0

        for line in log_output.split("\n"):
            if any(keyword in line for keyword in eviction_keywords):
                eviction_detected = True
                eviction_count += 1
                print(f"Eviction detected: {line.strip()}")

        return eviction_detected, eviction_count

    def run_mmmu_eval(
        self,
        model_version: str,
        output_path: str,
        *,
        env: dict | None = None,
    ):
        """
        Evaluate a VLM on the MMMU validation set with lmms-eval.
        Only `model_version` (checkpoint) varies; other settings are fixed.
        We are focusing only on the validation set due to resource constraints.

        Bug fix: `env` was accepted but never forwarded, so callers could not
        actually customize the subprocess environment; it is now passed to
        subprocess.run (None means inherit the current environment).
        """
        # -------- fixed settings --------
        model = "openai_compatible"
        tp = 1
        tasks = "mmmu_val"
        batch_size = 2
        log_suffix = "openai_compatible"
        os.makedirs(output_path, exist_ok=True)

        # -------- compose --model_args --------
        model_args = f'model_version="{model_version}",' f"tp={tp}"

        # -------- build command list --------
        cmd = [
            "python3",
            "-m",
            "lmms_eval",
            "--model",
            model,
            "--model_args",
            model_args,
            "--tasks",
            tasks,
            "--batch_size",
            str(batch_size),
            "--log_samples",
            "--log_samples_suffix",
            log_suffix,
            "--output_path",
            str(output_path),
        ]

        subprocess.run(
            cmd,
            check=True,
            timeout=3600,
            env=env,
        )

    def _run_vlm_mmmu_test(
        self,
        model,
        output_path,
        test_name="",
        custom_env=None,
        log_level="info",
        capture_output=False,
    ):
        """
        Common method to run VLM MMMU benchmark test.

        Args:
            model: Model to test
            output_path: Path for output logs
            test_name: Optional test name for logging
            custom_env: Optional custom environment variables
            log_level: Log level for server (default: "info")
            capture_output: Whether to capture server stdout/stderr
        """
        print(f"\nTesting model: {model.model}{test_name}")

        process = None
        mmmu_accuracy = 0  # Initialize to handle potential exceptions
        server_output = ""
        # Bug fix: initialize the log-file handles before the try block so the
        # finally clause can always reference them; previously an exception
        # raised before their assignment caused an UnboundLocalError.
        stdout_file = None
        stderr_file = None

        try:
            # Prepare environment variables
            process_env = os.environ.copy()
            if custom_env:
                process_env.update(custom_env)

            # Prepare stdout/stderr redirection if needed
            if capture_output:
                stdout_file = open("/tmp/server_stdout.log", "w")
                stderr_file = open("/tmp/server_stderr.log", "w")

            # Launch server for testing
            process = popen_launch_server(
                model.model,
                base_url=self.base_url,
                timeout=self.time_out,
                api_key=self.api_key,
                other_args=[
                    "--trust-remote-code",
                    "--cuda-graph-max-bs",
                    "32",
                    "--enable-multimodal",
                    "--mem-fraction-static",
                    str(self.parsed_args.mem_fraction_static),  # Use class variable
                    "--log-level",
                    log_level,
                ],
                env=process_env,
                return_stdout_stderr=(
                    (stdout_file, stderr_file) if capture_output else None
                ),
            )

            # Run evaluation
            self.run_mmmu_eval(model.model, output_path)

            # Get the result file
            result_file_path = glob.glob(f"{output_path}/*.json")[0]

            with open(result_file_path, "r") as f:
                result = json.load(f)
                print(f"Result{test_name}\n: {result}")

            # Process the result
            mmmu_accuracy = result["results"]["mmmu_val"]["mmmu_acc,none"]
            print(
                f"Model {model.model} achieved accuracy{test_name}: {mmmu_accuracy:.4f}"
            )

            # Capture server output if requested
            if capture_output and process:
                server_output = self._read_output_from_files()

            # Assert performance meets expected threshold
            self.assertGreaterEqual(
                mmmu_accuracy,
                model.mmmu_accuracy,
                f"Model {model.model} accuracy ({mmmu_accuracy:.4f}) below expected threshold ({model.mmmu_accuracy:.4f}){test_name}",
            )

            return server_output

        except Exception as e:
            print(f"Error testing {model.model}{test_name}: {e}")
            self.fail(f"Test failed for {model.model}{test_name}: {e}")

        finally:
            # Ensure process cleanup happens regardless of success/failure
            if process is not None and process.poll() is None:
                print(f"Cleaning up process {process.pid}")
                try:
                    kill_process_tree(process.pid)
                except Exception as e:
                    print(f"Error killing process: {e}")

            # clean up temporary files
            if capture_output:
                if stdout_file:
                    stdout_file.close()
                if stderr_file:
                    stderr_file.close()
                for filename in ["/tmp/server_stdout.log", "/tmp/server_stderr.log"]:
                    try:
                        if os.path.exists(filename):
                            os.remove(filename)
                    except Exception as e:
                        # Bug fix: include the path so failures are attributable.
                        print(f"Error removing {filename}: {e}")

    def _read_output_from_files(self):
        """Read the captured server stdout/stderr logs, tagging each line."""
        output_lines = []

        log_files = [
            ("/tmp/server_stdout.log", "[STDOUT]"),
            ("/tmp/server_stderr.log", "[STDERR]"),
        ]
        for filename, tag in log_files:
            try:
                if os.path.exists(filename):
                    with open(filename, "r") as f:
                        for line in f:
                            output_lines.append(f"{tag} {line.rstrip()}")
            except Exception as e:
                print(f"Error reading {tag.lower()} file: {e}")

        return "\n".join(output_lines)

    def test_vlm_mmmu_benchmark(self):
        """Test VLM models against MMMU benchmark."""
        models_to_test = MODELS

        if is_in_ci():
            models_to_test = [random.choice(MODELS)]

        for model in models_to_test:
            self._run_vlm_mmmu_test(model, "./logs")

    def test_vlm_mmmu_benchmark_with_small_cache(self):
        """Test VLM models against MMMU benchmark with a small embedding cache to force eviction."""
        models_to_test = MODELS

        if is_in_ci():
            models_to_test = [random.choice(MODELS)]

        for model in models_to_test:
            custom_env = {"SGLANG_VLM_CACHE_SIZE_MB": "5"}

            # Run the test with output capture
            server_output = self._run_vlm_mmmu_test(
                model,
                "./logs_small_cache",
                test_name=" with small embedding cache (evict test)",
                custom_env=custom_env,
                log_level="debug",  # Enable debug logging for eviction detection
                capture_output=True,  # Capture server output
            )

            # Print server output for debugging
            print("Server output:\n", server_output)

            # Analyze server output for eviction events
            eviction_detected, eviction_count = self._detect_eviction_in_logs(
                server_output
            )

            # Assert that eviction was detected (since we're using small cache)
            self.assertTrue(
                eviction_detected,
                f"Expected eviction events to be detected with small cache (5MB), but none found. "
                f"Cache size may be too large for the workload or eviction logic may not be working. "
                f"Total log content length: {len(server_output)} characters",
            )

            print(
                f"Eviction detection summary: {eviction_count} eviction events detected"
            )

            # Additional assertion: if eviction was detected, the test passed
            if eviction_detected:
                print("✅ Eviction logic successfully triggered and detected!")


if __name__ == "__main__":
    # Define and parse arguments here, before unittest.main
    parser = argparse.ArgumentParser(description="Test VLM models")
    parser.add_argument(
        "--mem-fraction-static",
        type=float,
        help="Static memory fraction for the model",
        default=0.8,
    )

    # Parse args intended for unittest
    args = parser.parse_args()

    # Store the parsed args object on the class
    TestVLMModels.parsed_args = args

    # Pass args to unittest
    unittest.main(argv=[sys.argv[0]])
|
||||
Reference in New Issue
Block a user