refactor(test): reorganize OpenAI test file structure (#7408)

This commit is contained in:
Chang Su
2025-06-21 19:37:48 -07:00
committed by GitHub
parent 1998ce4046
commit b7a2df0a44
27 changed files with 350 additions and 294 deletions

View File

@@ -0,0 +1,103 @@
"""
python3 -m unittest openai_server.validation.test_large_max_new_tokens.TestLargeMaxNewTokens.test_chat_completion
"""
import os
import time
import unittest
from concurrent.futures import ThreadPoolExecutor
import openai
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
STDERR_FILENAME,
STDOUT_FILENAME,
CustomTestCase,
popen_launch_server,
)
class TestLargeMaxNewTokens(CustomTestCase):
    """Verify that several requests asking for huge generations are batched.

    The server runs with a small token budget and a clipped new-token
    estimation (SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION=256) so the scheduler
    can admit all requests concurrently; the test watches the server's
    stderr log for a batch containing every request.
    """

    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        # Keep handles open for the server's lifetime; closed in tearDownClass.
        cls.stdout = open(STDOUT_FILENAME, "w")
        cls.stderr = open(STDERR_FILENAME, "w")
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=(
                # Full option names for consistency with the other server
                # tests (the previous truncated forms "--max-total-token" /
                # "--context-len" only worked via argparse prefix matching).
                "--max-total-tokens",
                "1536",
                "--context-length",
                "8192",
                "--decode-log-interval",
                "2",
            ),
            env={"SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION": "256", **os.environ},
            return_stdout_stderr=(cls.stdout, cls.stderr),
        )
        cls.base_url += "/v1"
        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)
        cls.stdout.close()
        cls.stderr.close()
        os.remove(STDOUT_FILENAME)
        os.remove(STDERR_FILENAME)

    def run_chat_completion(self):
        """Send one chat request whose prompt asks for a very long answer."""
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are a helpful AI assistant"},
                {
                    "role": "user",
                    "content": "Please repeat the world 'hello' for 10000 times.",
                },
            ],
            temperature=0,
        )
        return response

    def test_chat_completion(self):
        num_requests = 4
        futures = []
        with ThreadPoolExecutor(num_requests) as executor:
            # Send multiple requests
            for i in range(num_requests):
                futures.append(executor.submit(self.run_chat_completion))

            # Ensure that they are running concurrently by scanning the
            # server log for a batch that contains all of them. `pt` is the
            # index of the first log line not yet inspected; it is set to -1
            # once the target line is found.
            all_requests_running = False  # fixed: was unbound if never set
            deadline = time.monotonic() + 600  # fixed: bound the polling loop
            pt = 0
            while pt >= 0 and time.monotonic() < deadline:
                time.sleep(5)
                # fixed: close the log file each poll instead of leaking it
                with open(STDERR_FILENAME) as log_file:
                    lines = log_file.readlines()
                for line in lines[pt:]:
                    print(line, end="", flush=True)
                    if f"#running-req: {num_requests}" in line:
                        all_requests_running = True
                        pt = -1
                        break
                    pt += 1
            assert all_requests_running
# Allow running this test module directly with `python <file>.py`.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,140 @@
import json
import unittest
import requests
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
# A prompt engineered to elicit a very long generation, so that both
# stop-string and max_tokens ("length") finish reasons can be exercised
# reliably in the tests below.
MANY_NEW_TOKENS_PROMPT = """
Please write an extremely detailed and vivid fantasy story, set in a world full of intricate magic systems, political intrigue, and complex characters.
Ensure that you thoroughly describe every scene, character's motivations, and the environment. Include long, engaging dialogues and elaborate on the inner thoughts of the characters.
Each section should be as comprehensive as possible to create a rich and immersive experience for the reader.
The story should span multiple events, challenges, and character developments over time. Aim to make the story at least 3,000 words long.
"""
class TestMatchedStop(CustomTestCase):
    """Validate `finish_reason` and `matched_stop` on /v1/completions and
    /v1/chat/completions for stop strings, EOS tokens, and length limits."""

    @classmethod
    def setUpClass(cls):
        # One shared server instance for all tests in this class.
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=300,
            other_args=["--max-running-requests", "10"],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def run_completions_generation(
        self,
        prompt=MANY_NEW_TOKENS_PROMPT,
        max_tokens=1,
        stop=None,
        finish_reason=None,
        matched_stop=None,
    ):
        """POST to /v1/completions and check the reported stop metadata."""
        body = {
            "prompt": prompt,
            "model": self.model,
            "temperature": 0,
            "top_p": 1,
            "max_tokens": max_tokens,
        }
        if stop is not None:
            body["stop"] = stop

        completions = requests.post(
            self.base_url + "/v1/completions",
            json=body,
        ).json()
        print(json.dumps(completions))
        print("=" * 100)

        first_choice = completions["choices"][0]
        assert first_choice["finish_reason"] == finish_reason
        assert first_choice["matched_stop"] == matched_stop

    def run_chat_completions_generation(
        self,
        prompt=MANY_NEW_TOKENS_PROMPT,
        max_tokens=1,
        stop=None,
        finish_reason=None,
        matched_stop=None,
    ):
        """POST to /v1/chat/completions and check the reported stop metadata."""
        body = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": "You are a helpful AI assistant"},
                {"role": "user", "content": prompt},
            ],
            "temperature": 0,
            "top_p": 1,
            "max_tokens": max_tokens,
        }
        if stop is not None:
            body["stop"] = stop

        chat = requests.post(
            self.base_url + "/v1/chat/completions",
            json=body,
        ).json()
        print(json.dumps(chat))
        print("=" * 100)

        first_choice = chat["choices"][0]
        assert first_choice["finish_reason"] == finish_reason
        assert first_choice["matched_stop"] == matched_stop

    def test_finish_stop_str(self):
        # Stopping on a string reports finish_reason="stop" plus the string.
        for runner in (
            self.run_completions_generation,
            self.run_chat_completions_generation,
        ):
            runner(max_tokens=1000, stop="\n", finish_reason="stop", matched_stop="\n")

    def test_finish_stop_eos(self):
        llama_format_prompt = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
What is 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
        # NOTE(review): presumably the <|eot_id|> token id for this model's
        # tokenizer — confirm against the model's tokenizer config.
        eos_token_id = 128009
        self.run_completions_generation(
            prompt=llama_format_prompt,
            max_tokens=1000,
            finish_reason="stop",
            matched_stop=eos_token_id,
        )
        self.run_chat_completions_generation(
            prompt="What is 2 + 2?",
            max_tokens=1000,
            finish_reason="stop",
            matched_stop=eos_token_id,
        )

    def test_finish_length(self):
        # Hitting max_tokens reports "length" and no matched_stop.
        for runner in (
            self.run_completions_generation,
            self.run_chat_completions_generation,
        ):
            runner(max_tokens=5, finish_reason="length", matched_stop=None)
# Allow running this test module directly with `python <file>.py`.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,84 @@
import openai
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
class TestOpenAIServerIgnoreEOS(CustomTestCase):
    """Check that ignore_eos=True lets generation continue past the EOS token."""

    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
        )
        cls.base_url += "/v1"
        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_ignore_eos(self):
        """
        Test that ignore_eos=True allows generation to continue beyond EOS token
        and reach the max_tokens limit.
        """
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
        max_tokens = 200

        def ask(ignore_eos):
            # Identical request both times; only the ignore_eos flag differs.
            return client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": "Count from 1 to 20."},
                ],
                temperature=0,
                max_tokens=max_tokens,
                extra_body={"ignore_eos": ignore_eos},
            )

        def token_count(response):
            return len(self.tokenizer.encode(response.choices[0].message.content))

        response_default = ask(False)
        response_ignore_eos = ask(True)
        default_tokens = token_count(response_default)
        ignore_eos_tokens = token_count(response_ignore_eos)

        # The ignore_eos response should either:
        # 1. Have more tokens than the default response (default stopped at EOS)
        # 2. Have at least max_tokens (it reached the max_tokens limit)
        self.assertTrue(
            ignore_eos_tokens > default_tokens or ignore_eos_tokens >= max_tokens,
            f"ignore_eos did not generate more tokens: {ignore_eos_tokens} vs {default_tokens}",
        )
        self.assertEqual(
            response_ignore_eos.choices[0].finish_reason,
            "length",
            f"Expected finish_reason='length' for ignore_eos=True, got {response_ignore_eos.choices[0].finish_reason}",
        )

View File

@@ -0,0 +1,88 @@
import unittest
import openai
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
class TestRequestLengthValidation(CustomTestCase):
    """Over-long inputs and token budgets must be rejected, not truncated."""

    @classmethod
    def setUpClass(cls):
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        # Start server with auto truncate disabled
        cls.process = popen_launch_server(
            DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=("--max-total-tokens", "1000", "--context-length", "1000"),
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def _assert_rejected(self, content, expected_message, **request_kwargs):
        """Send one chat request and assert a BadRequestError mentioning
        expected_message; extra kwargs are forwarded to the request."""
        client = openai.Client(api_key=self.api_key, base_url=f"{self.base_url}/v1")
        with self.assertRaises(openai.BadRequestError) as cm:
            client.chat.completions.create(
                model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
                messages=[
                    {"role": "user", "content": content},
                ],
                temperature=0,
                **request_kwargs,
            )
        self.assertIn(expected_message, str(cm.exception))

    def test_input_length_longer_than_context_length(self):
        self._assert_rejected(
            "hello " * 1200,  # Will tokenize to more than context length
            "is longer than the model's context length",
        )

    def test_input_length_longer_than_maximum_allowed_length(self):
        self._assert_rejected(
            "hello " * 999,  # the maximum allowed length is 994 tokens
            "is longer than the model's context length",
        )

    def test_max_tokens_validation(self):
        # Input fits, but the requested completion budget exceeds the context.
        self._assert_rejected(
            "hello ",
            "Requested token count exceeds the model's maximum context",
            max_tokens=1200,
        )
# Allow running this test module directly with `python <file>.py`.
if __name__ == "__main__":
    unittest.main()