Decoder-only Scoring API (#6460)

Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com>
This commit is contained in:
Chanh Nguyen
2025-06-04 14:14:54 -07:00
committed by GitHub
parent cf9815ba69
commit 3f1e433903
7 changed files with 612 additions and 2 deletions

View File

@@ -10,6 +10,7 @@ import time
import unittest
import openai
import requests
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.utils import kill_process_tree
@@ -599,7 +600,6 @@ class TestOpenAIServerEBNF(CustomTestCase):
extra_body={"ebnf": ebnf_grammar},
)
text = response.choices[0].message.content.strip()
print("EBNF test output:", repr(text))
self.assertTrue(len(text) > 0, "Got empty text from EBNF generation")
self.assertRegex(text, pattern, f"Text '{text}' doesn't match EBNF choices")
@@ -630,7 +630,6 @@ class TestOpenAIServerEBNF(CustomTestCase):
extra_body={"ebnf": ebnf_grammar},
)
text = response.choices[0].message.content.strip()
print("EBNF strict JSON test output:", repr(text))
self.assertTrue(len(text) > 0, "Got empty text from EBNF strict JSON test")
self.assertRegex(
text, pattern, f"Text '{text}' not matching the EBNF strict JSON shape"
@@ -766,5 +765,168 @@ class TestOpenAIServerIgnoreEOS(CustomTestCase):
)
class TestOpenAIV1Score(CustomTestCase):
    """End-to-end tests for the OpenAI-compatible /v1/score endpoint."""

    @classmethod
    def setUpClass(cls):
        """Launch the server once for the class and prepare a tokenizer."""
        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
        )
        # Every request in this class targets the score endpoint.
        cls.base_url += "/v1/score"
        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def run_score(
        self, query, items, label_token_ids, apply_softmax=False, item_first=False
    ):
        """POST a score request and return the decoded JSON response body.

        Args:
            query: Query text or list of token IDs.
            items: List of item texts or lists of token IDs.
            label_token_ids: Token IDs whose probabilities are requested.
            apply_softmax: Whether the server should softmax-normalize scores.
            item_first: If True, items are prepended to the query server-side.
        """
        response = requests.post(
            self.base_url,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": self.model,
                "query": query,
                "items": items,
                "label_token_ids": label_token_ids,
                "apply_softmax": apply_softmax,
                "item_first": item_first,
            },
        )
        return response.json()

    def _assert_valid_scores(self, response, num_items, label_token_ids):
        """Shared validation of a successful score response structure."""
        # Handle error responses
        if response.get("type") == "BadRequestError":
            self.fail(f"Score request failed with error: {response['message']}")
        # Verify response structure
        self.assertIn("scores", response, "Response should have a 'scores' field")
        self.assertIsInstance(response["scores"], list, "scores should be a list")
        self.assertEqual(
            len(response["scores"]),
            num_items,
            "Number of scores should match number of items",
        )
        # Each score should be a list of floats in the order of label_token_ids
        for i, score_list in enumerate(response["scores"]):
            self.assertIsInstance(score_list, list, f"Score {i} should be a list")
            self.assertEqual(
                len(score_list),
                len(label_token_ids),
                f"Score {i} length should match label_token_ids",
            )
            self.assertTrue(
                all(isinstance(v, float) for v in score_list),
                f"Score {i} values should be floats",
            )
            self.assertAlmostEqual(
                sum(score_list),
                1.0,
                places=6,
                msg=f"Score {i} probabilities should sum to 1",
            )

    def test_score_text_input(self):
        """Test scoring with text input"""
        query = "The capital of France is"
        items = ["Paris", "London", "Berlin"]
        # Get valid token IDs from the tokenizer
        label_token_ids = []
        for item in items:
            token_ids = self.tokenizer.encode(item, add_special_tokens=False)
            if not token_ids:
                self.fail(f"Failed to encode item: {item}")
            label_token_ids.append(token_ids[0])
        response = self.run_score(query, items, label_token_ids, apply_softmax=True)
        self._assert_valid_scores(response, len(items), label_token_ids)

    def test_score_token_input(self):
        """Test scoring with token IDs input"""
        query = "The capital of France is"
        items = ["Paris", "London", "Berlin"]
        # Get valid token IDs
        query_ids = self.tokenizer.encode(query, add_special_tokens=False)
        item_ids = [
            self.tokenizer.encode(item, add_special_tokens=False) for item in items
        ]
        # Get first token ID of each item
        label_token_ids = [ids[0] for ids in item_ids if ids]
        response = self.run_score(
            query_ids, item_ids, label_token_ids, apply_softmax=True
        )
        self._assert_valid_scores(response, len(items), label_token_ids)

    def test_score_error_handling(self):
        """Test error handling for invalid inputs"""
        query = "The capital of France is"
        items = ["Paris", "London", "Berlin"]
        # Test with invalid token ID
        response = requests.post(
            self.base_url,
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": self.model,
                "query": query,
                "items": items,
                "label_token_ids": [999999],  # Invalid token ID
                "apply_softmax": True,
            },
        )
        self.assertEqual(response.status_code, 400)
        error_response = response.json()
        self.assertEqual(error_response["type"], "BadRequestError")
        self.assertIn("Token ID 999999 is out of vocabulary", error_response["message"])


if __name__ == "__main__":
    unittest.main()

218
test/srt/test_score_api.py Normal file
View File

@@ -0,0 +1,218 @@
import unittest
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from sglang.srt.entrypoints.engine import Engine
from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST, CustomTestCase
TEST_MODEL_NAME = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
class TestScoreAPI(CustomTestCase):
    """Test the scoring API by comparing against direct HuggingFace inference."""

    def setUp(self):
        """Set up each test case."""
        self.engine = Engine(model_path=TEST_MODEL_NAME)

    def tearDown(self):
        """Clean up after each test case."""
        if self.engine is not None:
            self.engine.shutdown()
        torch.cuda.empty_cache()

    def compute_hf_scores(
        self, query, items, label_token_ids, apply_softmax=False, item_first=False
    ):
        """Compute scores using direct HuggingFace model inference.

        Returns probabilities for each token ID, normalized with softmax over
        the target tokens.

        Args:
            query: The query text
            items: List of item texts
            label_token_ids: List of token IDs to compute probabilities for
            apply_softmax: Whether to normalize probabilities using softmax.
                NOTE(review): the implementation currently always applies
                softmax over the target tokens regardless of this flag; every
                caller in this file passes True, so behavior is kept as-is.
            item_first: If True, prepend items to query. Otherwise append items to query.
        """
        # Initialize HF model and tokenizer fresh for this computation.
        tokenizer = AutoTokenizer.from_pretrained(
            TEST_MODEL_NAME, trust_remote_code=True
        )
        model = AutoModelForCausalLM.from_pretrained(
            TEST_MODEL_NAME, trust_remote_code=True
        )
        try:
            scores = []
            for item in items:
                # Construct full text based on item_first parameter
                full_text = f"{item}{query}" if item_first else f"{query}{item}"
                inputs = tokenizer(full_text, return_tensors="pt").to(model.device)
                # Get logits for the last token
                with torch.no_grad():
                    outputs = model(**inputs)
                last_token_logits = outputs.logits[0, -1]
                # Restrict to the target tokens and normalize over just those;
                # result is ordered like label_token_ids.
                target_logits = last_token_logits[label_token_ids]
                target_probs = torch.softmax(target_logits, dim=-1)
                scores.append(target_probs.tolist())
            return scores
        finally:
            # Clean up HF resources
            model.cpu()
            del model
            del tokenizer
            torch.cuda.empty_cache()

    def _get_token_ids(self, tokens):
        """Helper method to get the first token ID for each token string."""
        tokenizer = AutoTokenizer.from_pretrained(
            TEST_MODEL_NAME, trust_remote_code=True
        )
        try:
            label_token_ids = []
            for token in tokens:
                encoding = tokenizer.encode_plus(token, add_special_tokens=False)
                token_ids = encoding["input_ids"]
                label_token_ids.append(token_ids[0])
            return label_token_ids
        finally:
            del tokenizer

    def _compare_scores(self, hf_scores, sglang_scores, label_token_ids, case_name=""):
        """Helper method to compare HF and SGLang scores element-wise.

        Each pair of probabilities must agree within an absolute tolerance,
        each SGLang probability must lie in [0, 1], and every SGLang score
        list must sum to 1.
        """
        self.assertEqual(
            len(hf_scores),
            len(sglang_scores),
            f"Score lengths don't match for {case_name}",
        )
        # Absolute tolerance on each probability (scores are in [0, 1]).
        TOLERANCE = 0.01
        for hf_score_list, sglang_score_list in zip(hf_scores, sglang_scores):
            self.assertEqual(
                len(hf_score_list),
                len(sglang_score_list),
                f"Score list lengths don't match for {case_name}",
            )
            for hf_score, sglang_score in zip(hf_score_list, sglang_score_list):
                diff = abs(hf_score - sglang_score)
                self.assertLessEqual(
                    diff,
                    TOLERANCE,
                    msg=f"Scores differ by {diff:.6f} ({case_name}): "
                    f"HF={hf_score:.6f}, SGLang={sglang_score:.6f}",
                )
                self.assertGreaterEqual(
                    sglang_score, 0, f"SGLang score {sglang_score:.6f} not in [0,1]"
                )
                self.assertLessEqual(
                    sglang_score, 1, f"SGLang score {sglang_score:.6f} not in [0,1]"
                )
            self.assertAlmostEqual(
                sum(sglang_score_list),
                1.0,
                places=6,
                msg=f"SGLang scores don't sum to 1 ({case_name}): {sum(sglang_score_list):.6f}",
            )

    def test_score_consistency(self):
        """Test that SGLang scoring matches direct HuggingFace model scoring."""
        # Define test cases
        test_cases = [
            {
                "name": "default case",
                "query": "I pledge allegiance",
                "items": ["", " to"],
                "item_first": False,
            },
            {
                "name": "item_first case",
                "query": " is a city",
                "items": ["Tokyo", "Japan"],
                "item_first": True,
            },
        ]
        # Common tokens to test for all cases
        tokens = [" to", " the"]
        label_token_ids = self._get_token_ids(tokens)
        # Run each test case
        for case in test_cases:
            # Get scores from SGLang
            sglang_scores = self.engine.score(
                query=case["query"],
                items=case["items"],
                label_token_ids=label_token_ids,
                apply_softmax=True,
                item_first=case["item_first"],
            )
            # Get scores from HuggingFace using the same parameters
            hf_scores = self.compute_hf_scores(
                query=case["query"],
                items=case["items"],
                label_token_ids=label_token_ids,
                apply_softmax=True,
                item_first=case["item_first"],
            )
            # Compare scores
            self._compare_scores(
                hf_scores, sglang_scores, label_token_ids, case["name"]
            )

    def test_score_batch_handling(self):
        """Test that batch scoring works correctly."""
        # Test with different batch sizes
        batch_sizes = [1, 2, 4, 8]
        label_token_ids = [1, 2, 3]
        for batch_size in batch_sizes:
            texts = [f"test {i}" for i in range(batch_size)]
            scores = self.engine.score(
                query="The test was",
                items=texts,
                label_token_ids=label_token_ids,
                apply_softmax=True,
            )
            self.assertEqual(
                len(scores),
                batch_size,
                f"Expected {batch_size} scores, got {len(scores)}",
            )
            # Verify each score list has the correct length
            for score_list in scores:
                self.assertEqual(
                    len(score_list),
                    len(label_token_ids),
                    f"Score list length {len(score_list)} doesn't match label_token_ids length {len(label_token_ids)}",
                )
                self.assertTrue(
                    all(isinstance(v, float) for v in score_list),
                    "All scores should be floats",
                )
                self.assertAlmostEqual(
                    1.0, sum(score_list), 6, "Scores should sum to 1"
                )


if __name__ == "__main__":
    unittest.main()