[Generative Score API] Multi-Item scoring with custom attention mask. (#10979)

This commit is contained in:
Sundara Raman Ramachandran
2025-10-08 18:47:32 -07:00
committed by GitHub
parent e22b13c569
commit 53bd00d975
10 changed files with 1121 additions and 129 deletions

View File

@@ -295,6 +295,296 @@ class TestScoreAPI(CustomTestCase):
)
self.assertFalse(request.stream, "Scoring requests should not stream")
def test_multi_item_scoring_basic(self):
    """Exercise the happy path of multi-item scoring.

    One query with several candidate items; checks that the
    softmax-normalized label probabilities come back well-formed.
    """
    prompt = "What is the capital of California? Answer Yes or No for each of the following options:"
    candidates = ["Sacramento", "San Jose", "San Francisco"]
    label_token_ids = [9454, 2753]  # token ids for "Yes" / "No"

    scores = self.engine.score(
        query=prompt,
        items=candidates,
        label_token_ids=label_token_ids,
        apply_softmax=True,
    )

    # One probability vector per candidate item.
    self.assertEqual(len(scores), len(candidates), "Should get one score list per item")
    for i, score_list in enumerate(scores):
        # Each vector covers every requested label token.
        self.assertEqual(
            len(score_list),
            len(label_token_ids),
            f"Item {i} should have {len(label_token_ids)} scores",
        )
        # Softmax output must form a probability distribution.
        self.assertAlmostEqual(
            sum(score_list),
            1.0,
            places=6,
            msg=f"Scores for item {i} should sum to 1",
        )
        for j, prob in enumerate(score_list):
            self.assertGreaterEqual(
                prob, 0, f"Score {j} for item {i} should be non-negative"
            )
def test_multi_item_scoring_consistency(self):
    """Scoring the identical request twice must produce identical results."""
    prompt = "Choose the best option:"
    candidates = ["Option A", "Option B", "Option C"]
    label_token_ids = [1, 2, 3]

    def score_once():
        # Same request every time; scoring is expected to be deterministic.
        return self.engine.score(
            query=prompt,
            items=candidates,
            label_token_ids=label_token_ids,
            apply_softmax=True,
        )

    first = score_once()
    second = score_once()

    self.assertEqual(len(first), len(second), "Should get same number of items")
    for i, (s1, s2) in enumerate(zip(first, second)):
        self.assertEqual(
            len(s1), len(s2), f"Item {i} should have same number of scores"
        )
        for j, (score1, score2) in enumerate(zip(s1, s2)):
            self.assertAlmostEqual(
                score1,
                score2,
                places=6,
                msg=f"Score {j} for item {i} should be identical",
            )
def test_multi_item_scoring_different_sizes(self):
    """Batches of varying item counts should all score cleanly."""
    prompt = "Rate each option:"
    label_token_ids = [1, 2, 3, 4, 5]

    # Cover single-item, small, and larger batches.
    batches = (
        ["Single item"],
        ["Item 1", "Item 2"],
        ["A", "B", "C", "D"],
        ["X", "Y", "Z", "W", "V", "U"],
    )
    for items in batches:
        with self.subTest(items=items):
            scores = self.engine.score(
                query=prompt,
                items=items,
                label_token_ids=label_token_ids,
                apply_softmax=True,
            )
            self.assertEqual(
                len(scores), len(items), f"Should get {len(items)} score lists"
            )
            for i, score_list in enumerate(scores):
                self.assertEqual(
                    len(score_list),
                    len(label_token_ids),
                    f"Item {i} should have {len(label_token_ids)} scores",
                )
                self.assertAlmostEqual(sum(score_list), 1.0, places=6)
def test_multi_item_scoring_empty_items(self):
    """An empty item list should yield an empty score list, not an error."""
    scores = self.engine.score(
        query="Test query",
        items=[],
        label_token_ids=[1, 2],
        apply_softmax=True,
    )
    self.assertEqual(len(scores), 0, "Should return empty list for empty items")
def test_multi_item_scoring_single_item(self):
    """A one-item batch should behave exactly like regular scoring."""
    label_token_ids = [1, 2, 3]
    scores = self.engine.score(
        query="Complete this sentence: The capital of France is",
        items=["Paris"],
        label_token_ids=label_token_ids,
        apply_softmax=True,
    )
    self.assertEqual(len(scores), 1, "Should get one score list")
    self.assertEqual(
        len(scores[0]), len(label_token_ids), "Should have correct number of scores"
    )
    # Softmax output sums to 1 for the lone item.
    self.assertAlmostEqual(sum(scores[0]), 1.0, places=6)
def test_multi_item_scoring_different_queries(self):
    """Assorted query phrasings — including an empty query — are accepted."""
    items = ["Yes", "No"]
    label_token_ids = [1, 2]

    queries = (
        "Is this true?",
        "Choose the correct answer:",
        "What is the best option?",
        "Select all that apply:",
        "",  # degenerate case: empty query
    )
    for query in queries:
        with self.subTest(query=query):
            scores = self.engine.score(
                query=query,
                items=items,
                label_token_ids=label_token_ids,
                apply_softmax=True,
            )
            self.assertEqual(
                len(scores),
                len(items),
                f"Should get {len(items)} score lists for query: '{query}'",
            )
            for score_list in scores:
                self.assertEqual(len(score_list), len(label_token_ids))
                self.assertAlmostEqual(sum(score_list), 1.0, places=6)
def test_multi_item_scoring_different_label_tokens(self):
    """Label-token sets of varying sizes should be handled uniformly."""
    prompt = "Choose the best option:"
    candidates = ["Option A", "Option B"]

    # From a single label up to many labels per item.
    token_sets = (
        [1, 2],
        [1, 2, 3, 4],
        [1],
        [1, 2, 3, 4, 5, 6, 7, 8],
    )
    for label_token_ids in token_sets:
        with self.subTest(label_tokens=label_token_ids):
            scores = self.engine.score(
                query=prompt,
                items=candidates,
                label_token_ids=label_token_ids,
                apply_softmax=True,
            )
            self.assertEqual(len(scores), len(candidates))
            for i, score_list in enumerate(scores):
                self.assertEqual(
                    len(score_list),
                    len(label_token_ids),
                    f"Item {i} should have {len(label_token_ids)} scores",
                )
                self.assertAlmostEqual(sum(score_list), 1.0, places=6)
def test_multi_item_scoring_without_softmax(self):
    """Un-normalized outputs are numeric but need not sum to 1."""
    candidates = ["Good", "Bad", "Neutral"]
    label_token_ids = [1, 2, 3]
    scores = self.engine.score(
        query="Rate each option:",
        items=candidates,
        label_token_ids=label_token_ids,
        apply_softmax=False,  # keep raw values, skip normalization
    )
    self.assertEqual(len(scores), len(candidates))
    for i, score_list in enumerate(scores):
        self.assertEqual(len(score_list), len(label_token_ids))
        # Without softmax there is no sum-to-1 guarantee; only check that
        # each entry is a plain numeric value.
        for j, value in enumerate(score_list):
            self.assertIsInstance(
                value, (int, float), f"Score {j} for item {i} should be numeric"
            )
def test_multi_item_scoring_large_batch(self):
    """A 20-item batch should be scored without degradation."""
    candidates = [f"Item {i}" for i in range(20)]
    label_token_ids = [1, 2, 3]
    scores = self.engine.score(
        query="Classify each item:",
        items=candidates,
        label_token_ids=label_token_ids,
        apply_softmax=True,
    )
    self.assertEqual(len(scores), len(candidates), "Should handle large batches")
    for score_list in scores:
        self.assertEqual(len(score_list), len(label_token_ids))
        self.assertAlmostEqual(sum(score_list), 1.0, places=6)
def test_multi_item_scoring_unicode(self):
    """Non-ASCII (Chinese) queries and items should score normally."""
    candidates = ["选项A", "选项B", "选项C"]
    label_token_ids = [1, 2, 3]
    scores = self.engine.score(
        query="选择最佳选项:",
        items=candidates,
        label_token_ids=label_token_ids,
        apply_softmax=True,
    )
    self.assertEqual(len(scores), len(candidates))
    for score_list in scores:
        self.assertEqual(len(score_list), len(label_token_ids))
        self.assertAlmostEqual(sum(score_list), 1.0, places=6)
def test_multi_item_scoring_error_handling(self):
    """Malformed inputs should raise ValueError or TypeError."""
    prompt = "Test query"
    candidates = ["Item 1", "Item 2"]
    label_token_ids = [1, 2]

    # label_token_ids must be a list of ints; a plain string is rejected.
    with self.assertRaises((ValueError, TypeError)):
        self.engine.score(
            query=prompt,
            items=candidates,
            label_token_ids="invalid",
            apply_softmax=True,
        )

    # items=None (as opposed to an empty list) is likewise rejected.
    with self.assertRaises((ValueError, TypeError)):
        self.engine.score(
            query=prompt,
            items=None,
            label_token_ids=label_token_ids,
            apply_softmax=True,
        )
# Allow running this test module directly with the standard unittest runner.
if __name__ == "__main__":
    unittest.main()