[Generative Score API] Multi-Item scoring with custom attention mask. (#10979)
This commit is contained in:
committed by
GitHub
parent
e22b13c569
commit
53bd00d975
@@ -295,6 +295,296 @@ class TestScoreAPI(CustomTestCase):
|
||||
)
|
||||
self.assertFalse(request.stream, "Scoring requests should not stream")
|
||||
|
||||
def test_multi_item_scoring_basic(self):
    """Test basic multi-item scoring functionality.

    Sends one query with several candidate items and verifies the shape
    of the result and the probability semantics of each score list.
    """
    # Test with a simple query and items
    query = "What is the capital of California? Answer Yes or No for each of the following options:"
    items = ["Sacramento", "San Jose", "San Francisco"]
    label_token_ids = [9454, 2753]  # "Yes" and "No" tokens

    # Get scores using SGLang
    scores = self.engine.score(
        query=query,
        items=items,
        label_token_ids=label_token_ids,
        apply_softmax=True,
    )

    # Verify we get the expected number of scores
    self.assertEqual(len(scores), len(items), "Should get one score list per item")

    # Verify each score list has the correct length
    for i, score_list in enumerate(scores):
        self.assertEqual(
            len(score_list),
            len(label_token_ids),
            f"Item {i} should have {len(label_token_ids)} scores",
        )
        # Verify scores are probabilities (sum to 1)
        self.assertAlmostEqual(
            sum(score_list),
            1.0,
            places=6,
            msg=f"Scores for item {i} should sum to 1",
        )
        # Verify each score is a valid probability in [0, 1]. The original
        # check only enforced non-negativity; with apply_softmax=True every
        # individual score must also be bounded above by 1.
        for j, score in enumerate(score_list):
            self.assertGreaterEqual(
                score, 0, f"Score {j} for item {i} should be non-negative"
            )
            self.assertLessEqual(
                score, 1, f"Score {j} for item {i} should be at most 1"
            )
||||
def test_multi_item_scoring_consistency(self):
    """Verify that scoring the identical request twice is deterministic."""
    query = "Choose the best option:"
    items = ["Option A", "Option B", "Option C"]
    label_token_ids = [1, 2, 3]

    # Issue the exact same scoring request two times in a row.
    scores1, scores2 = (
        self.engine.score(
            query=query,
            items=items,
            label_token_ids=label_token_ids,
            apply_softmax=True,
        )
        for _ in range(2)
    )

    # Both runs must agree element-for-element.
    self.assertEqual(len(scores1), len(scores2), "Should get same number of items")
    for i, (s1, s2) in enumerate(zip(scores1, scores2)):
        self.assertEqual(
            len(s1), len(s2), f"Item {i} should have same number of scores"
        )
        for j, (score1, score2) in enumerate(zip(s1, s2)):
            self.assertAlmostEqual(
                score1, score2, places=6,
                msg=f"Score {j} for item {i} should be identical",
            )
|
||||
def test_multi_item_scoring_different_sizes(self):
    """Verify scoring works across item batches of several sizes."""
    query = "Rate each option:"
    label_token_ids = [1, 2, 3, 4, 5]
    expected_len = len(label_token_ids)

    # Batches ranging from a single item up to six items.
    batches = [
        ["Single item"],
        ["Item 1", "Item 2"],
        ["A", "B", "C", "D"],
        ["X", "Y", "Z", "W", "V", "U"],
    ]

    for items in batches:
        with self.subTest(items=items):
            scores = self.engine.score(
                query=query,
                items=items,
                label_token_ids=label_token_ids,
                apply_softmax=True,
            )
            # One score list per item, each normalized over the labels.
            self.assertEqual(
                len(scores), len(items), f"Should get {len(items)} score lists"
            )
            for i, score_list in enumerate(scores):
                self.assertEqual(
                    len(score_list),
                    expected_len,
                    f"Item {i} should have {expected_len} scores",
                )
                self.assertAlmostEqual(sum(score_list), 1.0, places=6)
||||
def test_multi_item_scoring_empty_items(self):
    """Verify that an empty item batch yields an empty score list."""
    # An empty batch is a valid request and must simply produce no scores.
    scores = self.engine.score(
        query="Test query",
        items=[],
        label_token_ids=[1, 2],
        apply_softmax=True,
    )
    self.assertEqual(len(scores), 0, "Should return empty list for empty items")
||||
def test_multi_item_scoring_single_item(self):
    """Verify a one-item batch behaves like ordinary single scoring."""
    label_token_ids = [1, 2, 3]

    scores = self.engine.score(
        query="Complete this sentence: The capital of France is",
        items=["Paris"],
        label_token_ids=label_token_ids,
        apply_softmax=True,
    )

    # Exactly one score list, normalized over the label tokens.
    self.assertEqual(len(scores), 1, "Should get one score list")
    self.assertEqual(
        len(scores[0]), len(label_token_ids), "Should have correct number of scores"
    )
    self.assertAlmostEqual(sum(scores[0]), 1.0, places=6)
||||
def test_multi_item_scoring_different_queries(self):
    """Verify scoring over a variety of query styles, including empty."""
    items = ["Yes", "No"]
    label_token_ids = [1, 2]

    # A mix of question phrasings plus the empty-query edge case.
    for query in (
        "Is this true?",
        "Choose the correct answer:",
        "What is the best option?",
        "Select all that apply:",
        "",  # Empty query
    ):
        with self.subTest(query=query):
            scores = self.engine.score(
                query=query,
                items=items,
                label_token_ids=label_token_ids,
                apply_softmax=True,
            )
            self.assertEqual(
                len(scores),
                len(items),
                f"Should get {len(items)} score lists for query: '{query}'",
            )
            # Each item's scores must be normalized over the labels.
            for i, score_list in enumerate(scores):
                self.assertEqual(len(score_list), len(label_token_ids))
                self.assertAlmostEqual(sum(score_list), 1.0, places=6)
||||
def test_multi_item_scoring_different_label_tokens(self):
    """Verify scoring with label-token sets of varying cardinality."""
    query = "Choose the best option:"
    items = ["Option A", "Option B"]

    # Label sets ranging from one to eight tokens.
    for label_token_ids in (
        [1, 2],  # Two tokens
        [1, 2, 3, 4],  # Four tokens
        [1],  # Single token
        [1, 2, 3, 4, 5, 6, 7, 8],  # Many tokens
    ):
        with self.subTest(label_tokens=label_token_ids):
            scores = self.engine.score(
                query=query,
                items=items,
                label_token_ids=label_token_ids,
                apply_softmax=True,
            )
            self.assertEqual(len(scores), len(items))
            # Each score list matches the label set and is normalized.
            for i, score_list in enumerate(scores):
                self.assertEqual(
                    len(score_list),
                    len(label_token_ids),
                    f"Item {i} should have {len(label_token_ids)} scores",
                )
                self.assertAlmostEqual(sum(score_list), 1.0, places=6)
||||
def test_multi_item_scoring_without_softmax(self):
    """Verify raw (un-normalized) scores when softmax is disabled."""
    items = ["Good", "Bad", "Neutral"]
    label_token_ids = [1, 2, 3]

    scores = self.engine.score(
        query="Rate each option:",
        items=items,
        label_token_ids=label_token_ids,
        apply_softmax=False,  # No softmax
    )

    self.assertEqual(len(scores), len(items))

    # Without softmax the scores need not sum to 1, but every entry
    # must still be a plain numeric value.
    for i, score_list in enumerate(scores):
        self.assertEqual(len(score_list), len(label_token_ids))
        for j, score in enumerate(score_list):
            self.assertIsInstance(
                score, (int, float), f"Score {j} for item {i} should be numeric"
            )
||||
def test_multi_item_scoring_large_batch(self):
    """Verify scoring scales to a larger batch of items."""
    label_token_ids = [1, 2, 3]
    items = [f"Item {i}" for i in range(20)]  # 20 items

    scores = self.engine.score(
        query="Classify each item:",
        items=items,
        label_token_ids=label_token_ids,
        apply_softmax=True,
    )

    # One normalized score list per item, even at batch size 20.
    self.assertEqual(len(scores), len(items), "Should handle large batches")
    for i, score_list in enumerate(scores):
        self.assertEqual(len(score_list), len(label_token_ids))
        self.assertAlmostEqual(sum(score_list), 1.0, places=6)
||||
def test_multi_item_scoring_unicode(self):
    """Verify scoring handles non-ASCII (Chinese) queries and items."""
    items = ["选项A", "选项B", "选项C"]
    label_token_ids = [1, 2, 3]

    # A Chinese-language query exercises non-ASCII tokenization paths.
    scores = self.engine.score(
        query="选择最佳选项:",
        items=items,
        label_token_ids=label_token_ids,
        apply_softmax=True,
    )

    self.assertEqual(len(scores), len(items))
    for i, score_list in enumerate(scores):
        self.assertEqual(len(score_list), len(label_token_ids))
        self.assertAlmostEqual(sum(score_list), 1.0, places=6)
||||
def test_multi_item_scoring_error_handling(self):
    """Verify malformed scoring requests are rejected with an exception."""
    base = dict(
        query="Test query",
        items=["Item 1", "Item 2"],
        label_token_ids=[1, 2],
        apply_softmax=True,
    )

    # label_token_ids must be a list of ints, and items must not be None;
    # each malformed variant must raise ValueError or TypeError.
    for override in ({"label_token_ids": "invalid"}, {"items": None}):
        with self.assertRaises((ValueError, TypeError)):
            self.engine.score(**{**base, **override})
||||
|
||||
# Allow running this test module directly, e.g. `python <this_file>.py`.
if __name__ == "__main__":
    unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user