Decoder-only Scoring API (#6460)
Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com>
This commit is contained in:
@@ -472,6 +472,79 @@ class Engine(EngineBase):
|
||||
def save_sharded_model(self, **kwargs):
    """Forward the given keyword arguments to the ``save_sharded_model`` collective RPC.

    Args:
        **kwargs: Passed through unchanged to ``self.collective_rpc``.
    """
    rpc_name = "save_sharded_model"
    self.collective_rpc(rpc_name, **kwargs)
|
||||
|
||||
def score(
    self,
    query: Optional[Union[str, List[int]]] = None,
    items: Optional[Union[str, List[str], List[List[int]]]] = None,
    label_token_ids: Optional[List[int]] = None,
    apply_softmax: bool = False,
    item_first: bool = False,
) -> List[List[float]]:
    """Score the probability of specified token IDs appearing after each (query + item) pair.

    For example:

        query = "<|user|>Is the following city the capital of France? "
        items = ["Paris <|assistant|>", "London <|assistant|>", "Berlin <|assistant|>"]
        label_token_ids = [2332, 1223]  # Token IDs for "Yes" and "No"
        item_first = False

    This would pass the following prompts to the model:

        "<|user|>Is the following city the capital of France? Paris <|assistant|>"
        "<|user|>Is the following city the capital of France? London <|assistant|>"
        "<|user|>Is the following city the capital of France? Berlin <|assistant|>"

    The API would then return the probabilities of the model producing "Yes" and "No"
    as the next token. The output would look like:

        [[0.9, 0.1], [0.2, 0.8], [0.1, 0.9]]

    This is the blocking variant: it drives the scoring coroutine to completion with
    ``run_until_complete`` on the current event loop, so it cannot be called from code
    that is already running inside that loop. Use ``async_score`` there instead.

    Args:
        query: The query text or pre-tokenized query token IDs. Must be provided.
        items: The item text(s) or pre-tokenized item token IDs. Must be provided.
        label_token_ids: List of token IDs to compute probabilities for. If None,
            no token probabilities will be computed.
        apply_softmax: Whether to normalize probabilities using softmax.
        item_first: If True, prepend items to query. Otherwise append items to query.

    Returns:
        A list with one entry per item. Each entry is a list of probabilities, one per
        requested label token ID, ordered as in the example above (the first value
        belongs to the first ID in ``label_token_ids``, and so on).

    Raises:
        ValueError: If query is not provided, or if items is not provided,
            or if token IDs are out of vocabulary, or if logprobs are not available
            for the specified tokens.
    """
    loop = asyncio.get_event_loop()
    # request=None: there is no incoming request object on this direct-call path.
    return loop.run_until_complete(
        self.tokenizer_manager.score_request(
            query=query,
            items=items,
            label_token_ids=label_token_ids,
            apply_softmax=apply_softmax,
            item_first=item_first,
            request=None,
        )
    )
|
||||
|
||||
async def async_score(
    self,
    query: Optional[Union[str, List[int]]] = None,
    items: Optional[Union[str, List[str], List[List[int]]]] = None,
    label_token_ids: Optional[List[int]] = None,
    apply_softmax: bool = False,
    item_first: bool = False,
) -> List[List[float]]:
    """Coroutine counterpart of score().

    Awaits ``self.tokenizer_manager.score_request`` directly instead of driving an
    event loop, forwarding every argument unchanged together with ``request=None``.

    See score() for the meaning of each argument and of the returned value.
    """
    pending = self.tokenizer_manager.score_request(
        query=query,
        items=items,
        label_token_ids=label_token_ids,
        apply_softmax=apply_softmax,
        item_first=item_first,
        request=None,
    )
    scores = await pending
    return scores
|
||||
|
||||
|
||||
def _set_envs_and_config(server_args: ServerArgs):
|
||||
# Set global environments
|
||||
|
||||
Reference in New Issue
Block a user