[Generative Score API] Multi-Item scoring with custom attention mask. (#10979)
This commit is contained in:
Committed by: GitHub
Parent: e22b13c569
Commit: 53bd00d975
@@ -114,6 +114,7 @@ GLOBAL_SERVER_ARGS_KEYS = [
     "enable_deterministic_inference",
     "nsa_prefill",
     "nsa_decode",
+    "multi_item_scoring_delimiter",
 ]
 
 # Put some global args for easy access
@@ -666,9 +667,11 @@ class Req:
     def is_prefill_only(self) -> bool:
         """Check if this request is prefill-only (no token generation needed)."""
         # NOTE: when spec is enabled, prefill_only optimizations are disabled
-        return (
-            self.sampling_params.max_new_tokens == 0
-            and global_server_args_dict["speculative_algorithm"] is None
+        from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+
+        spec_alg = global_server_args_dict["speculative_algorithm"]
+        return self.sampling_params.max_new_tokens == 0 and (
+            spec_alg is None or spec_alg == SpeculativeAlgorithm.NONE
         )
 
     def add_latency(self, stage: RequestStage):
Reference in New Issue
Block a user