[Generative Score API] Multi-Item scoring with custom attention mask. (#10979)

2025-10-08 18:47:32 -07:00
parent e22b13c569
commit 53bd00d975
10 changed files with 1121 additions and 129 deletions
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -114,6 +114,7 @@ GLOBAL_SERVER_ARGS_KEYS = [
    "enable_deterministic_inference",
    "nsa_prefill",
    "nsa_decode",
+    "multi_item_scoring_delimiter",
 ]

 # Put some global args for easy access
@@ -666,9 +667,11 @@ class Req:
    def is_prefill_only(self) -> bool:
        """Check if this request is prefill-only (no token generation needed)."""
        # NOTE: when spec is enabled, prefill_only optimizations are disabled
-        return (
-            self.sampling_params.max_new_tokens == 0
-            and global_server_args_dict["speculative_algorithm"] is None
+        from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+
+        spec_alg = global_server_args_dict["speculative_algorithm"]
+        return self.sampling_params.max_new_tokens == 0 and (
+            spec_alg is None or spec_alg == SpeculativeAlgorithm.NONE
        )

    def add_latency(self, stage: RequestStage):