From 0aa65f94f156c0a4404dc841e996be78608976b5 Mon Sep 17 00:00:00 2001
From: Binyao Jiang
Date: Sun, 12 Oct 2025 15:04:28 -0700
Subject: [PATCH] [Fix] Improve longbench prompt and other logics (#11474)

---
 python/sglang/test/run_eval.py              |  1 +
 python/sglang/test/simple_eval_common.py    |  3 ++
 .../sglang/test/simple_eval_longbench_v2.py | 54 +++++++++++--------
 3 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/python/sglang/test/run_eval.py b/python/sglang/test/run_eval.py
index 67b8c321f..0ecb8370d 100644
--- a/python/sglang/test/run_eval.py
+++ b/python/sglang/test/run_eval.py
@@ -103,6 +103,7 @@ def run_eval(args):
         categories = args.categories.split(",") if args.categories else None
 
         eval_obj = LongBenchV2Eval(
+            model=args.model,
             data_source=data_source,
             num_examples=args.num_examples,
             num_threads=args.num_threads,
diff --git a/python/sglang/test/simple_eval_common.py b/python/sglang/test/simple_eval_common.py
index dabebe8e6..53243fda9 100644
--- a/python/sglang/test/simple_eval_common.py
+++ b/python/sglang/test/simple_eval_common.py
@@ -290,6 +290,9 @@ def aggregate_results(
     htmls = []
     convos = []
     for single_eval_result in single_eval_results:
+        # Skip None results
+        if single_eval_result is None:
+            continue
         for name, value in single_eval_result.metrics.items():
             name2values[name].append(value)
         if single_eval_result.score is not None:
diff --git a/python/sglang/test/simple_eval_longbench_v2.py b/python/sglang/test/simple_eval_longbench_v2.py
index d47e9274d..645b76e38 100644
--- a/python/sglang/test/simple_eval_longbench_v2.py
+++ b/python/sglang/test/simple_eval_longbench_v2.py
@@ -12,6 +12,8 @@ import os
 import re
 from typing import Any, Dict, List, Optional
 
+from transformers import AutoTokenizer
+
 from sglang.test import simple_eval_common as common
 from sglang.test.simple_eval_common import (
     ANSWER_PATTERN_MULTICHOICE,
@@ -55,7 +57,11 @@ def format_longbench_v2_question(row: dict) -> str:
     choice_D = row.get("D", row.get("choice_D", ""))
 
     # Official LongBench-v2 template
-    prompt = f"""{context.strip()}
+    prompt = f"""
+Please read the following text and answer the question below.
+
+{context.strip()}
+
 What is the correct answer to this question: {question.strip()}
 
 Choices:
@@ -64,7 +70,7 @@ Choices:
 (C) {choice_C.strip()}
 (D) {choice_D.strip()}
 
-The correct answer is"""
+Format your response as follows: "The correct answer is (insert answer here)"."""
 
     return prompt
 
@@ -106,6 +112,7 @@ class LongBenchV2Eval(Eval):
 
     def __init__(
         self,
+        model: str = None,
         data_source: str = DEFAULT_DATASET,
         num_examples: Optional[int] = None,
         num_threads: int = 1,
@@ -126,6 +133,9 @@ class LongBenchV2Eval(Eval):
             max_context_length: Maximum context length in characters
             min_context_length: Minimum context length in characters
         """
+        self.tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
+        self.min_context_length = min_context_length
+        self.max_context_length = max_context_length
 
         # Load dataset based on data source type
         examples = self._load_dataset(data_source)
@@ -133,11 +143,6 @@ class LongBenchV2Eval(Eval):
         if categories:
             examples = [ex for ex in examples if ex.get("category") in categories]
 
-        if min_context_length or max_context_length:
-            examples = self._filter_by_context_length(
-                examples, min_context_length, max_context_length
-            )
-
         # Sample examples if specified
         if num_examples:
             assert n_repeats == 1, "n_repeats only supported when not sampling examples"
@@ -246,26 +251,23 @@ class LongBenchV2Eval(Eval):
 
         return normalized
 
-    def _filter_by_context_length(
+    def _check_context_length(
         self,
-        examples: List[Dict[str, Any]],
+        formatted_question: str,
+        tokenizer: AutoTokenizer,
         min_length: Optional[int],
         max_length: Optional[int],
-    ) -> List[Dict[str, Any]]:
+    ) -> bool:
         """Filter examples by context length measured in characters."""
-        filtered = []
-        for example in examples:
-            context = example.get("context", "")
-            context_length = len(context)
+        input_ids = tokenizer.encode(formatted_question)
+        context_length = len(input_ids)
 
-            if min_length is not None and context_length < min_length:
-                continue
-            if max_length is not None and context_length > max_length:
-                continue
+        if min_length is not None and context_length < min_length:
+            return False
+        if max_length is not None and context_length > max_length:
+            return False
 
-            filtered.append(example)
-
-        return filtered
+        return True
 
     def __call__(self, sampler: SamplerBase) -> EvalResult:
         """Run the evaluation."""
@@ -274,6 +276,16 @@ class LongBenchV2Eval(Eval):
             # Format the question using official template
             formatted_question = format_longbench_v2_question(row)
 
+            if self.min_context_length or self.max_context_length:
+                if not self._check_context_length(
+                    formatted_question,
+                    self.tokenizer,
+                    self.min_context_length,
+                    self.max_context_length,
+                ):
+                    # Skip this example
+                    return None
+
             prompt_messages = [
                 sampler._pack_message(content=formatted_question, role="user")
             ]
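Reviewer note (not part of the patch): the change above swaps the character-based pre-filter for a token-count gate computed with the served model's tokenizer, and a filtered row now returns None from the per-example function and is dropped in aggregate_results. The sketch below is a rough, self-contained illustration of that gate under stated assumptions; the function name within_token_budget and the "Qwen/Qwen2.5-7B-Instruct" checkpoint are placeholders chosen for the example, not names used anywhere in the patch.

# Rough illustration only; assumed names, not part of the sglang code.
from typing import Optional

from transformers import AutoTokenizer


def within_token_budget(
    prompt: str,
    tokenizer,
    min_length: Optional[int] = None,
    max_length: Optional[int] = None,
) -> bool:
    """Return True if the tokenized prompt falls inside [min_length, max_length]."""
    # Length is measured in tokens of the evaluated model, not characters.
    n_tokens = len(tokenizer.encode(prompt))
    if min_length is not None and n_tokens < min_length:
        return False
    if max_length is not None and n_tokens > max_length:
        return False
    return True


if __name__ == "__main__":
    # Placeholder checkpoint; any model with a Hugging Face tokenizer works the same way.
    tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct", trust_remote_code=True)
    print(within_token_budget("A short prompt.", tok, max_length=128))      # expected: True
    print(within_token_budget("A short prompt.", tok, min_length=100_000))  # expected: False

Measuring the formatted prompt in tokens makes min_context_length/max_context_length line up with the target model's context window, which the old character-based count could only approximate.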