diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py
index a2706015d..59f425c07 100644
--- a/python/sglang/bench_serving.py
+++ b/python/sglang/bench_serving.py
@@ -78,6 +78,8 @@ async def async_request_trt_llm(
             "top_p": 1.0,
             "max_tokens": request_func_input.output_len,
             "stream": True,
+            "min_length": request_func_input.output_len,
+            "end_id": 1048576,
         }
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
@@ -111,6 +113,7 @@ async def async_request_trt_llm(

                     output.latency = most_recent_timestamp - st
                     output.success = True
+                    output.output_len = request_func_input.output_len

                 else:
                     output.error = response.reason or ""
@@ -244,9 +247,11 @@ class BenchmarkMetrics:
     completed: int
     total_input: int
     total_output: int
+    total_output_retokenized: int
     request_throughput: float
     input_throughput: float
     output_throughput: float
+    output_throughput_retokenized: float
     mean_ttft_ms: float
     median_ttft_ms: float
     std_ttft_ms: float
@@ -455,7 +460,8 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     backend: str,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
-    actual_output_lens: List[int] = []
+    output_lens: List[int] = []
+    retokenized_output_lens: List[int] = []
     total_input = 0
     completed = 0
     itls: List[float] = []
@@ -463,17 +469,12 @@
     ttfts: List[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
-            # We use the tokenizer solely to count output tokens for the TensorRT LLM backend,
-            # as it lacks `ignore_eos` support.
-            if backend == "trt":
-                output_len = len(
-                    tokenizer(
-                        outputs[i].generated_text, add_special_tokens=False
-                    ).input_ids
-                )
-            else:
-                output_len = outputs[i].output_len
-            actual_output_lens.append(output_len)
+            output_len = outputs[i].output_len
+            output_lens.append(output_len)
+            retokenized_output_len = len(
+                tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids
+            )
+            retokenized_output_lens.append(retokenized_output_len)
             total_input += input_requests[i][1]
             if output_len > 1:
                 tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
@@ -481,7 +482,8 @@
             ttfts.append(outputs[i].ttft)
             completed += 1
         else:
-            actual_output_lens.append(0)
+            output_lens.append(0)
+            retokenized_output_lens.append(0)

     if completed == 0:
         warnings.warn(
@@ -492,10 +494,12 @@
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
-        total_output=sum(actual_output_lens),
+        total_output=sum(output_lens),
+        total_output_retokenized=sum(retokenized_output_lens),
         request_throughput=completed / dur_s,
         input_throughput=total_input / dur_s,
-        output_throughput=sum(actual_output_lens) / dur_s,
+        output_throughput=sum(output_lens) / dur_s,
+        output_throughput_retokenized=sum(retokenized_output_lens) / dur_s,
         mean_ttft_ms=np.mean(ttfts or 0)
         * 1000,  # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
@@ -511,7 +515,7 @@
         p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
     )

-    return metrics, actual_output_lens
+    return metrics, output_lens


 async def benchmark(
@@ -572,7 +576,7 @@

     benchmark_duration = time.perf_counter() - benchmark_start_time

-    metrics, actual_output_lens = calculate_metrics(
+    metrics, output_lens = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
         dur_s=benchmark_duration,
@@ -587,6 +591,11 @@
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
     print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+    print(
+        "{:<40} {:<10}".format(
+            "Total generated tokens (retokenized):", metrics.total_output_retokenized
+        )
+    )
     print(
         "{:<40} {:<10.2f}".format(
             "Request throughput (req/s):", metrics.request_throughput
@@ -629,6 +638,7 @@
             "request_rate": request_rate,
             "total_input": metrics.total_input,
             "total_output": metrics.total_output,
+            "total_output_retokenized": metrics.total_output_retokenized,
             "median_ttft": metrics.median_ttft_ms,
             "median_itl": metrics.mean_itl_ms,
             "output_token_throughput": metrics.output_throughput,
@@ -661,6 +671,7 @@
         "completed": metrics.completed,
         "total_input_tokens": metrics.total_input,
         "total_output_tokens": metrics.total_output,
+        "total_output_tokens_retokenized": metrics.total_output_retokenized,
         "request_throughput": metrics.request_throughput,
         "input_throughput": metrics.input_throughput,
         "output_throughput": metrics.output_throughput,
@@ -677,7 +688,7 @@
         "std_itl_ms": metrics.std_itl_ms,
         "p99_itl_ms": metrics.p99_itl_ms,
         "input_lens": [output.prompt_len for output in outputs],
-        "output_lens": actual_output_lens,
+        "output_lens": output_lens,
         "ttfts": [output.ttft for output in outputs],
        "itls": [output.itl for output in outputs],
         "generated_texts": [output.generated_text for output in outputs],