misc: update output token logic (#695)
@@ -54,6 +54,7 @@ class RequestFuncOutput:
     itl: List[float] = field(default_factory=list)  # List of inter-token latencies
     prompt_len: int = 0
     error: str = ""
+    output_len: int = 0
 
 
 def remove_prefix(text: str, prefix: str) -> str:
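With the new field in place, the result container reads roughly as in the sketch below. Only output_len is added by this commit; the other fields are the ones visible in this hunk and in the later hunks, and the ttft field is an assumption inferred from the ttfts list collected in calculate_metrics.

from dataclasses import dataclass, field
from typing import List


@dataclass
class RequestFuncOutput:
    generated_text: str = ""
    success: bool = False
    latency: float = 0.0
    ttft: float = 0.0  # time to first token (assumed; calculate_metrics collects ttfts)
    itl: List[float] = field(default_factory=list)  # List of inter-token latencies
    prompt_len: int = 0
    error: str = ""
    output_len: int = 0  # added by this commit: output token count recorded per request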
@@ -189,6 +190,7 @@ async def async_request_openai_completions(
                     output.generated_text = generated_text
                     output.success = True
                     output.latency = latency
+                    output.output_len = request_func_input.output_len
                 else:
                     output.error = response.reason or ""
                     output.success = False
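The value recorded here presumably comes from the per-request input descriptor handed to the request function; a minimal sketch of that side, assuming a RequestFuncInput dataclass whose field names mirror the ones used elsewhere in this benchmark script:

from dataclasses import dataclass


@dataclass
class RequestFuncInput:
    prompt: str
    api_url: str
    prompt_len: int
    output_len: int  # requested completion length; copied onto RequestFuncOutput on success
    model: str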
@@ -451,6 +453,7 @@ def calculate_metrics(
     outputs: List[RequestFuncOutput],
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
+    backend: str,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
     actual_output_lens: List[int] = []
     total_input = 0
@@ -460,13 +463,16 @@ def calculate_metrics(
     ttfts: List[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
-            # We use the tokenizer to count the number of output tokens for all
-            # serving backends instead of looking at len(outputs[i].itl) since
-            # multiple output tokens may be bundled together
-            # Note : this may inflate the output token count slightly
-            output_len = len(
-                tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids
-            )
+            # We use the tokenizer solely to count output tokens for the TensorRT LLM backend,
+            # as it lacks `ignore_eos` support.
+            if backend == "trt":
+                output_len = len(
+                    tokenizer(
+                        outputs[i].generated_text, add_special_tokens=False
+                    ).input_ids
+                )
+            else:
+                output_len = outputs[i].output_len
             actual_output_lens.append(output_len)
             total_input += input_requests[i][1]
             if output_len > 1:
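The substance of the change: calculate_metrics now receives the backend name and only re-tokenizes the generated text for the TensorRT LLM backend ("trt"), which lacks ignore_eos support; every other backend trusts the output_len recorded on the request output. A minimal, self-contained sketch of that branch, assuming the RequestFuncOutput fields shown earlier (the helper name count_output_tokens is illustrative, not part of the diff):

from transformers import PreTrainedTokenizerBase


def count_output_tokens(
    output: "RequestFuncOutput", tokenizer: PreTrainedTokenizerBase, backend: str
) -> int:
    if backend == "trt":
        # TensorRT LLM may stop at EOS before the requested length, so the recorded
        # output_len is not trustworthy; count tokens in the generated text instead.
        # As the removed comment noted, re-tokenizing can inflate the count slightly.
        return len(
            tokenizer(output.generated_text, add_special_tokens=False).input_ids
        )
    # Other backends support ignore_eos, so the recorded request length is accurate.
    return output.output_len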
@@ -571,9 +577,11 @@ async def benchmark(
         outputs=outputs,
         dur_s=benchmark_duration,
         tokenizer=tokenizer,
+        backend=backend,
     )
 
     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
+    print("{:<40} {:<10}".format("Backend:", backend))
     print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
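For completeness, a sketch of how the updated call site and report header fit together inside benchmark(), assuming the local names visible in the diff context (outputs, benchmark_duration, request_rate) and an input_requests keyword on calculate_metrics:

metrics, actual_output_lens = calculate_metrics(
    input_requests=input_requests,
    outputs=outputs,
    dur_s=benchmark_duration,
    tokenizer=tokenizer,
    backend=backend,  # new keyword argument so token counting can branch on "trt"
)

print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
print("{:<40} {:<10}".format("Backend:", backend))  # new row in the report header
print("{:<40} {:<10}".format("Traffic request rate:", request_rate))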