feat: add e2e latency (#704)
This commit is contained in:
@@ -264,6 +264,8 @@ class BenchmarkMetrics:
|
|||||||
median_itl_ms: float
|
median_itl_ms: float
|
||||||
std_itl_ms: float
|
std_itl_ms: float
|
||||||
p99_itl_ms: float
|
p99_itl_ms: float
|
||||||
|
mean_e2e_latency_ms: float
|
||||||
|
median_e2e_latency_ms: float
|
||||||
|
|
||||||
|
|
||||||
default_sharegpt_path = "ShareGPT_V3_unfiltered_cleaned_split.json"
|
default_sharegpt_path = "ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||||
@@ -467,6 +469,7 @@ def calculate_metrics(
|
|||||||
itls: List[float] = []
|
itls: List[float] = []
|
||||||
tpots: List[float] = []
|
tpots: List[float] = []
|
||||||
ttfts: List[float] = []
|
ttfts: List[float] = []
|
||||||
|
e2e_latencies: List[float] = []
|
||||||
for i in range(len(outputs)):
|
for i in range(len(outputs)):
|
||||||
if outputs[i].success:
|
if outputs[i].success:
|
||||||
output_len = outputs[i].output_len
|
output_len = outputs[i].output_len
|
||||||
@@ -480,6 +483,9 @@ def calculate_metrics(
|
|||||||
tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
|
tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
|
||||||
itls += outputs[i].itl
|
itls += outputs[i].itl
|
||||||
ttfts.append(outputs[i].ttft)
|
ttfts.append(outputs[i].ttft)
|
||||||
|
|
||||||
|
e2e_latencies.append(outputs[i].latency)
|
||||||
|
|
||||||
completed += 1
|
completed += 1
|
||||||
else:
|
else:
|
||||||
output_lens.append(0)
|
output_lens.append(0)
|
||||||
@@ -513,6 +519,8 @@ def calculate_metrics(
|
|||||||
median_itl_ms=np.median(itls or 0) * 1000,
|
median_itl_ms=np.median(itls or 0) * 1000,
|
||||||
std_itl_ms=np.std(itls or 0) * 1000,
|
std_itl_ms=np.std(itls or 0) * 1000,
|
||||||
p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
|
p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
|
||||||
|
mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000,
|
||||||
|
median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
|
||||||
)
|
)
|
||||||
|
|
||||||
return metrics, output_lens
|
return metrics, output_lens
|
||||||
@@ -611,6 +619,15 @@ async def benchmark(
|
|||||||
"Output token throughput (tok/s):", metrics.output_throughput
|
"Output token throughput (tok/s):", metrics.output_throughput
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
|
||||||
|
print(
|
||||||
|
"{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
"{:<40} {:<10.2f}".format(
|
||||||
|
"Median E2E Latency (ms):", metrics.median_e2e_latency_ms
|
||||||
|
)
|
||||||
|
)
|
||||||
print("{s:{c}^{n}}".format(s="Time to First Token", n=50, c="-"))
|
print("{s:{c}^{n}}".format(s="Time to First Token", n=50, c="-"))
|
||||||
print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
|
print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
|
||||||
print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
|
print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
|
||||||
@@ -639,6 +656,8 @@ async def benchmark(
|
|||||||
"total_input": metrics.total_input,
|
"total_input": metrics.total_input,
|
||||||
"total_output": metrics.total_output,
|
"total_output": metrics.total_output,
|
||||||
"total_output_retokenized": metrics.total_output_retokenized,
|
"total_output_retokenized": metrics.total_output_retokenized,
|
||||||
|
"mean_e2e_latency": metrics.mean_e2e_latency_ms,
|
||||||
|
"median_e2e_latency": metrics.median_e2e_latency_ms,
|
||||||
"median_ttft": metrics.median_ttft_ms,
|
"median_ttft": metrics.median_ttft_ms,
|
||||||
"median_itl": metrics.median_itl_ms,
|
"median_itl": metrics.median_itl_ms,
|
||||||
"output_token_throughput": metrics.output_throughput,
|
"output_token_throughput": metrics.output_throughput,
|
||||||
@@ -693,6 +712,8 @@ async def benchmark(
|
|||||||
"itls": [output.itl for output in outputs],
|
"itls": [output.itl for output in outputs],
|
||||||
"generated_texts": [output.generated_text for output in outputs],
|
"generated_texts": [output.generated_text for output in outputs],
|
||||||
"errors": [output.error for output in outputs],
|
"errors": [output.error for output in outputs],
|
||||||
|
"mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
|
||||||
|
"median_e2e_latency_ms": metrics.median_e2e_latency_ms,
|
||||||
}
|
}
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user