[Hicache] Evaluate Per-Round Metrics in Multiturn Bench (#10203)

Co-authored-by: Teng Ma <sima.mt@alibaba-inc.com>
This commit is contained in:
ykwd
2025-09-16 10:34:40 +08:00
committed by GitHub
parent ec272dda9c
commit 4bb08f6e07
2 changed files with 102 additions and 14 deletions

View File

@@ -105,12 +105,16 @@ def parse_args():
action="store_true",
help="If set, disable automatically testing with a range of request rates.",
)
parser.add_argument(
"--disable-random-sample",
action="store_true",
help="If set, disable random sampling of requests from the ShareGPT dataset.",
)
parser.add_argument(
"--enable-round-barrier",
action="store_true",
help="If set, only send i-th turn requests after all (i-1)-th turn requests finished.",
)
parser.add_argument(
"--sub-question-input-length",
type=int,
@@ -335,6 +339,19 @@ class WorkloadGenerator:
"cached_tokens": [],
"generated_len": [],
}
self.enable_round_barrier = args.enable_round_barrier
if self.enable_round_barrier:
# Add round-specific metrics while preserving the original structure
for i in range(args.num_rounds):
self.performance_metrics[f"round_{i}"] = {
"ttft": [],
"latency": [],
"prompt_len": [],
"cached_tokens": [],
"generated_len": [],
}
self.num_clients = args.num_clients
self.num_rounds = args.num_rounds
self.max_parallel = args.max_parallel
self.output_length = args.output_length
@@ -383,6 +400,7 @@ class WorkloadGenerator:
loop.close()
def response_handler(self):
next_round_reqs = []
while True:
try:
client_id, response = self.response_queue.get(
@@ -391,12 +409,29 @@ class WorkloadGenerator:
if not response.success:
raise ValueError(f"Request failed with error: {response.error}")
self.client_records[client_id]["history"] += response.generated_text
current_round = self.client_records[client_id]["round"]
self.client_records[client_id]["round"] += 1
self.performance_metrics["ttft"].append(response.ttft)
self.performance_metrics["latency"].append(response.latency)
self.performance_metrics["prompt_len"].append(response.prompt_len)
self.performance_metrics["cached_tokens"].append(response.cached_tokens)
self.performance_metrics["generated_len"].append(response.generated_len)
if self.enable_round_barrier:
self.performance_metrics[f"round_{current_round}"]["ttft"].append(
response.ttft
)
self.performance_metrics[f"round_{current_round}"][
"latency"
].append(response.latency)
self.performance_metrics[f"round_{current_round}"][
"prompt_len"
].append(response.prompt_len)
self.performance_metrics[f"round_{current_round}"][
"cached_tokens"
].append(response.cached_tokens)
self.performance_metrics[f"round_{current_round}"][
"generated_len"
].append(response.generated_len)
self.completed_requests += 1
if self.client_records[client_id]["round"] < self.num_rounds:
@@ -404,16 +439,22 @@ class WorkloadGenerator:
self.client_records[client_id][
"history"
] += self.sub_question_inputs.pop().prompt
self.ready_queue.append(
(
client_id,
gen_payload(
self.client_records[client_id]["history"],
self.output_length,
args.lora_path,
),
)
new_req = (
client_id,
gen_payload(
self.client_records[client_id]["history"],
self.output_length,
args.lora_path,
),
)
if self.enable_round_barrier:
next_round_reqs.append(new_req)
if len(next_round_reqs) == self.num_clients:
for req in next_round_reqs:
self.ready_queue.append(req)
next_round_reqs = []
else:
self.ready_queue.append(new_req)
except queue.Empty:
if self.pbar.n == self.pbar.total:
break
@@ -469,6 +510,25 @@ class WorkloadGenerator:
),
},
}
if self.enable_round_barrier:
performance_data["round"] = {}
for round_num in range(args.num_rounds):
round_key = f"round_{round_num}"
round_metrics = self.performance_metrics[round_key]
performance_data["round"][round_key] = {
"average_ttft": (
sum(round_metrics["ttft"]) / len(round_metrics["ttft"])
if round_metrics["ttft"]
else 0
),
"cache_hit_rate": (
0
if sum(round_metrics["prompt_len"]) == 0
else sum(round_metrics["cached_tokens"])
/ sum(round_metrics["prompt_len"])
),
"request_count": len(round_metrics["ttft"]),
}
print("All requests completed")
print("Performance metrics summary:")
print(
@@ -492,6 +552,26 @@ class WorkloadGenerator:
f" Request Throughput: {performance_data['summary']['throughput']:.2f} requests per second"
)
print(f" Cache Hit Rate: {performance_data['summary']['cache_hit_rate']:.6f}")
if self.enable_round_barrier:
# Print round-based summary
print("Per-round metrics:")
if "round" in performance_data:
for round_num in range(self.num_rounds):
round_key = f"round_{round_num}"
if round_key in performance_data["round"]:
round_data = performance_data["round"][round_key]
avg_ttft = round_data["average_ttft"]
cache_hit_rate = round_data["cache_hit_rate"]
request_count = round_data["request_count"]
print(
f" Round {round_num}: Average TTFT = {avg_ttft:.2f}s, "
f"Cache Hit Rate = {cache_hit_rate:.6f} "
f"({request_count} requests)"
)
else:
print(f" Round {round_num}: No requests completed")
return performance_data