[Hicache] Evaluate Per-Round Metrics in Multiturn Bench (#10203)
Co-authored-by: Teng Ma <sima.mt@alibaba-inc.com>
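Example invocation exercising the new per-round evaluation (a sketch: the script path is assumed to be benchmark/hicache/bench_multiturn.py, and the client/round counts are illustrative; --enable-round-barrier is the flag added in this change):

python3 benchmark/hicache/bench_multiturn.py \
    --num-clients 16 \
    --num-rounds 4 \
    --enable-round-barrier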
@@ -105,12 +105,16 @@ def parse_args():
        action="store_true",
        help="If set, disable automatically testing with a range of request rates.",
    )

    parser.add_argument(
        "--disable-random-sample",
        action="store_true",
        help="If set, disable random sampling of requests from the ShareGPT dataset.",
    )
    parser.add_argument(
        "--enable-round-barrier",
        action="store_true",
        help="If set, only send i-th turn requests after all (i-1)-th turn requests finished.",
    )
    parser.add_argument(
        "--sub-question-input-length",
        type=int,
@@ -335,6 +339,19 @@ class WorkloadGenerator:
            "cached_tokens": [],
            "generated_len": [],
        }
        self.enable_round_barrier = args.enable_round_barrier
        if self.enable_round_barrier:
            # Add round-specific metrics while preserving the original structure
            for i in range(args.num_rounds):
                self.performance_metrics[f"round_{i}"] = {
                    "ttft": [],
                    "latency": [],
                    "prompt_len": [],
                    "cached_tokens": [],
                    "generated_len": [],
                }
        self.num_clients = args.num_clients

        self.num_rounds = args.num_rounds
        self.max_parallel = args.max_parallel
        self.output_length = args.output_length
@@ -383,6 +400,7 @@ class WorkloadGenerator:
        loop.close()

    def response_handler(self):
        next_round_reqs = []
        while True:
            try:
                client_id, response = self.response_queue.get(
@@ -391,12 +409,29 @@ class WorkloadGenerator:
                if not response.success:
                    raise ValueError(f"Request failed with error: {response.error}")
                self.client_records[client_id]["history"] += response.generated_text
                current_round = self.client_records[client_id]["round"]
                self.client_records[client_id]["round"] += 1
                self.performance_metrics["ttft"].append(response.ttft)
                self.performance_metrics["latency"].append(response.latency)
                self.performance_metrics["prompt_len"].append(response.prompt_len)
                self.performance_metrics["cached_tokens"].append(response.cached_tokens)
                self.performance_metrics["generated_len"].append(response.generated_len)
                if self.enable_round_barrier:
                    self.performance_metrics[f"round_{current_round}"]["ttft"].append(
                        response.ttft
                    )
                    self.performance_metrics[f"round_{current_round}"][
                        "latency"
                    ].append(response.latency)
                    self.performance_metrics[f"round_{current_round}"][
                        "prompt_len"
                    ].append(response.prompt_len)
                    self.performance_metrics[f"round_{current_round}"][
                        "cached_tokens"
                    ].append(response.cached_tokens)
                    self.performance_metrics[f"round_{current_round}"][
                        "generated_len"
                    ].append(response.generated_len)
                self.completed_requests += 1

                if self.client_records[client_id]["round"] < self.num_rounds:
@@ -404,16 +439,22 @@ class WorkloadGenerator:
                    self.client_records[client_id][
                        "history"
                    ] += self.sub_question_inputs.pop().prompt
                    self.ready_queue.append(
                        (
                            client_id,
                            gen_payload(
                                self.client_records[client_id]["history"],
                                self.output_length,
                                args.lora_path,
                            ),
                        )
                    new_req = (
                        client_id,
                        gen_payload(
                            self.client_records[client_id]["history"],
                            self.output_length,
                            args.lora_path,
                        ),
                    )
                    if self.enable_round_barrier:
                        next_round_reqs.append(new_req)
                        if len(next_round_reqs) == self.num_clients:
                            for req in next_round_reqs:
                                self.ready_queue.append(req)
                            next_round_reqs = []
                    else:
                        self.ready_queue.append(new_req)
            except queue.Empty:
                if self.pbar.n == self.pbar.total:
                    break
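For clarity, a minimal self-contained sketch of the round-barrier behavior implemented above (function and variable names here are illustrative, not the benchmark's own helpers): each client's next-round request is buffered, and the whole batch is released only once every client has finished the current round.

# Illustrative sketch only: buffer next-round requests until every client
# has completed the current round, then release them all at once.
def release_with_barrier(new_req, num_clients, next_round_reqs, ready_queue):
    next_round_reqs.append(new_req)
    if len(next_round_reqs) == num_clients:
        ready_queue.extend(next_round_reqs)
        next_round_reqs.clear()

ready_queue, next_round_reqs = [], []
for client_id in range(4):  # pretend four clients just finished round 0
    release_with_barrier((client_id, "payload"), 4, next_round_reqs, ready_queue)
print(ready_queue)  # all four round-1 requests are released together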
@@ -469,6 +510,25 @@ class WorkloadGenerator:
                ),
            },
        }
        if self.enable_round_barrier:
            performance_data["round"] = {}
            for round_num in range(args.num_rounds):
                round_key = f"round_{round_num}"
                round_metrics = self.performance_metrics[round_key]
                performance_data["round"][round_key] = {
                    "average_ttft": (
                        sum(round_metrics["ttft"]) / len(round_metrics["ttft"])
                        if round_metrics["ttft"]
                        else 0
                    ),
                    "cache_hit_rate": (
                        0
                        if sum(round_metrics["prompt_len"]) == 0
                        else sum(round_metrics["cached_tokens"])
                        / sum(round_metrics["prompt_len"])
                    ),
                    "request_count": len(round_metrics["ttft"]),
                }
        print("All requests completed")
        print("Performance metrics summary:")
        print(
@@ -492,6 +552,26 @@ class WorkloadGenerator:
            f" Request Throughput: {performance_data['summary']['throughput']:.2f} requests per second"
        )
        print(f" Cache Hit Rate: {performance_data['summary']['cache_hit_rate']:.6f}")

        if self.enable_round_barrier:
            # Print round-based summary
            print("Per-round metrics:")
            if "round" in performance_data:
                for round_num in range(self.num_rounds):
                    round_key = f"round_{round_num}"
                    if round_key in performance_data["round"]:
                        round_data = performance_data["round"][round_key]
                        avg_ttft = round_data["average_ttft"]
                        cache_hit_rate = round_data["cache_hit_rate"]
                        request_count = round_data["request_count"]
                        print(
                            f" Round {round_num}: Average TTFT = {avg_ttft:.2f}s, "
                            f"Cache Hit Rate = {cache_hit_rate:.6f} "
                            f"({request_count} requests)"
                        )
                    else:
                        print(f" Round {round_num}: No requests completed")

        return performance_data
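A quick worked check of the per-round cache_hit_rate formula above, with made-up numbers:

cached_tokens = [100, 200]  # hypothetical cached-token counts for one round
prompt_len = [400, 400]     # corresponding prompt lengths
rate = 0 if sum(prompt_len) == 0 else sum(cached_tokens) / sum(prompt_len)
print(rate)  # (100 + 200) / (400 + 400) = 0.375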