diff --git a/python/sglang/test/send_one.py b/python/sglang/test/send_one.py index 061a4d05b..d5a399f0a 100644 --- a/python/sglang/test/send_one.py +++ b/python/sglang/test/send_one.py @@ -12,6 +12,7 @@ import dataclasses import json import requests +import tabulate from sglang.profiler import run_profile @@ -141,12 +142,16 @@ def send_one_prompt(args): ) if args.stream: + last_len = 0 for chunk in response.iter_lines(decode_unicode=False): chunk = chunk.decode("utf-8") if chunk and chunk.startswith("data:"): if chunk == "data: [DONE]": break ret = json.loads(chunk[5:].strip("\n")) + chunk_str = ret["text"][last_len:] + last_len = len(ret["text"]) + print(chunk_str, end="", flush=True) else: ret = response.json() @@ -157,21 +162,25 @@ def send_one_prompt(args): print(ret) return 0, 0 - latency = ret["meta_info"]["e2e_latency"] - - if "spec_verify_ct" in ret["meta_info"]: + if "spec_verify_ct" in ret["meta_info"] and ret["meta_info"]["spec_verify_ct"] > 0: acc_length = ( ret["meta_info"]["completion_tokens"] / ret["meta_info"]["spec_verify_ct"] ) else: acc_length = 1.0 + latency = ret["meta_info"]["e2e_latency"] speed = ret["meta_info"]["completion_tokens"] / latency + tokens = ret["meta_info"]["completion_tokens"] + + if not args.stream: + print(ret["text"]) - print(ret["text"]) print() - print(f"{acc_length=:.2f}") - print(f"{speed=:.2f} token/s") + headers = ["Latency (s)", "Tokens", "Acc Length", "Speed (token/s)"] + rows = [[f"{latency:.3f}", f"{tokens}", f"{acc_length:.3f}", f"{speed:.2f}"]] + msg = tabulate.tabulate(rows, headers=headers, tablefmt="pretty") + print(msg) return acc_length, speed