diff --git a/test/srt/configs/llama_405b.yaml b/test/srt/configs/llama_405b.yaml
new file mode 100644
index 000000000..bde85d724
--- /dev/null
+++ b/test/srt/configs/llama_405b.yaml
@@ -0,0 +1,30 @@
+# Serving benchmark sweep for nvidia/Llama-3.1-405B-Instruct-FP8 with --tp 8:
+# input length 8192, output length 1024, max concurrency from 1 to 32.
+tasks:
+  - name: sglang-8192-1024-concurrency1
+    server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8
+    client_cmd: python3 -m sglang.bench_serving --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 1 --num-prompts 5 --output-file llama_405b_results.jsonl
+
+  - name: sglang-8192-1024-concurrency2
+    server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8
+    client_cmd: python3 -m sglang.bench_serving --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 2 --num-prompts 10 --output-file llama_405b_results.jsonl
+
+  - name: sglang-8192-1024-concurrency4
+    server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8
+    client_cmd: python3 -m sglang.bench_serving --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 4 --num-prompts 20 --output-file llama_405b_results.jsonl
+
+  - name: sglang-8192-1024-concurrency8
+    server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8
+    client_cmd: python3 -m sglang.bench_serving --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 8 --num-prompts 32 --output-file llama_405b_results.jsonl
+
+  - name: sglang-8192-1024-concurrency16
+    server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8
+    client_cmd: python3 -m sglang.bench_serving --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 16 --num-prompts 48 --output-file llama_405b_results.jsonl
+
+  - name: sglang-8192-1024-concurrency24
+    server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8
+    client_cmd: python3 -m sglang.bench_serving --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 24 --num-prompts 72 --output-file llama_405b_results.jsonl
+
+  - name: sglang-8192-1024-concurrency32
+    server_cmd: python3 -m sglang.launch_server --model nvidia/Llama-3.1-405B-Instruct-FP8 --tp 8
+    client_cmd: python3 -m sglang.bench_serving --random-range-ratio 1 --random-input-len 8192 --random-output-len 1024 --max-concurrency 32 --num-prompts 96 --output-file llama_405b_results.jsonl
diff --git a/test/srt/experiment_runner.py b/test/srt/experiment_runner.py
index c4966dc77..7feeef1aa 100644
--- a/test/srt/experiment_runner.py
+++ b/test/srt/experiment_runner.py
@@ -317,6 +317,12 @@ def format_results(results: List[TaskResult]) -> str:
     return "\n".join(output)
 
 
+def get_bool_env_var(name: str, default: str = "false") -> bool:
+    """Return True if env var `name` (or `default` when unset) is "true" or "1", ignoring case."""
+    value = os.getenv(name, default)
+    return value.lower() in ("true", "1")
+
+
 def write_in_github_step_summary(results: List[TaskResult]):
     """Write formatted results to GitHub step summary."""
     if not os.environ.get("GITHUB_STEP_SUMMARY"):
@@ -349,7 +355,8 @@ def main():
                 result = runner.run_task(config)
                 results.append(result)
 
-        write_in_github_step_summary(results)
+        if get_bool_env_var("SGLANG_IS_IN_CI"):
+            write_in_github_step_summary(results)
     except Exception as e:
         logger.error(f"Error: {e}")
         raise
diff --git a/test/srt/parse_results.py b/test/srt/parse_results.py
new file mode 100644
index 000000000..8389a4b9c
--- /dev/null
+++ b/test/srt/parse_results.py
@@ -0,0 +1,60 @@
+"""Summarize a JSONL benchmark result file as a CSV file and an ASCII table."""
+
+import argparse
+import json
+import os
+
+import pandas as pd
+from tabulate import tabulate
+
+# Fields copied verbatim from each JSONL record into the summary.
+FIELDS = [
+    "max_concurrency",
+    "output_throughput",
+    "mean_ttft_ms",
+    "median_ttft_ms",
+    "p99_ttft_ms",
+    "mean_tpot_ms",
+    "median_tpot_ms",
+    "p99_tpot_ms",
+]
+
+
+def parse_line(line):
+    """Return the summary row for one JSONL line, or None if the line is blank."""
+    line = line.strip()
+    if not line:
+        return None
+    data = json.loads(line)
+    row = {field: data.get(field) for field in FIELDS}
+    max_conc = row["max_concurrency"]
+    out_tp = row["output_throughput"]
+    # Leave the cell empty rather than raising when either value is missing
+    # or the concurrency is zero.
+    row["per_user_throughput"] = (
+        out_tp / max_conc if max_conc and out_tp is not None else None
+    )
+    return row
+
+
+def main():
+    """Read the JSONL file named on the command line, save the CSV, print the table."""
+    parser = argparse.ArgumentParser(description="Parse JSONL benchmark and summarize.")
+    parser.add_argument("input_file", type=str, help="Path to input JSONL file")
+    args = parser.parse_args()
+
+    # The CSV goes to the current directory, named after the input file.
+    base_name = os.path.splitext(os.path.basename(args.input_file))[0]
+    output_file = f"{base_name}_summary.csv"
+
+    with open(args.input_file, "r", encoding="utf-8") as f:
+        rows = [row for row in map(parse_line, f) if row is not None]
+
+    df = pd.DataFrame(rows)
+    df.to_csv(output_file, index=False)
+    print(f"\nSaved summary to: {output_file}\n")
+    print(tabulate(df, headers="keys", tablefmt="grid", floatfmt=".3f"))
+
+
+if __name__ == "__main__":
+    main()