add profile in offline benchmark & update doc (#2123)
Co-authored-by: root <bjmsong@126.com>
This commit is contained in:
@@ -56,3 +56,22 @@ with nvtx.annotate("description", color="color"):
|
||||
## Other tips
|
||||
|
||||
1. You can benchmark a model using dummy weights by only providing the config.json file. This allows for quick testing of model variants without training. To do so, add `--load-format dummy` to the above commands and then you only need a correct `config.json` under the checkpoint folder.
|
||||
|
||||
## Profile with PyTorch Profiler
|
||||
- To profile a server
|
||||
```bash
|
||||
# set trace path
|
||||
export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
|
||||
# start server
|
||||
python -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct
|
||||
|
||||
# send requests and profile them
python -m sglang.bench_serving --backend sglang --model-path meta-llama/Llama-3.1-8B-Instruct --num-prompts 10 --profile
|
||||
```
|
||||
|
||||
Traces can be visualized using https://ui.perfetto.dev/.
|
||||
|
||||
- To profile offline
|
||||
```bash
|
||||
export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
|
||||
python -m sglang.bench_offline_throughput --model-path meta-llama/Llama-3.1-8B-Instruct --dataset-name random --num-prompts 10 --profile --mem-frac=0.8
|
||||
```
|
||||
|
||||
@@ -14,6 +14,7 @@ import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
@@ -27,7 +28,7 @@ from sglang.bench_serving import (
|
||||
sample_random_requests,
|
||||
set_ulimit,
|
||||
)
|
||||
from sglang.srt.server import Runtime
|
||||
from sglang.srt.server import Runtime, start_profile, stop_profile
|
||||
from sglang.srt.server_args import ServerArgs
|
||||
|
||||
|
||||
@@ -52,6 +53,7 @@ class BenchArgs:
|
||||
seed: int = 1
|
||||
skip_warmup: bool = False
|
||||
do_not_exit: bool = False
|
||||
profile: bool = False
|
||||
|
||||
@staticmethod
|
||||
def add_cli_args(parser: argparse.ArgumentParser):
|
||||
@@ -156,6 +158,12 @@ class BenchArgs:
|
||||
action="store_true",
|
||||
help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--profile",
|
||||
action="store_true",
|
||||
help="Use Torch Profiler. The endpoint must be launched with "
|
||||
"SGLANG_TORCH_PROFILER_DIR to enable profiler.",
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_cli_args(cls, args: argparse.Namespace):
|
||||
@@ -169,6 +177,7 @@ def throughput_test_once(
|
||||
reqs: List[Tuple[str, int, int]],
|
||||
ignore_eos: bool,
|
||||
extra_request_body: Dict,
|
||||
profile: bool,
|
||||
):
|
||||
measurement_results = {
|
||||
"backend": backend_name,
|
||||
@@ -194,7 +203,15 @@ def throughput_test_once(
|
||||
]
|
||||
|
||||
st = time.perf_counter()
|
||||
if profile:
|
||||
start_profile()
|
||||
|
||||
gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
|
||||
|
||||
if profile:
|
||||
stop_profile()
|
||||
monitor_trace_file(os.getenv("SGLANG_TORCH_PROFILER_DIR"))
|
||||
|
||||
latency = time.perf_counter() - st
|
||||
|
||||
if backend_name == "runtime":
|
||||
@@ -221,6 +238,41 @@ def throughput_test_once(
|
||||
return measurement_results
|
||||
|
||||
|
||||
def monitor_trace_file(directory, interval=1):
    """Block until a new trace file appears in *directory* and stops growing.

    The torch profiler flushes its trace asynchronously after profiling is
    stopped; polling here keeps the benchmark process alive until the trace
    is fully written to disk instead of exiting while the file is mid-write.

    Args:
        directory: Path the profiler writes traces to (the value of
            ``SGLANG_TORCH_PROFILER_DIR``).
        interval: Polling period in seconds.
    """

    print(f"Monitoring {directory} for new trace files...")

    # Guard: the caller passes os.getenv("SGLANG_TORCH_PROFILER_DIR"), which
    # is None when the variable is unset; os.listdir(None) would silently
    # monitor the CWD and the loop below would then hang forever.
    if not directory or not os.path.isdir(directory):
        print(f"Trace directory {directory!r} does not exist; skip monitoring.")
        return

    known_files = set(os.listdir(directory))

    while True:
        trace_complete = False
        time.sleep(interval)
        current_files = set(os.listdir(directory))

        new_files = current_files - known_files
        for new_file in new_files:
            new_file_path = os.path.join(directory, new_file)
            print(f"New file detected: {new_file}")

            # The trace is considered complete once its size stops growing
            # between two consecutive polls.
            previous_size = 0
            while True:
                try:
                    current_size = os.path.getsize(new_file_path)
                except FileNotFoundError:
                    print(f"File {new_file} is no longer accessible.")
                    break

                if current_size > previous_size:
                    previous_size = current_size
                else:
                    trace_complete = True
                    break

                time.sleep(interval)
        if trace_complete:
            break
|
||||
|
||||
|
||||
def throughput_test(
|
||||
server_args: ServerArgs,
|
||||
bench_args: BenchArgs,
|
||||
@@ -268,6 +320,7 @@ def throughput_test(
|
||||
reqs=warmup_requests,
|
||||
ignore_eos=not bench_args.disable_ignore_eos,
|
||||
extra_request_body=extra_request_body,
|
||||
profile=False,
|
||||
)
|
||||
|
||||
logging.info("\nBenchmark...")
|
||||
@@ -277,6 +330,7 @@ def throughput_test(
|
||||
reqs=input_requests,
|
||||
ignore_eos=not bench_args.disable_ignore_eos,
|
||||
extra_request_body=extra_request_body,
|
||||
profile=bench_args.profile,
|
||||
)
|
||||
|
||||
if bench_args.result_filename:
|
||||
|
||||
@@ -169,9 +169,19 @@ async def flush_cache():
|
||||
)
|
||||
|
||||
|
||||
def start_profile():
    """Begin a profiler capture on the running tokenizer manager.

    Module-level helper mirroring the ``/start_profile`` HTTP endpoint so
    offline benchmarks can profile without going through the server API.
    """
    manager = tokenizer_manager
    manager.start_profile()
|
||||
|
||||
|
||||
def stop_profile():
    """End the current profiler capture on the running tokenizer manager.

    Module-level helper mirroring the ``/stop_profile`` HTTP endpoint; the
    trace file is written asynchronously after this returns.
    """
    manager = tokenizer_manager
    manager.stop_profile()
|
||||
|
||||
|
||||
@app.get("/start_profile")
|
||||
@app.post("/start_profile")
|
||||
async def start_profile():
|
||||
async def start_profile_async():
|
||||
"""Start profiling."""
|
||||
tokenizer_manager.start_profile()
|
||||
return Response(
|
||||
@@ -182,7 +192,7 @@ async def start_profile():
|
||||
|
||||
@app.get("/stop_profile")
|
||||
@app.post("/stop_profile")
|
||||
async def stop_profile():
|
||||
async def stop_profile_async():
|
||||
"""Stop profiling."""
|
||||
tokenizer_manager.stop_profile()
|
||||
return Response(
|
||||
|
||||
Reference in New Issue
Block a user