Add perf tests for LoRA (#8314)

Lifu Huang
2025-07-26 14:55:22 -07:00
committed by GitHub
parent b7094a5ef1
commit 5c705b1dce
3 changed files with 177 additions and 21 deletions

View File

@@ -174,6 +174,13 @@ jobs:
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
- name: Benchmark online latency (LoRA)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency
python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates
performance-test-1-gpu-part-2:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false

View File

@@ -1,6 +1,7 @@
"""Common utilities for testing and benchmarking"""
import argparse
import asyncio
import copy
import json
import logging
@@ -15,7 +16,7 @@ from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from functools import partial
from types import SimpleNamespace
from typing import Callable, List, Optional, Tuple
from typing import Awaitable, Callable, List, Optional, Tuple
import numpy as np
import requests
@@ -714,6 +715,7 @@ def get_benchmark_args(
seed: int = 0,
device="auto",
pd_separated: bool = False,
lora_name=None,
):
return SimpleNamespace(
backend="sglang",
@@ -741,7 +743,7 @@ def get_benchmark_args(
extra_request_body=None,
apply_chat_template=False,
profile=None,
lora_name=None,
lora_name=lora_name,
prompt_suffix="",
device=device,
pd_separated=pd_separated,
@@ -764,6 +766,8 @@ def run_bench_serving(
need_warmup=False,
seed: int = 0,
device="auto",
background_task: Optional[Callable[[str, asyncio.Event, asyncio.Event], Awaitable[None]]] = None,
lora_name: Optional[str] = None,
):
if device == "auto":
device = auto_config_device()
@@ -791,14 +795,35 @@ def run_bench_serving(
disable_ignore_eos=disable_ignore_eos,
seed=seed,
device=device,
lora_name=lora_name,
)
try:
async def _run():
if need_warmup:
warmup_args = copy.deepcopy(args)
warmup_args.num_prompts = 16
run_benchmark(warmup_args)
res = run_benchmark(args)
await asyncio.to_thread(run_benchmark, warmup_args)
start_event = asyncio.Event()
stop_event = asyncio.Event()
task_handle = (
asyncio.create_task(background_task(base_url, start_event, stop_event))
if background_task
else None
)
try:
start_event.set()
result = await asyncio.to_thread(run_benchmark, args)
finally:
if task_handle:
stop_event.set()
await task_handle
return result
try:
res = asyncio.run(_run())
finally:
kill_process_tree(process.pid)
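
With this change, `run_bench_serving` wraps the benchmark in an async `_run()` so an optional `background_task` coroutine can execute concurrently: the task is created first, `start_event` is set just before `run_benchmark` executes in a worker thread, and `stop_event` is set (and the task awaited) in the `finally` block. A minimal sketch of a coroutine conforming to this hook is shown below; the function name, its polling body, and the one-second interval are illustrative assumptions, not part of this commit.

```python
import asyncio


async def example_background_task(
    base_url: str,
    start_event: asyncio.Event,
    stop_event: asyncio.Event,
) -> None:
    # Block until run_bench_serving signals that the benchmark is about to start.
    await start_event.wait()
    # Do periodic side work until the benchmark finishes and stop_event is set.
    while not stop_event.is_set():
        print(f"benchmark still running against {base_url}")
        # Sleep so the benchmark worker thread and the event loop keep making progress.
        await asyncio.sleep(1)
```

The LoRA tests below pass `lora_loader_unloader_task` through this hook to load and unload adapters while benchmark requests are in flight.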

View File

@@ -1,4 +1,9 @@
import asyncio
import itertools
import unittest
from random import random, uniform
import requests
from sglang.test.test_utils import (
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
@@ -16,7 +21,6 @@ from sglang.test.test_utils import (
class TestBenchServing(CustomTestCase):
def test_offline_throughput_default(self):
res = run_bench_serving(
model=DEFAULT_MODEL_NAME_FOR_TEST,
@@ -28,7 +32,7 @@ class TestBenchServing(CustomTestCase):
if is_in_ci():
write_github_step_summary(
f"### test_offline_throughput_default\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 3050)
@@ -51,7 +55,7 @@ class TestBenchServing(CustomTestCase):
if is_in_ci():
write_github_step_summary(
f"### test_offline_throughput_non_stream_small_batch_size\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
self.assertGreater(res["output_throughput"], 1050)
@@ -66,7 +70,7 @@ class TestBenchServing(CustomTestCase):
if is_in_ci():
write_github_step_summary(
f"### test_offline_throughput_without_radix_cache\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 3050)
@@ -84,7 +88,7 @@ class TestBenchServing(CustomTestCase):
if is_in_ci():
write_github_step_summary(
f"### test_offline_throughput_without_chunked_prefill\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
self.assertGreater(res["output_throughput"], 2600)
@@ -104,7 +108,7 @@ class TestBenchServing(CustomTestCase):
if is_in_ci():
write_github_step_summary(
f"### test_offline_throughput_with_triton_attention_backend\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 3500)
@@ -122,7 +126,7 @@ class TestBenchServing(CustomTestCase):
if is_in_ci():
write_github_step_summary(
f"### test_offline_throughput_default_fp8\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 3500)
@@ -140,7 +144,7 @@ class TestBenchServing(CustomTestCase):
if is_in_ci():
write_github_step_summary(
f"### test_online_latency_default\n"
f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
)
self.assertLess(res["median_e2e_latency_ms"], 11000)
if is_in_amd_ci():
@@ -164,7 +168,7 @@ class TestBenchServing(CustomTestCase):
if is_in_ci():
write_github_step_summary(
f"### test_vlm_offline_throughput\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 2000)
@@ -187,7 +191,7 @@ class TestBenchServing(CustomTestCase):
if is_in_ci():
write_github_step_summary(
f"### test_vlm_online_latency\n"
f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
)
self.assertLess(res["median_e2e_latency_ms"], 16500)
if is_in_amd_ci():
@@ -197,6 +201,126 @@ class TestBenchServing(CustomTestCase):
self.assertLess(res["median_ttft_ms"], 100)
self.assertLess(res["median_itl_ms"], 8)
def test_lora_online_latency(self):
# TODO (lifuhuang): verify LoRA support in AMD.
if is_in_amd_ci():
return
res = self._run_lora_latency_test(enable_background_task=False)
if is_in_ci():
write_github_step_summary(
f"### test_lora_online_latency\n"
f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
)
self.assertLess(res["median_e2e_latency_ms"], 2400)
self.assertLess(res["median_ttft_ms"], 58)
def test_lora_online_latency_with_concurrent_adapter_updates(self):
# TODO (lifuhuang): verify LoRA support in AMD.
if is_in_amd_ci():
return
res = self._run_lora_latency_test(enable_background_task=True)
if is_in_ci():
write_github_step_summary(
f"### test_lora_online_latency\n"
f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
)
self.assertLess(res["median_e2e_latency_ms"], 4000)
# TODO (lifuhuang): This will be fixed by the overlapped LoRA update in a separate PR.
self.assertLess(res["median_ttft_ms"], 1600)
def _run_lora_latency_test(self, enable_background_task: bool):
"""
Run a latency test for LoRA with the specified background task setting.
"""
async def lora_loader_unloader_task(
base_url: str,
start_event: asyncio.Event,
stop_event: asyncio.Event,
):
"""
A background task that repeatedly loads and unloads LoRA adapters.
"""
await start_event.wait()
path_cycler = itertools.cycle(
[
"pbevan11/llama-3.1-8b-ocr-correction",
"faridlazuarda/valadapt-llama-3.1-8B-it-chinese",
"philschmid/code-llama-3-1-8b-text-to-sql-lora",
]
)
load_url = f"{base_url}/load_lora_adapter"
unload_url = f"{base_url}/unload_lora_adapter"
num_updates = 0
while not stop_event.is_set():
# 1. Load the LoRA adapter
lora_path = next(path_cycler)
response = await asyncio.to_thread(
requests.post,
load_url,
json={"lora_name": lora_path, "lora_path": lora_path},
)
self.assertTrue(
response.ok, f"Failed to load LoRA adapter: {response.text}"
)
num_updates += 1
if stop_event.is_set():
break
# Yield control to allow other tasks to run.
await asyncio.sleep(1)
# 2. Unload the LoRA adapter
response = await asyncio.to_thread(
requests.post,
unload_url,
json={"lora_name": lora_path},
)
self.assertTrue(
response.ok, f"Failed to unload LoRA adapter: {response.text}"
)
num_updates += 1
# Yield control to allow other tasks to run.
await asyncio.sleep(1)
background_task = lora_loader_unloader_task if enable_background_task else None
res = run_bench_serving(
model=DEFAULT_MODEL_NAME_FOR_TEST,
num_prompts=400,
request_rate=8,
other_server_args=[
"--enable-lora",
"--max-loras-per-batch",
"1",
"--disable-radix-cache",
"--random-seed",
"42",
"--mem-fraction-static",
"0.8",
"--lora-paths",
"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
"--max-lora-rank",
"256",
],
dataset_name="random",
random_input_len=256,
random_output_len=256,
lora_name=["Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"],
background_task=background_task,
)
return res
def test_online_latency_eagle(self):
res = run_bench_serving(
model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
@@ -226,8 +350,8 @@ class TestBenchServing(CustomTestCase):
if is_in_ci():
write_github_step_summary(
f"### test_online_latency_eagle\n"
f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
f'accept_length: {res["accept_length"]:.2f} \n'
f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
f"accept_length: {res['accept_length']:.2f} \n"
)
if is_in_amd_ci():
self.assertLess(res["median_e2e_latency_ms"], 1800)
@@ -246,7 +370,7 @@ class TestBenchServing(CustomTestCase):
if is_in_ci():
write_github_step_summary(
f"### test_moe_offline_throughput_default\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 2100)
@@ -264,7 +388,7 @@ class TestBenchServing(CustomTestCase):
if is_in_ci():
write_github_step_summary(
f"### test_moe_offline_throughput_without_radix_cache\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
if is_in_amd_ci():
self.assertGreater(res["output_throughput"], 2100)
@@ -286,7 +410,7 @@ class TestBenchServing(CustomTestCase):
if is_in_ci():
write_github_step_summary(
f"### test_pp_offline_throughput_default_decode\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
f"Output throughput: {res['output_throughput']:.2f} token/s\n"
)
self.assertGreater(res["output_throughput"], 6700)
@@ -311,7 +435,7 @@ class TestBenchServing(CustomTestCase):
if is_in_ci():
write_github_step_summary(
f"### test_pp_long_context_latency_prefill\n"
f'input_throughput: {res["input_throughput"]:.2f} ms\n'
f"input_throughput: {res['input_throughput']:.2f} ms\n"
)
self.assertGreater(res["input_throughput"], 4000)