Add 1-GPU perf and 2-GPU accuracy tests for AMD MI300X CI. (#5960)

This commit is contained in:
Sai Enduri
2025-05-01 20:56:59 -07:00
committed by GitHub
parent c5645e928f
commit 73bc1d00fc
2 changed files with 171 additions and 11 deletions

View File

@@ -29,7 +29,10 @@ class TestBenchServing(CustomTestCase):
f"### test_offline_throughput_default\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
)
self.assertGreater(res["output_throughput"], 3800)
if os.getenv("SGLANG_AMD_CI") == "1":
self.assertGreater(res["output_throughput"], 3500)
else:
self.assertGreater(res["output_throughput"], 3800)
def test_offline_throughput_non_stream_small_batch_size(self):
res = run_bench_serving(
@@ -64,7 +67,10 @@ class TestBenchServing(CustomTestCase):
f"### test_offline_throughput_without_radix_cache\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
)
self.assertGreater(res["output_throughput"], 3800)
if os.getenv("SGLANG_AMD_CI") == "1":
self.assertGreater(res["output_throughput"], 3500)
else:
self.assertGreater(res["output_throughput"], 3800)
def test_offline_throughput_without_chunked_prefill(self):
res = run_bench_serving(
@@ -99,7 +105,10 @@ class TestBenchServing(CustomTestCase):
f"### test_offline_throughput_with_triton_attention_backend\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
)
self.assertGreater(res["output_throughput"], 3700)
if os.getenv("SGLANG_AMD_CI") == "1":
self.assertGreater(res["output_throughput"], 3500)
else:
self.assertGreater(res["output_throughput"], 3700)
def test_offline_throughput_default_fp8(self):
res = run_bench_serving(
@@ -114,7 +123,10 @@ class TestBenchServing(CustomTestCase):
f"### test_offline_throughput_default_fp8\n"
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
)
self.assertGreater(res["output_throughput"], 4300)
if os.getenv("SGLANG_AMD_CI") == "1":
self.assertGreater(res["output_throughput"], 4000)
else:
self.assertGreater(res["output_throughput"], 4300)
def test_online_latency_default(self):
res = run_bench_serving(
@@ -130,7 +142,10 @@ class TestBenchServing(CustomTestCase):
f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
)
self.assertLess(res["median_e2e_latency_ms"], 11000)
self.assertLess(res["median_ttft_ms"], 86)
if os.getenv("SGLANG_AMD_CI") == "1":
self.assertLess(res["median_ttft_ms"], 115)
else:
self.assertLess(res["median_ttft_ms"], 86)
self.assertLess(res["median_itl_ms"], 10)
def test_online_latency_eagle(self):
@@ -165,7 +180,10 @@ class TestBenchServing(CustomTestCase):
f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
f'accept_length: {res["accept_length"]:.2f} \n'
)
self.assertLess(res["median_e2e_latency_ms"], 900)
if os.getenv("SGLANG_AMD_CI") == "1":
self.assertLess(res["median_e2e_latency_ms"], 1450)
else:
self.assertLess(res["median_e2e_latency_ms"], 900)
self.assertGreater(res["accept_length"], 3.0)
def test_moe_offline_throughput_default(self):