5
.github/workflows/pr-test-amd.yml
vendored
5
.github/workflows/pr-test-amd.yml
vendored
@@ -138,11 +138,6 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
|
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
|
||||||
|
|
||||||
- name: Benchmark online latency (EAGLE)
|
|
||||||
timeout-minutes: 15
|
|
||||||
run: |
|
|
||||||
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
|
|
||||||
|
|
||||||
performance-test-1-gpu-part-2-amd:
|
performance-test-1-gpu-part-2-amd:
|
||||||
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||||
github.event.pull_request.draft == false
|
github.event.pull_request.draft == false
|
||||||
|
|||||||
@@ -506,6 +506,7 @@ class AiterIndicesUpdaterPrefill:
|
|||||||
spec_info.generate_attn_arg_prefill(
|
spec_info.generate_attn_arg_prefill(
|
||||||
req_pool_indices,
|
req_pool_indices,
|
||||||
paged_kernel_lens,
|
paged_kernel_lens,
|
||||||
|
None,
|
||||||
self.req_to_token,
|
self.req_to_token,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -412,6 +412,10 @@ class ModelRunner:
|
|||||||
if not server_args.disable_chunked_prefix_cache:
|
if not server_args.disable_chunked_prefix_cache:
|
||||||
logger.info("Chunked prefix cache is turned on.")
|
logger.info("Chunked prefix cache is turned on.")
|
||||||
|
|
||||||
|
if server_args.attention_backend == "aiter":
|
||||||
|
if self.model_config.context_len > 8192:
|
||||||
|
self.mem_fraction_static *= 0.85
|
||||||
|
|
||||||
def init_torch_distributed(self):
|
def init_torch_distributed(self):
|
||||||
logger.info("Init torch distributed begin.")
|
logger.info("Init torch distributed begin.")
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ set -euo pipefail
|
|||||||
docker exec ci_sglang pip install --upgrade pip
|
docker exec ci_sglang pip install --upgrade pip
|
||||||
docker exec ci_sglang pip uninstall sgl-kernel -y || true
|
docker exec ci_sglang pip uninstall sgl-kernel -y || true
|
||||||
docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
|
docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
|
||||||
|
docker exec ci_sglang pip install -e "python[dev_hip]"
|
||||||
|
|
||||||
docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
|
docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
|
||||||
docker exec -w /human-eval ci_sglang pip install -e .
|
docker exec -w /human-eval ci_sglang pip install -e .
|
||||||
|
|||||||
@@ -62,7 +62,10 @@ class TestBenchOneBatch(CustomTestCase):
|
|||||||
f"### test_torch_compile_tp2_bs1 (Mixtral-8x7B)\n"
|
f"### test_torch_compile_tp2_bs1 (Mixtral-8x7B)\n"
|
||||||
f"output_throughput: {output_throughput:.2f} token/s\n"
|
f"output_throughput: {output_throughput:.2f} token/s\n"
|
||||||
)
|
)
|
||||||
self.assertGreater(output_throughput, 220)
|
if os.getenv("SGLANG_AMD_CI") == "1":
|
||||||
|
self.assertGreater(output_throughput, 200)
|
||||||
|
else:
|
||||||
|
self.assertGreater(output_throughput, 220)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ class TestBenchServing(CustomTestCase):
|
|||||||
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
|
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
|
||||||
)
|
)
|
||||||
if os.getenv("SGLANG_AMD_CI") == "1":
|
if os.getenv("SGLANG_AMD_CI") == "1":
|
||||||
self.assertGreater(res["output_throughput"], 3500)
|
self.assertGreater(res["output_throughput"], 3150)
|
||||||
else:
|
else:
|
||||||
self.assertGreater(res["output_throughput"], 3800)
|
self.assertGreater(res["output_throughput"], 3800)
|
||||||
|
|
||||||
@@ -70,7 +70,7 @@ class TestBenchServing(CustomTestCase):
|
|||||||
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
|
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
|
||||||
)
|
)
|
||||||
if os.getenv("SGLANG_AMD_CI") == "1":
|
if os.getenv("SGLANG_AMD_CI") == "1":
|
||||||
self.assertGreater(res["output_throughput"], 3500)
|
self.assertGreater(res["output_throughput"], 3050)
|
||||||
else:
|
else:
|
||||||
self.assertGreater(res["output_throughput"], 3800)
|
self.assertGreater(res["output_throughput"], 3800)
|
||||||
|
|
||||||
@@ -126,7 +126,7 @@ class TestBenchServing(CustomTestCase):
|
|||||||
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
|
f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
|
||||||
)
|
)
|
||||||
if os.getenv("SGLANG_AMD_CI") == "1":
|
if os.getenv("SGLANG_AMD_CI") == "1":
|
||||||
self.assertGreater(res["output_throughput"], 4000)
|
self.assertGreater(res["output_throughput"], 3500)
|
||||||
else:
|
else:
|
||||||
self.assertGreater(res["output_throughput"], 4300)
|
self.assertGreater(res["output_throughput"], 4300)
|
||||||
|
|
||||||
|
|||||||
@@ -37,11 +37,6 @@ class TestEvalAccuracyLarge(CustomTestCase):
|
|||||||
def tearDownClass(cls):
|
def tearDownClass(cls):
|
||||||
kill_process_tree(cls.process.pid)
|
kill_process_tree(cls.process.pid)
|
||||||
|
|
||||||
def tearDown(self):
|
|
||||||
# Delay between tests to allow GPU memory cleanup
|
|
||||||
if os.getenv("SGLANG_AMD_CI") == "1":
|
|
||||||
time.sleep(180)
|
|
||||||
|
|
||||||
def test_mmlu(self):
|
def test_mmlu(self):
|
||||||
args = SimpleNamespace(
|
args = SimpleNamespace(
|
||||||
base_url=self.base_url,
|
base_url=self.base_url,
|
||||||
|
|||||||
@@ -90,9 +90,9 @@ class TestDeepseekV3MTP(CustomTestCase):
|
|||||||
"2",
|
"2",
|
||||||
"--speculative-num-draft-tokens",
|
"--speculative-num-draft-tokens",
|
||||||
"4",
|
"4",
|
||||||
"--mem-fraction-static",
|
|
||||||
"0.7",
|
|
||||||
]
|
]
|
||||||
|
if os.environ.get("SGLANG_AMD_CI") != "1":
|
||||||
|
other_args += ["--mem-frac", "0.7"]
|
||||||
cls.process = popen_launch_server(
|
cls.process = popen_launch_server(
|
||||||
cls.model,
|
cls.model,
|
||||||
cls.base_url,
|
cls.base_url,
|
||||||
|
|||||||
Reference in New Issue
Block a user