Fix GPU OOM (#6564)

Co-authored-by: michael <michael.zhang@amd.com>
2025-05-25 07:38:39 +08:00
parent 24c035f2e3
commit 7a5e6ce1cb
8 changed files with 15 additions and 16 deletions
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -138,11 +138,6 @@ jobs:
        run: |
          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
      - name: Benchmark online latency (EAGLE)
        timeout-minutes: 15
        run: |
          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
  performance-test-1-gpu-part-2-amd:
    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
        github.event.pull_request.draft == false
--- a/python/sglang/srt/layers/attention/aiter_backend.py
+++ b/python/sglang/srt/layers/attention/aiter_backend.py
@@ -506,6 +506,7 @@ class AiterIndicesUpdaterPrefill:
                spec_info.generate_attn_arg_prefill(
                    req_pool_indices,
                    paged_kernel_lens,
                    None,
                    self.req_to_token,
                )
            )
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -412,6 +412,10 @@ class ModelRunner:
        if not server_args.disable_chunked_prefix_cache:
            logger.info("Chunked prefix cache is turned on.")
        if server_args.attention_backend == "aiter":
            if self.model_config.context_len > 8192:
                self.mem_fraction_static *= 0.85
    def init_torch_distributed(self):
        logger.info("Init torch distributed begin.")
--- a/scripts/amd_ci_install_dependency.sh
+++ b/scripts/amd_ci_install_dependency.sh
@@ -5,6 +5,7 @@ set -euo pipefail
 docker exec ci_sglang pip install --upgrade pip
 docker exec ci_sglang pip uninstall sgl-kernel -y || true
 docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
 docker exec ci_sglang pip install -e "python[dev_hip]"
 docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
 docker exec -w /human-eval ci_sglang pip install -e .
--- a/test/srt/test_bench_one_batch.py
+++ b/test/srt/test_bench_one_batch.py
@@ -62,7 +62,10 @@ class TestBenchOneBatch(CustomTestCase):
                f"### test_torch_compile_tp2_bs1 (Mixtral-8x7B)\n"
                f"output_throughput: {output_throughput:.2f} token/s\n"
            )
-            self.assertGreater(output_throughput, 220)
+            if os.getenv("SGLANG_AMD_CI") == "1":
                self.assertGreater(output_throughput, 200)
            else:
                self.assertGreater(output_throughput, 220)
 if __name__ == "__main__":
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -32,7 +32,7 @@ class TestBenchServing(CustomTestCase):
                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
            )
            if os.getenv("SGLANG_AMD_CI") == "1":
-                self.assertGreater(res["output_throughput"], 3500)
+                self.assertGreater(res["output_throughput"], 3150)
            else:
                self.assertGreater(res["output_throughput"], 3800)
@@ -70,7 +70,7 @@ class TestBenchServing(CustomTestCase):
                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
            )
            if os.getenv("SGLANG_AMD_CI") == "1":
-                self.assertGreater(res["output_throughput"], 3500)
+                self.assertGreater(res["output_throughput"], 3050)
            else:
                self.assertGreater(res["output_throughput"], 3800)
@@ -126,7 +126,7 @@ class TestBenchServing(CustomTestCase):
                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
            )
            if os.getenv("SGLANG_AMD_CI") == "1":
-                self.assertGreater(res["output_throughput"], 4000)
+                self.assertGreater(res["output_throughput"], 3500)
            else:
                self.assertGreater(res["output_throughput"], 4300)
--- a/test/srt/test_eval_accuracy_large.py
+++ b/test/srt/test_eval_accuracy_large.py
@@ -37,11 +37,6 @@ class TestEvalAccuracyLarge(CustomTestCase):
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)
    def tearDown(self):
        # Delay between tests to allow GPU memory cleanup
        if os.getenv("SGLANG_AMD_CI") == "1":
            time.sleep(180)
    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
--- a/test/srt/test_full_deepseek_v3.py
+++ b/test/srt/test_full_deepseek_v3.py
@@ -90,9 +90,9 @@ class TestDeepseekV3MTP(CustomTestCase):
            "2",
            "--speculative-num-draft-tokens",
            "4",
            "--mem-fraction-static",
            "0.7",
        ]
        if os.environ.get("SGLANG_AMD_CI") != "1":
            other_args += ["--mem-frac", "0.7"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,