Fix GPU OOM (#6564)

Co-authored-by: michael <michael.zhang@amd.com>
2025-05-25 07:38:39 +08:00
parent 24c035f2e3
commit 7a5e6ce1cb
8 changed files with 15 additions and 16 deletions
--- a/test/srt/test_bench_one_batch.py
+++ b/test/srt/test_bench_one_batch.py
@@ -62,7 +62,10 @@ class TestBenchOneBatch(CustomTestCase):
                f"### test_torch_compile_tp2_bs1 (Mixtral-8x7B)\n"
                f"output_throughput: {output_throughput:.2f} token/s\n"
            )
-            self.assertGreater(output_throughput, 220)
+            if os.getenv("SGLANG_AMD_CI") == "1":
+                self.assertGreater(output_throughput, 200)
+            else:
+                self.assertGreater(output_throughput, 220)


 if __name__ == "__main__":
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -32,7 +32,7 @@ class TestBenchServing(CustomTestCase):
                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
            )
            if os.getenv("SGLANG_AMD_CI") == "1":
-                self.assertGreater(res["output_throughput"], 3500)
+                self.assertGreater(res["output_throughput"], 3150)
            else:
                self.assertGreater(res["output_throughput"], 3800)

@@ -70,7 +70,7 @@ class TestBenchServing(CustomTestCase):
                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
            )
            if os.getenv("SGLANG_AMD_CI") == "1":
-                self.assertGreater(res["output_throughput"], 3500)
+                self.assertGreater(res["output_throughput"], 3050)
            else:
                self.assertGreater(res["output_throughput"], 3800)

@@ -126,7 +126,7 @@ class TestBenchServing(CustomTestCase):
                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
            )
            if os.getenv("SGLANG_AMD_CI") == "1":
-                self.assertGreater(res["output_throughput"], 4000)
+                self.assertGreater(res["output_throughput"], 3500)
            else:
                self.assertGreater(res["output_throughput"], 4300)

--- a/test/srt/test_eval_accuracy_large.py
+++ b/test/srt/test_eval_accuracy_large.py
@@ -37,11 +37,6 @@ class TestEvalAccuracyLarge(CustomTestCase):
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

-    def tearDown(self):
-        # Delay between tests to allow GPU memory cleanup
-        if os.getenv("SGLANG_AMD_CI") == "1":
-            time.sleep(180)
-
    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
--- a/test/srt/test_full_deepseek_v3.py
+++ b/test/srt/test_full_deepseek_v3.py
@@ -90,9 +90,9 @@ class TestDeepseekV3MTP(CustomTestCase):
            "2",
            "--speculative-num-draft-tokens",
            "4",
-            "--mem-fraction-static",
-            "0.7",
        ]
+        if os.environ.get("SGLANG_AMD_CI") != "1":
+            other_args += ["--mem-frac", "0.7"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,