From 9ecb18568b0273c4de84ee801129feaae8803d0a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 8 Jun 2025 17:20:46 -0700 Subject: [PATCH] Fix triton sliding window test case (#6981) --- test/srt/test_triton_sliding_window.py | 33 ++++++++++++-------------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/test/srt/test_triton_sliding_window.py b/test/srt/test_triton_sliding_window.py index 865bc80e1..f2b422c5e 100644 --- a/test/srt/test_triton_sliding_window.py +++ b/test/srt/test_triton_sliding_window.py @@ -1,4 +1,3 @@ -import time import unittest from types import SimpleNamespace @@ -10,6 +9,7 @@ from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, CustomTestCase, + is_in_ci, popen_launch_server, ) @@ -45,10 +45,6 @@ class TestSlidingWindowAttentionTriton(CustomTestCase): ) cls.long_context_prompt += "\nNow, summarize the story in one sentence:" - @classmethod - def tearDownClass(cls): - pass - def _test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, @@ -61,7 +57,7 @@ class TestSlidingWindowAttentionTriton(CustomTestCase): metrics = run_eval(args) print(f"MMLU metrics with sliding window: {metrics}") - self.assertGreaterEqual(metrics["score"], 0.61) + self.assertGreaterEqual(metrics["score"], 0.60) def _test_short_context_generation(self): response = requests.post( @@ -97,6 +93,7 @@ class TestSlidingWindowAttentionTriton(CustomTestCase): self.assertGreater(len(result["text"].strip()), 0) print(f"Long context generation result: {result['text'][:100]}...") + @unittest.skipIf(is_in_ci(), "To reduce the CI execution time.") def test_no_cuda_graph(self): self.no_cuda_graph_process = popen_launch_server( self.model, @@ -105,12 +102,12 @@ class TestSlidingWindowAttentionTriton(CustomTestCase): other_args=self.common_args + ["--disable-cuda-graph"], ) - self._test_short_context_generation() - self._test_long_context_generation() - self._test_mmlu() - - kill_process_tree(self.no_cuda_graph_process.pid) - time.sleep(5) + try: + self._test_short_context_generation() + self._test_long_context_generation() + self._test_mmlu() + finally: + kill_process_tree(self.no_cuda_graph_process.pid) def test_cuda_graph(self): self.cuda_graph_process = popen_launch_server( @@ -120,12 +117,12 @@ class TestSlidingWindowAttentionTriton(CustomTestCase): other_args=self.common_args, ) - self._test_short_context_generation() - self._test_long_context_generation() - self._test_mmlu() - - kill_process_tree(self.cuda_graph_process.pid) - time.sleep(5) + try: + self._test_short_context_generation() + self._test_long_context_generation() + self._test_mmlu() + finally: + kill_process_tree(self.cuda_graph_process.pid) if __name__ == "__main__":