diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 7becf7181..a9ec66794 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -33,7 +33,7 @@ jobs:
           pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

       - name: Run test
-        timeout-minutes: 20
+        timeout-minutes: 10
         run: |
           cd test/lang
           python3 run_suite.py --suite minimal
@@ -73,7 +73,7 @@ jobs:
           pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

       - name: Run test
-        timeout-minutes: 30
+        timeout-minutes: 20
         run: |
           cd test/srt
           python3 run_suite.py --suite minimal --range-begin 5 --range-end 17
@@ -93,10 +93,30 @@ jobs:
           pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

       - name: Run test
-        timeout-minutes: 30
+        timeout-minutes: 20
         run: |
           cd test/srt
-          python3 run_suite.py --suite minimal --range-begin 17
+          python3 run_suite.py --suite minimal --range-begin 17 --range-end 20
+
+  unit-test-backend-part-4:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 1-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e "python[dev]"
+          pip install transformers==4.45.2
+          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+      - name: Run test
+        timeout-minutes: 20
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite minimal --range-begin 20

   performance-test-1-gpu-part-1:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -263,7 +283,7 @@ jobs:

   finish:
     needs: [
-      unit-test-frontend, unit-test-backend-part-1, unit-test-backend-part-2, unit-test-backend-part-3,
+      unit-test-frontend, unit-test-backend-part-1, unit-test-backend-part-2, unit-test-backend-part-3, unit-test-backend-part-4,
       performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
       accuracy-test-1-gpu, accuracy-test-2-gpu
     ]
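A note on the new four-way backend split above: the workflow relies on run_suite.py slicing the selected suite with --range-begin/--range-end. The snippet below is a minimal sketch of that slicing; the flag names come from the workflow, but the suite registry and file names here are placeholders rather than the real contents of test/srt/run_suite.py.

import argparse

# Placeholder suite registry; the real one lives in test/srt/run_suite.py.
suites = {"minimal": [f"test_{i}.py" for i in range(25)]}

parser = argparse.ArgumentParser()
parser.add_argument("--suite", default="minimal")
parser.add_argument("--range-begin", type=int, default=0)
parser.add_argument("--range-end", type=int, default=None)
args = parser.parse_args()

# One job passes --range-begin 17 --range-end 20 and the next passes
# --range-begin 20, so adjacent jobs cover contiguous, non-overlapping slices.
selected = suites[args.suite][args.range_begin : args.range_end]
print(selected)
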
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index 8fb20c6eb..20fc9d52d 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -3,6 +3,7 @@
 import argparse
 import asyncio
 import os
+import random
 import subprocess
 import threading
 import time
@@ -20,6 +21,7 @@ from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback

 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
@@ -400,7 +402,7 @@ def popen_launch_server(
     api_key: Optional[str] = None,
     other_args: tuple = (),
     env: Optional[dict] = None,
-    return_stdout_stderr: bool = False,
+    return_stdout_stderr: Optional[tuple] = None,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]
@@ -423,8 +425,8 @@ def popen_launch_server(
     if return_stdout_stderr:
         process = subprocess.Popen(
             command,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stdout=return_stdout_stderr[0],
+            stderr=return_stdout_stderr[1],
             env=env,
             text=True,
         )
@@ -631,3 +633,91 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
         rouge_l_scores.append(fmeasure)

     return rouge_l_scores
+
+
+STDOUT_FILENAME = "stdout.txt"
+STDERR_FILENAME = "stderr.txt"
+
+
+def read_output(output_lines):
+    """Stream the server log file in real time and keep its lines for later checks."""
+    pt = 0
+    while pt >= 0:
+        # Stop once the caller has removed the log file during cleanup.
+        if pt > 0 and not os.path.exists(STDERR_FILENAME):
+            break
+        lines = open(STDERR_FILENAME).readlines()
+        output_lines[:] = lines
+        for line in lines[pt:]:
+            print(line, end="", flush=True)
+            pt += 1
+        time.sleep(0.1)
+
+
+def run_mmlu_test(
+    disable_radix_cache,
+    enable_mixed_chunk=False,
+    enable_overlap=False,
+    chunked_prefill_size=32,
+):
+    other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
+    if disable_radix_cache:
+        other_args += ["--disable-radix-cache"]
+    if enable_mixed_chunk:
+        other_args += ["--enable-mixed-chunk"]
+    if enable_overlap:
+        other_args += ["--enable-overlap-schedule"]
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    port = random.randint(4000, 5000)
+    base_url = f"http://127.0.0.1:{port}"
+
+    # Create files and launch the server
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+        return_stdout_stderr=(stdout, stderr),
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines,))
+    t.start()
+
+    # Run the eval
+    args = SimpleNamespace(
+        base_url=base_url,
+        model=model,
+        eval_name="mmlu",
+        num_examples=128,
+        num_threads=128,
+    )
+
+    try:
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        assert metrics["score"] >= 0.65
+    finally:
+        # Clean up everything, even if the eval or the assertion fails
+        kill_child_process(process.pid)
+        stdout.close()
+        stderr.close()
+        os.remove(STDOUT_FILENAME)
+        os.remove(STDERR_FILENAME)
+        t.join()
+
+    # Assert success
+    has_new_server = False
+    has_leak = False
+    for line in output_lines:
+        if "The server is fired" in line:
+            has_new_server = True
+        if "leak" in line:
+            has_leak = True
+
+    assert has_new_server
+    # assert not has_leak
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index e8fadcef7..3f8a1fecb 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -15,7 +15,7 @@ suites = {
         "test_embedding_openai_server.py",
         "test_eval_accuracy_mini.py",
         "test_json_constrained.py",
-        # "test_large_max_new_tokens.py", # This test hangs on CI due to unknown reasons
+        "test_large_max_new_tokens.py",
         "test_openai_server.py",
         "test_overlap_schedule.py",
         "test_pytorch_sampling_backend.py",
diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py
index 8e9010395..5f618585f 100644
--- a/test/srt/test_chunked_prefill.py
+++ b/test/srt/test_chunked_prefill.py
@@ -1,65 +1,31 @@
-import unittest
-from types import SimpleNamespace
+"""
+python3 -m unittest test_chunked_prefill.TestChunkedPrefill.test_mixed_chunked_prefill_without_radix_cache
+"""
+
+import unittest

-from sglang.srt.utils import kill_child_process
-from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    popen_launch_server,
     run_bench_serving,
+    run_mmlu_test,
 )


 class TestChunkedPrefill(unittest.TestCase):
-    def run_mmlu(
-        self, disable_radix_cache, enable_mixed_chunk, chunked_prefill_size=32
-    ):
-        other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
-        if disable_radix_cache:
-            other_args += ["--disable-radix-cache"]
-
-        if enable_mixed_chunk:
-            other_args += ["--enable-mixed-chunk"]
-
-        model = DEFAULT_MODEL_NAME_FOR_TEST
-        base_url = DEFAULT_URL_FOR_TEST
-        process = popen_launch_server(
-            model,
-            base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=other_args,
-        )
-
-        args = SimpleNamespace(
-            base_url=base_url,
-            model=model,
-            eval_name="mmlu",
-            num_examples=64,
-            num_threads=32,
-        )
-
-        try:
-            metrics = run_eval(args)
-            assert metrics["score"] >= 0.65
-        finally:
-            kill_child_process(process.pid)
-
     def test_chunked_prefill(self):
-        self.run_mmlu(disable_radix_cache=False, enable_mixed_chunk=False)
+        run_mmlu_test(disable_radix_cache=False, enable_mixed_chunk=False)

     def test_mixed_chunked_prefill(self):
-        self.run_mmlu(disable_radix_cache=False, enable_mixed_chunk=True)
+        run_mmlu_test(disable_radix_cache=False, enable_mixed_chunk=True)

     def test_chunked_prefill_without_radix_cache(self):
-        self.run_mmlu(disable_radix_cache=True, enable_mixed_chunk=False)
+        run_mmlu_test(disable_radix_cache=True, enable_mixed_chunk=False)

     def test_mixed_chunked_prefill_without_radix_cache(self):
-        self.run_mmlu(disable_radix_cache=True, enable_mixed_chunk=True)
+        run_mmlu_test(disable_radix_cache=True, enable_mixed_chunk=True)

     def test_no_chunked_prefill(self):
-        self.run_mmlu(
+        run_mmlu_test(
             disable_radix_cache=False, enable_mixed_chunk=False, chunked_prefill_size=-1
         )
diff --git a/test/srt/test_large_max_new_tokens.py b/test/srt/test_large_max_new_tokens.py
index ac0908e0c..8efb83e7b 100644
--- a/test/srt/test_large_max_new_tokens.py
+++ b/test/srt/test_large_max_new_tokens.py
@@ -1,3 +1,7 @@
+"""
+python3 -m unittest test_large_max_new_tokens.TestLargeMaxNewTokens.test_chat_completion
+"""
+
 import os
 import unittest
 from concurrent.futures import ThreadPoolExecutor
@@ -20,6 +24,10 @@ class TestLargeMaxNewTokens(unittest.TestCase):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.api_key = "sk-123456"
+
+        cls.stdout = open("stdout.txt", "w")
+        cls.stderr = open("stderr.txt", "w")
+
         cls.process = popen_launch_server(
             cls.model,
             cls.base_url,
@@ -27,7 +35,7 @@ class TestLargeMaxNewTokens(unittest.TestCase):
             api_key=cls.api_key,
             other_args=("--max-total-token", "1024", "--context-len", "8192"),
             env={"SGLANG_CLIP_MAX_NEW_TOKENS": "256", **os.environ},
-            return_stdout_stderr=True,
+            return_stdout_stderr=(cls.stdout, cls.stderr),
         )
         cls.base_url += "/v1"
         cls.tokenizer = get_tokenizer(DEFAULT_MODEL_NAME_FOR_TEST)
@@ -35,6 +43,10 @@ class TestLargeMaxNewTokens(unittest.TestCase):
     @classmethod
     def tearDownClass(cls):
         kill_child_process(cls.process.pid)
+        cls.stdout.close()
+        cls.stderr.close()
+        os.remove("stdout.txt")
+        os.remove("stderr.txt")

     def run_chat_completion(self):
         client = openai.Client(api_key=self.api_key, base_url=self.base_url)
@@ -56,16 +68,21 @@ class TestLargeMaxNewTokens(unittest.TestCase):
         futures = []
         with ThreadPoolExecutor(num_requests) as executor:
+            # Send multiple requests
             for i in range(num_requests):
                 futures.append(executor.submit(self.run_chat_completion))

-            all_requests_running = False
-            for line in iter(self.process.stderr.readline, ""):
-                line = str(line)
-                print(line, end="")
-                if f"#running-req: {num_requests}" in line:
-                    all_requests_running = True
-                    break
+            # Ensure that they are running concurrently
+            pt = 0
+            while pt >= 0:
+                lines = open("stderr.txt").readlines()
+                for line in lines[pt:]:
+                    print(line, end="", flush=True)
+                    if f"#running-req: {num_requests}" in line:
+                        all_requests_running = True
+                        pt = -1
+                        break
+                    pt += 1

         assert all_requests_running
diff --git a/test/srt/test_overlap_schedule.py b/test/srt/test_overlap_schedule.py
index 76d17aff2..c3d4a570d 100644
--- a/test/srt/test_overlap_schedule.py
+++ b/test/srt/test_overlap_schedule.py
@@ -1,65 +1,27 @@
 """
 Usage:
-SGLANG_IS_IN_CI=true python3 -m unittest test_overlap_schedule.TestOverlapSchedule.test_radix_attention_chunked_prefill
-SGLANG_IS_IN_CI=true python3 test_overlap_schedule.py
+python3 -m unittest test_overlap_schedule.TestOverlapSchedule.test_radix_attention_chunked_prefill
+python3 test_overlap_schedule.py
 """

 import unittest
-from types import SimpleNamespace

-from sglang.srt.utils import kill_child_process
-from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import (
-    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    popen_launch_server,
-)
+from sglang.test.test_utils import run_mmlu_test


 class TestOverlapSchedule(unittest.TestCase):
-    def run_mmlu(self, disable_radix_cache, chunked_prefill_size=32):
-        other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
-        if disable_radix_cache:
-            other_args += ["--disable-radix-cache"]
-        other_args += ["--enable-overlap-schedule"]
-
-        model = DEFAULT_MODEL_NAME_FOR_TEST
-        base_url = DEFAULT_URL_FOR_TEST
-        process = popen_launch_server(
-            model,
-            base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=other_args,
-        )
-
-        args = SimpleNamespace(
-            base_url=base_url,
-            model=model,
-            eval_name="mmlu",
-            num_examples=64,
-            num_threads=32,
-        )
-
-        try:
-            metrics = run_eval(args)
-            assert metrics["score"] >= 0.65
-        finally:
-            kill_child_process(process.pid)
-
     def test_no_radix_attention_chunked_prefill(self):
-        self.run_mmlu(disable_radix_cache=True, chunked_prefill_size=32)
+        run_mmlu_test(disable_radix_cache=True, enable_overlap=True, chunked_prefill_size=32)

     def test_no_radix_attention_no_chunked_prefill(self):
-        self.run_mmlu(disable_radix_cache=True, chunked_prefill_size=-1)
+        run_mmlu_test(disable_radix_cache=True, enable_overlap=True, chunked_prefill_size=-1)

     def test_radix_attention_chunked_prefill(self):
-        self.run_mmlu(disable_radix_cache=False, chunked_prefill_size=32)
+        run_mmlu_test(disable_radix_cache=False, enable_overlap=True, chunked_prefill_size=32)

     def test_radix_attention_no_chunked_prefill(self):
-        self.run_mmlu(disable_radix_cache=False, chunked_prefill_size=-1)
+        run_mmlu_test(disable_radix_cache=False, enable_overlap=True, chunked_prefill_size=-1)


 if __name__ == "__main__":
     unittest.main()
-    # @unittest.skip("did not support")
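With the shared helper in place, other suites can reuse run_mmlu_test and combine the same server flags. For reference, a hypothetical test exercising mixed chunking together with the overlap scheduler could look like the sketch below; the class and test names are illustrative and are not part of this change.

import unittest

from sglang.test.test_utils import run_mmlu_test


class TestMixedChunkWithOverlap(unittest.TestCase):
    # Hypothetical combination of the flags exposed by run_mmlu_test above.
    def test_mixed_chunk_with_overlap(self):
        run_mmlu_test(
            disable_radix_cache=False,
            enable_mixed_chunk=True,
            enable_overlap=True,
            chunked_prefill_size=32,
        )


if __name__ == "__main__":
    unittest.main()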