diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index c08e3e25c..aea02c5e5 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -92,7 +92,7 @@ jobs: unittest-test-backend-8-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false - needs: [unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu] + needs: [unit-test-frontend, unit-test-backend-2-gpu] runs-on: 8-gpu-runner steps: - name: Checkout code @@ -271,24 +271,6 @@ jobs: cd test/srt python3 test_moe_eval_accuracy_large.py - unit-test-backend-pd: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false - runs-on: 8-gpu-runner - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - bash scripts/ci_install_dependency.sh - - - name: Run test - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_disaggregation.TestDisaggregationMooncake.test_gsm8k - finish: if: always() needs: [ diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index a6ff1a888..030355e0c 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -305,6 +305,12 @@ class ServerArgs: if self.grammar_backend is None: self.grammar_backend = "xgrammar" + if self.pp_size > 1: + self.disable_overlap_schedule = True + logger.warning( + "Overlap scheduler is disabled because of using pipeline parallelism." + ) + # Data parallelism attention if self.enable_dp_attention: self.schedule_conservativeness = self.schedule_conservativeness * 0.3 diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh index 4e50cbbdb..7344d1f86 100755 --- a/scripts/ci_install_dependency.sh +++ b/scripts/ci_install_dependency.sh @@ -5,25 +5,22 @@ set -euxo pipefail SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" bash "${SCRIPT_DIR}/killall_sglang.sh" +# Update pip +pip install --upgrade pip + # Clean up existing installations -pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm || true +pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm pip cache purge rm -rf /root/.cache/flashinfer rm -rf /usr/local/lib/python3.10/dist-packages/flashinfer* rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel* -# Update pip -pip install --upgrade pip - -# Install sgl-kernel -pip install sgl-kernel==0.1.2.post1 --no-cache-dir - # Install the main package pip install -e "python[all]" # Install additional dependencies pip install torch_memory_saver -pip install transformers==4.51.0 sentence_transformers accelerate peft pandas datasets timm torchaudio==2.6.0 +pip install transformers==4.51.0 timm torchaudio==2.6.0 sentence_transformers accelerate peft pandas datasets mooncake-transfer-engine # For compiling xgrammar kernels pip install cuda-python nvidia-cuda-nvrtc-cu12 diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 50cc0d9aa..a70679f50 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -85,9 +85,6 @@ suites = { TestFile("test_w8a8_quantization.py", 46), TestFile("models/lora/test_lora_cuda_graph.py", 250), ], - "per-commit-pd": [ - TestFile("test_disaggregation.py", 90), - ], "per-commit-2-gpu": [ TestFile("models/lora/test_lora_tp.py", 116), TestFile("test_data_parallelism.py", 73), @@ -105,6 +102,7 @@ suites = { # TestFile("test_deepep_low_latency.py", 50), # TestFile("test_moe_deepep_eval_accuracy_large.py", 250), TestFile("test_local_attn.py", 250), + TestFile("test_disaggregation.py", 90), TestFile("test_full_deepseek_v3.py", 250), TestFile("test_pp_single_node.py", 150), ], diff --git a/test/srt/test_disaggregation.py b/test/srt/test_disaggregation.py index ee8bca0b3..e3008598a 100644 --- a/test/srt/test_disaggregation.py +++ b/test/srt/test_disaggregation.py @@ -1,11 +1,9 @@ import subprocess -import threading import time import unittest from types import SimpleNamespace import requests -import torch from sglang.srt.utils import kill_process_tree from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k diff --git a/test/srt/test_pp_single_node.py b/test/srt/test_pp_single_node.py index b69659403..4588c1326 100644 --- a/test/srt/test_pp_single_node.py +++ b/test/srt/test_pp_single_node.py @@ -9,13 +9,10 @@ import time import unittest from types import SimpleNamespace -import requests - from sglang.bench_one_batch_server import BenchArgs as OneBatchBenchArgs from sglang.srt.server_args import ServerArgs from sglang.srt.utils import kill_process_tree from sglang.test.few_shot_gsm8k import run_eval -from sglang.test.runners import DEFAULT_PROMPTS from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -28,17 +25,16 @@ from sglang.test.test_utils import ( class TestPPAccuracy(unittest.TestCase): @classmethod def setUpClass(cls): - # These config helps find a leak. - os.environ["SGLANG_IS_IN_CI"] = "1" cls.base_url = "http://127.0.0.1:23333" cls.process = popen_launch_server( DEFAULT_MODEL_NAME_FOR_TEST, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ + "--tp-size", + 2, "--pp-size", 4, - "--disable-overlap-schedule", "--chunked-prefill-size", 256, ], @@ -66,49 +62,6 @@ class TestPPAccuracy(unittest.TestCase): time.sleep(5) -# class TestPPAccuracyFlashInfer(unittest.TestCase): -# @classmethod -# def setUpClass(cls): -# # These config helps find a leak. -# os.environ["SGLANG_IS_IN_CI"] = "1" -# cls.base_url = "http://127.0.0.1:23333" -# cls.process = popen_launch_server( -# DEFAULT_MODEL_NAME_FOR_TEST, -# cls.base_url, -# timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, -# other_args=[ -# "--pp-size", -# 4, -# "--disable-overlap-schedule", -# "--attention-backend", -# "flashinfer", -# "--chunked-prefill-size", -# 256, -# ], -# ) -# -# @classmethod -# def tearDownClass(cls): -# kill_process_tree(cls.process.pid) -# -# def test_gsm8k(self): -# args = SimpleNamespace( -# num_shots=5, -# data_path=None, -# num_questions=200, -# max_new_tokens=512, -# parallel=128, -# host="http://127.0.0.1", -# port=int(self.base_url.split(":")[-1]), -# ) -# metrics = run_eval(args) -# print(f"{metrics=}") -# -# self.assertGreater(metrics["accuracy"], 0.75) -# # Wait a little bit so that the memory check happens. -# time.sleep(5) - - class TestFixedBugs(unittest.TestCase): def test_chunked_prefill_with_small_bs(self): model = DEFAULT_MODEL_NAME_FOR_TEST @@ -124,7 +77,6 @@ class TestFixedBugs(unittest.TestCase): 2, "--pp-size", 2, - "--disable-overlap-schedule", "--chunked-prefill", 256, "--max-running-requests",