[CI] Reorganize the 8 gpu tests (#6192)
This commit is contained in:
20
.github/workflows/pr-test.yml
vendored
20
.github/workflows/pr-test.yml
vendored
@@ -92,7 +92,7 @@ jobs:
|
||||
unittest-test-backend-8-gpu:
|
||||
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||
github.event.pull_request.draft == false
|
||||
needs: [unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu]
|
||||
needs: [unit-test-frontend, unit-test-backend-2-gpu]
|
||||
runs-on: 8-gpu-runner
|
||||
steps:
|
||||
- name: Checkout code
|
||||
@@ -271,24 +271,6 @@ jobs:
|
||||
cd test/srt
|
||||
python3 test_moe_eval_accuracy_large.py
|
||||
|
||||
unit-test-backend-pd:
|
||||
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
|
||||
github.event.pull_request.draft == false
|
||||
runs-on: 8-gpu-runner
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
bash scripts/ci_install_dependency.sh
|
||||
|
||||
- name: Run test
|
||||
timeout-minutes: 10
|
||||
run: |
|
||||
cd test/srt
|
||||
python3 -m unittest test_disaggregation.TestDisaggregationMooncake.test_gsm8k
|
||||
|
||||
finish:
|
||||
if: always()
|
||||
needs: [
|
||||
|
||||
@@ -305,6 +305,12 @@ class ServerArgs:
|
||||
if self.grammar_backend is None:
|
||||
self.grammar_backend = "xgrammar"
|
||||
|
||||
if self.pp_size > 1:
|
||||
self.disable_overlap_schedule = True
|
||||
logger.warning(
|
||||
"Overlap scheduler is disabled because of using pipeline parallelism."
|
||||
)
|
||||
|
||||
# Data parallelism attention
|
||||
if self.enable_dp_attention:
|
||||
self.schedule_conservativeness = self.schedule_conservativeness * 0.3
|
||||
|
||||
@@ -5,25 +5,22 @@ set -euxo pipefail
|
||||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
||||
bash "${SCRIPT_DIR}/killall_sglang.sh"
|
||||
|
||||
# Update pip
|
||||
pip install --upgrade pip
|
||||
|
||||
# Clean up existing installations
|
||||
pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm || true
|
||||
pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm
|
||||
pip cache purge
|
||||
rm -rf /root/.cache/flashinfer
|
||||
rm -rf /usr/local/lib/python3.10/dist-packages/flashinfer*
|
||||
rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel*
|
||||
|
||||
# Update pip
|
||||
pip install --upgrade pip
|
||||
|
||||
# Install sgl-kernel
|
||||
pip install sgl-kernel==0.1.2.post1 --no-cache-dir
|
||||
|
||||
# Install the main package
|
||||
pip install -e "python[all]"
|
||||
|
||||
# Install additional dependencies
|
||||
pip install torch_memory_saver
|
||||
pip install transformers==4.51.0 sentence_transformers accelerate peft pandas datasets timm torchaudio==2.6.0
|
||||
pip install transformers==4.51.0 timm torchaudio==2.6.0 sentence_transformers accelerate peft pandas datasets mooncake-transfer-engine
|
||||
|
||||
# For compiling xgrammar kernels
|
||||
pip install cuda-python nvidia-cuda-nvrtc-cu12
|
||||
|
||||
@@ -85,9 +85,6 @@ suites = {
|
||||
TestFile("test_w8a8_quantization.py", 46),
|
||||
TestFile("models/lora/test_lora_cuda_graph.py", 250),
|
||||
],
|
||||
"per-commit-pd": [
|
||||
TestFile("test_disaggregation.py", 90),
|
||||
],
|
||||
"per-commit-2-gpu": [
|
||||
TestFile("models/lora/test_lora_tp.py", 116),
|
||||
TestFile("test_data_parallelism.py", 73),
|
||||
@@ -105,6 +102,7 @@ suites = {
|
||||
# TestFile("test_deepep_low_latency.py", 50),
|
||||
# TestFile("test_moe_deepep_eval_accuracy_large.py", 250),
|
||||
TestFile("test_local_attn.py", 250),
|
||||
TestFile("test_disaggregation.py", 90),
|
||||
TestFile("test_full_deepseek_v3.py", 250),
|
||||
TestFile("test_pp_single_node.py", 150),
|
||||
],
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
import subprocess
|
||||
import threading
|
||||
import time
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
|
||||
|
||||
@@ -9,13 +9,10 @@ import time
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
import requests
|
||||
|
||||
from sglang.bench_one_batch_server import BenchArgs as OneBatchBenchArgs
|
||||
from sglang.srt.server_args import ServerArgs
|
||||
from sglang.srt.utils import kill_process_tree
|
||||
from sglang.test.few_shot_gsm8k import run_eval
|
||||
from sglang.test.runners import DEFAULT_PROMPTS
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
@@ -28,17 +25,16 @@ from sglang.test.test_utils import (
|
||||
class TestPPAccuracy(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
# These config helps find a leak.
|
||||
os.environ["SGLANG_IS_IN_CI"] = "1"
|
||||
cls.base_url = "http://127.0.0.1:23333"
|
||||
cls.process = popen_launch_server(
|
||||
DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
cls.base_url,
|
||||
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
other_args=[
|
||||
"--tp-size",
|
||||
2,
|
||||
"--pp-size",
|
||||
4,
|
||||
"--disable-overlap-schedule",
|
||||
"--chunked-prefill-size",
|
||||
256,
|
||||
],
|
||||
@@ -66,49 +62,6 @@ class TestPPAccuracy(unittest.TestCase):
|
||||
time.sleep(5)
|
||||
|
||||
|
||||
# class TestPPAccuracyFlashInfer(unittest.TestCase):
|
||||
# @classmethod
|
||||
# def setUpClass(cls):
|
||||
# # These config helps find a leak.
|
||||
# os.environ["SGLANG_IS_IN_CI"] = "1"
|
||||
# cls.base_url = "http://127.0.0.1:23333"
|
||||
# cls.process = popen_launch_server(
|
||||
# DEFAULT_MODEL_NAME_FOR_TEST,
|
||||
# cls.base_url,
|
||||
# timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
# other_args=[
|
||||
# "--pp-size",
|
||||
# 4,
|
||||
# "--disable-overlap-schedule",
|
||||
# "--attention-backend",
|
||||
# "flashinfer",
|
||||
# "--chunked-prefill-size",
|
||||
# 256,
|
||||
# ],
|
||||
# )
|
||||
#
|
||||
# @classmethod
|
||||
# def tearDownClass(cls):
|
||||
# kill_process_tree(cls.process.pid)
|
||||
#
|
||||
# def test_gsm8k(self):
|
||||
# args = SimpleNamespace(
|
||||
# num_shots=5,
|
||||
# data_path=None,
|
||||
# num_questions=200,
|
||||
# max_new_tokens=512,
|
||||
# parallel=128,
|
||||
# host="http://127.0.0.1",
|
||||
# port=int(self.base_url.split(":")[-1]),
|
||||
# )
|
||||
# metrics = run_eval(args)
|
||||
# print(f"{metrics=}")
|
||||
#
|
||||
# self.assertGreater(metrics["accuracy"], 0.75)
|
||||
# # Wait a little bit so that the memory check happens.
|
||||
# time.sleep(5)
|
||||
|
||||
|
||||
class TestFixedBugs(unittest.TestCase):
|
||||
def test_chunked_prefill_with_small_bs(self):
|
||||
model = DEFAULT_MODEL_NAME_FOR_TEST
|
||||
@@ -124,7 +77,6 @@ class TestFixedBugs(unittest.TestCase):
|
||||
2,
|
||||
"--pp-size",
|
||||
2,
|
||||
"--disable-overlap-schedule",
|
||||
"--chunked-prefill",
|
||||
256,
|
||||
"--max-running-requests",
|
||||
|
||||
Reference in New Issue
Block a user