diff --git a/.github/workflows/accuracy-test.yml b/.github/workflows/accuracy-test.yml index 16bb584f4..b60a9c6d4 100644 --- a/.github/workflows/accuracy-test.yml +++ b/.github/workflows/accuracy-test.yml @@ -20,7 +20,7 @@ concurrency: jobs: accuracy-test: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: accuracy + runs-on: accuracy-test steps: - name: Checkout code @@ -28,9 +28,6 @@ jobs: - name: Install dependencies run: | - source $HOME/venv/bin/activate - echo "$HOME/venv/bin" >> $GITHUB_PATH - pip install --upgrade pip pip install -e "python[all]" pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall @@ -40,7 +37,7 @@ jobs: pip install -e . - name: Evaluate Accuracy + timeout-minutes: 20 run: | cd test/srt python3 test_eval_accuracy_large.py - timeout-minutes: 20 diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index ad271c37e..8d3387041 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -20,7 +20,7 @@ concurrency: jobs: e2e-test: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: e2e + runs-on: e2e-test steps: - name: Checkout code @@ -28,27 +28,24 @@ jobs: - name: Install dependencies run: | - source $HOME/venv/bin/activate - echo "$HOME/venv/bin" >> $GITHUB_PATH - pip install --upgrade pip pip install -e "python[all]" pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall - name: Benchmark Serving Throughput + timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default - timeout-minutes: 10 - name: Benchmark Serving Throughput (w/o RadixAttention) + timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache - timeout-minutes: 10 - name: Benchmark Serving Throughput (w/o ChunkedPrefill) + timeout-minutes: 10 
run: | cd test/srt python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill - timeout-minutes: 10 diff --git a/.github/workflows/moe-test.yml b/.github/workflows/moe-test.yml index 51f7d0226..dd5665a3f 100644 --- a/.github/workflows/moe-test.yml +++ b/.github/workflows/moe-test.yml @@ -18,30 +18,28 @@ concurrency: cancel-in-progress: true jobs: - moe-test: - if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: accuracy - - steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Install dependencies - run: | - source $HOME/venv/bin/activate - echo "$HOME/venv/bin" >> $GITHUB_PATH - - pip install --upgrade pip - pip install -e "python[all]" - pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + moe-test: + if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' + runs-on: moe-test - - name: Benchmark MOE Serving Throughput - uses: nick-fields/retry@v3 - with: - timeout_minutes: 15 - max_attempts: 2 - retry_on: error - command: | - cd test/srt - python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default - python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -e "python[all]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + + - name: Benchmark MoE Serving Throughput + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default + + - name: Benchmark MoE Serving Throughput (w/o RadixAttention) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache diff --git a/.github/workflows/unit-test.yml 
b/.github/workflows/unit-test.yml index 607cb865d..e2d7951be 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -20,7 +20,7 @@ concurrency: jobs: unit-test: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: unit + runs-on: unit-test steps: - name: Checkout code @@ -28,9 +28,6 @@ jobs: - name: Install dependencies run: | - source $HOME/venv/bin/activate - echo "$HOME/venv/bin" >> $GITHUB_PATH - pip install --upgrade pip pip install -e "python[all]" pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall @@ -38,13 +35,13 @@ jobs: pip install sentence_transformers - name: Test Backend Runtime + timeout-minutes: 20 run: | cd test/srt python3 run_suite.py --suite minimal - timeout-minutes: 20 - name: Test Frontend Language + timeout-minutes: 10 run: | cd test/lang python3 run_suite.py --suite minimal - timeout-minutes: 10 diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index 9a5bd4fd5..37ed2cf9a 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -24,6 +24,7 @@ import torch.nn.functional as F from transformers import AutoModelForCausalLM, AutoTokenizer from sglang.srt.server import Runtime +from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER DEFAULT_PROMPTS = [ # the output of gemma-2-2b from SRT is unstable on the commented prompt @@ -171,7 +172,7 @@ class SRTRunner: torch_dtype, is_generation, tp_size=1, - port=5157, + port=DEFAULT_PORT_FOR_SRT_TEST_RUNNER, ): self.is_generation = is_generation self.runtime = Runtime( diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index ac19d9370..3389e619c 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -25,6 +25,7 @@ DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct" DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1" if 
os.getenv("SGLANG_IS_IN_CI", "false") == "true": + DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157 DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157" DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157" DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157" diff --git a/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py b/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py index e72dc30f9..4e91f7235 100644 --- a/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py +++ b/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py @@ -5,7 +5,11 @@ from multiprocessing import Process import requests from sglang.srt.utils import kill_child_process -from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_UNIT_TEST, + popen_launch_server, +) class TestBatchPenalizerE2E(unittest.TestCase): @@ -13,7 +17,7 @@ class TestBatchPenalizerE2E(unittest.TestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = f"http://127.0.0.1:{8157}" + cls.base_url = DEFAULT_URL_FOR_UNIT_TEST cls.process = popen_launch_server( cls.model, cls.base_url, diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index 0f136fe6e..48157b8db 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -67,7 +67,7 @@ class TestOpenAIVisionServer(unittest.TestCase): assert response.choices[0].message.role == "assistant" text = response.choices[0].message.content assert isinstance(text, str) - assert "car" in text or "taxi" in text, text + assert "logo" in text, text assert response.id assert response.created assert response.usage.prompt_tokens > 0