diff --git a/.github/workflows/pr-test-h20.yml b/.github/workflows/pr-test-h20.yml
new file mode 100644
index 000000000..e283ea42f
--- /dev/null
+++ b/.github/workflows/pr-test-h20.yml
@@ -0,0 +1,87 @@
+name: PR Test (H20)
+
+# Runs the 8-GPU H20 suite when DeepSeek model, MoE layer, or this file changes.
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:
+    inputs:
+      version:
+        required: true
+        type: choice
+        default: 'release'
+        options:
+          - 'release'
+          - 'nightly'
+
+# A newer run on the same ref cancels the one still in progress.
+concurrency:
+  group: pr-test-h20-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # Exposes the paths-filter result as the `src` output ('true' on a match).
+  check-changes:
+    runs-on: ubuntu-latest
+    outputs:
+      src: ${{ steps.filter.outputs.src }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Detect file changes
+        id: filter
+        uses: dorny/paths-filter@v3
+        with:
+          filters: |
+            src:
+              - "python/sglang/srt/models/deepseek*"
+              - "python/sglang/srt/layers/moe/**"
+              - ".github/workflows/pr-test-h20.yml"
+
+  # Skipped for draft pull requests or when no watched path changed.
+  per-commit-8-gpu-h20:
+    needs: [check-changes]
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false &&
+      needs.check-changes.outputs.src == 'true'
+    runs-on: 8-gpu-h20
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 20
+
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-8-gpu-h20
+
+  pr-test-finish:
+    needs: [
+      check-changes,
+      per-commit-8-gpu-h20,
+    ]
+    # always() is required: without a status function GitHub applies an
+    # implicit success() and skips this job when a needed job fails or is
+    # cancelled, so the failure check below would never run.
+    if: always() && needs.check-changes.outputs.src == 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check all dependent job statuses
+        run: |
+          results=(${{ join(needs.*.result, ' ') }})
+          for result in "${results[@]}"; do
+            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
+              echo "Job failed with result: $result"
+              exit 1
+            fi
+          done
+          echo "All jobs completed successfully"
+          exit 0
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index 8d1e3303d..8b4cb903c 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -78,6 +78,7 @@ DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
 DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
+DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST = "Barrrrry/DeepSeek-R1-W4AFP8"
 
 # Nightly tests
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
diff --git a/test/srt/quant/test_w4a8_deepseek_v3.py b/test/srt/quant/test_w4a8_deepseek_v3.py
new file mode 100644
index 000000000..eb813bd70
--- /dev/null
+++ b/test/srt/quant/test_w4a8_deepseek_v3.py
@@ -0,0 +1,139 @@
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_amd_ci,
+    is_in_ci,
+    popen_launch_server,
+    try_cached_model,
+    write_github_step_summary,
+)
+
+# Upper bound, in seconds, for the server-info query so that an
+# unresponsive server fails the test instead of hanging the CI job.
+SERVER_INFO_TIMEOUT_SECONDS = 60
+
+
+class TestDeepseekV3W4afp8(CustomTestCase):
+    """GSM8K accuracy check for the W4AFP8 DeepSeek checkpoint (TP=8, EP=8)."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Launch one server that is shared by every test in the class."""
+        cls.model = try_cached_model(DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST)
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = ["--trust-remote-code", "--tp", "8", "--ep-size", "8"]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        """Terminate the server process started in setUpClass."""
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        """Require few-shot GSM8K accuracy above 0.92."""
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=1200,
+            parallel=1200,
+            max_new_tokens=512,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"Eval accuracy of GSM8K: {metrics=}")
+
+        self.assertGreater(metrics["accuracy"], 0.92)
+
+
+class TestDeepseekV3W4Afp8Mtp(CustomTestCase):
+    """Same checkpoint with EAGLE speculative decoding enabled."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Launch the shared server with the speculative-decoding flags."""
+        cls.model = try_cached_model(DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST)
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        other_args = [
+            "--tp",
+            "8",
+            "--trust-remote-code",
+            "--ep-size",
+            "8",
+            "--cuda-graph-bs",
+            "256",
+            "--disable-radix-cache",
+            "--speculative-algorithm",
+            "EAGLE",
+            "--speculative-num-steps",
+            "3",
+            "--speculative-eagle-topk",
+            "2",
+            "--speculative-num-draft-tokens",
+            "4",
+        ]
+        if not is_in_amd_ci():
+            other_args += ["--mem-frac", "0.7"]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        """Terminate the server process started in setUpClass."""
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        """Require GSM8K accuracy above 0.935 and accept length above 2.9."""
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        print(f"{metrics=}")
+
+        server_info = requests.get(
+            self.base_url + "/get_server_info",
+            timeout=SERVER_INFO_TIMEOUT_SECONDS,
+        )
+        # Surface an HTTP error directly rather than as a JSON/KeyError below.
+        server_info.raise_for_status()
+        avg_spec_accept_length = server_info.json()["internal_states"][0][
+            "avg_spec_accept_length"
+        ]
+        print(f"{avg_spec_accept_length=}")
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_gsm8k (deepseek-v3 mtp)\n"
+                f'{metrics["accuracy"]=:.3f}\n'
+                f"{avg_spec_accept_length=:.2f}\n"
+            )
+        self.assertGreater(metrics["accuracy"], 0.935)
+        self.assertGreater(avg_spec_accept_length, 2.9)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index bea31af00..c59461e28 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -143,6 +143,9 @@ suites = {
     "per-commit-8-gpu-deepep": [
         TestFile("ep/test_deepep_large.py", 338),
     ],
+    "per-commit-8-gpu-h20": [
+        TestFile("quant/test_w4a8_deepseek_v3.py", 371),
+    ],
     "nightly": [
         TestFile("test_nightly_gsm8k_eval.py"),
     ],