diff --git a/.github/workflows/nightly-test-amd.yml b/.github/workflows/nightly-test-amd.yml
new file mode 100644
index 000000000..fd3ce4225
--- /dev/null
+++ b/.github/workflows/nightly-test-amd.yml
@@ -0,0 +1,54 @@
+name: Nightly Test (AMD)
+
+on:
+  schedule:
+    - cron: '0 0 * * *'
+  push:
+    branches:
+      - main
+    paths:
+      - "python/sglang/version.py"
+  workflow_dispatch:
+
+concurrency:
+  group: nightly-test-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  nightly-test:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: linux-mi300-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup docker
+        run: |
+          # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
+          if [ -f "/etc/podinfo/gha-render-devices" ]; then
+            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
+          else
+            DEVICE_FLAG="--device /dev/dri"
+          fi
+          touch github_summary.md
+          docker pull ghcr.io/saienduri/sglang-aiter-v0.1.1:428
+          docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
+            -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
+            --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
+            -w /sglang-checkout --name ci_sglang \
+            ghcr.io/saienduri/sglang-aiter-v0.1.1:428
+
+      - name: Install dependencies
+        run: |
+          docker exec ci_sglang pip install --upgrade pip
+          docker exec ci_sglang pip uninstall sgl-kernel -y || true
+          docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
+          docker exec ci_sglang pip install -e "python[dev_hip]"
+
+          docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
+          docker exec -w /human-eval ci_sglang pip install -e .
+
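+      # The suite writes its Markdown summary to the bind-mounted
+      # github_summary.md (set as GITHUB_STEP_SUMMARY inside the container);
+      # the echo below forwards it to the host job summary.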
+      - name: Nightly Test
+        run: |
+          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" ci_sglang python3 run_suite.py --suite nightly-amd --timeout-per-file 7200
+          echo "$(cat github_summary.md)" >> $GITHUB_STEP_SUMMARY
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 9ae90b6d4..728739f75 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -100,6 +100,9 @@ suites = {
     "nightly": [
         TestFile("test_nightly_gsm8k_eval.py"),
     ],
+    "nightly-amd": [
+        TestFile("test_nightly_gsm8k_eval_amd.py"),
+    ],
     "vllm_dependency_test": [
         TestFile("test_vllm_dependency.py"),
         TestFile("test_awq.py"),
diff --git a/test/srt/test_nightly_gsm8k_eval_amd.py b/test/srt/test_nightly_gsm8k_eval_amd.py
new file mode 100644
index 000000000..bdb6babbd
--- /dev/null
+++ b/test/srt/test_nightly_gsm8k_eval_amd.py
@@ -0,0 +1,192 @@
+import json
+import os
+import unittest
+import warnings
+from datetime import datetime
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1,
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    is_in_ci,
+    popen_launch_server,
+    write_github_step_summary,
+)
+
+MODEL_SCORE_THRESHOLDS = {
+    "meta-llama/Llama-3.1-8B-Instruct": 0.82,
+    "mistralai/Mistral-7B-Instruct-v0.3": 0.56,
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
+    "meta-llama/Llama-3.1-70B-Instruct": 0.95,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
+    "Qwen/Qwen2-57B-A14B-Instruct": 0.86,
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.81,
+    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
+    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.94,
+    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.94,
+    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
+}
+
+# Models currently failing on AMD MI300X.
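+# They are filtered out of the default nightly model lists by
+# remove_failing_models() below.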
+failing_models = {
+    "google/gemma-2-27b-it",
+    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8",
+    "neuralmagic/gemma-2-2b-it-FP8",
+    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8",
+}
+
+
+def remove_failing_models(model_str):
+    models = model_str.split(",")
+    filtered = [m for m in models if m not in failing_models]
+    return ",".join(filtered)
+
+
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = remove_failing_models(
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
+)
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = remove_failing_models(
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
+)
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = remove_failing_models(
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1
+)
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = remove_failing_models(
+    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2
+)
+
+
+def parse_models(model_string):
+    return [model.strip() for model in model_string.split(",") if model.strip()]
+
+
+def popen_launch_server_wrapper(base_url, model, is_tp2):
+    other_args = ["--log-level-http", "warning", "--trust-remote-code"]
+    if is_tp2:
+        other_args.extend(["--tp", "2"])
+
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+    )
+    return process
+
+
+def write_results_to_json(model, metrics, mode="a"):
+    result = {
+        "timestamp": datetime.now().isoformat(),
+        "model": model,
+        "metrics": metrics,
+        "score": metrics["score"],
+    }
+
+    existing_results = []
+    if mode == "a" and os.path.exists("results.json"):
+        try:
+            with open("results.json", "r") as f:
+                existing_results = json.load(f)
+        except json.JSONDecodeError:
+            existing_results = []
+
+    if isinstance(existing_results, list):
+        existing_results.append(result)
+    else:
+        existing_results = [result]
+
+    with open("results.json", "w") as f:
+        json.dump(existing_results, f, indent=2)
+
+
+def check_model_scores(results):
+    failed_models = []
+    summary = "| model | score | threshold |\n"
+    summary += "| ----- | ----- | --------- |\n"
+
+    for model, score in results:
+        threshold = MODEL_SCORE_THRESHOLDS.get(model)
+        if threshold is None:
+            print(f"Warning: No threshold defined for model {model}")
+            continue
+
+        if score < threshold:
+            failed_models.append(
+                f"\nScore Check Failed: {model}\n"
+                f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})"
+            )
+
+        line = f"| {model} | {score} | {threshold} |\n"
+        summary += line
+
+    print(summary)
+
+    if is_in_ci():
+        write_github_step_summary(f"### TestNightlyGsm8KEval\n{summary}")
+
+    if failed_models:
+        raise AssertionError("\n".join(failed_models))
+
+
+# Do not use `CustomTestCase`: `test_mgsm_en_all_models` must not be retried.
+class TestNightlyGsm8KEval(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model_groups = [
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
+            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
+        ]
+        cls.base_url = DEFAULT_URL_FOR_TEST
+
+    def test_mgsm_en_all_models(self):
+        warnings.filterwarnings(
+            "ignore", category=ResourceWarning, message="unclosed.*socket"
+        )
+        is_first = True
+        all_results = []
+
+        for model_group, is_fp8, is_tp2 in self.model_groups:
+            for model in model_group:
+                with self.subTest(model=model):
+                    process = popen_launch_server_wrapper(self.base_url, model, is_tp2)
+
+                    args = SimpleNamespace(
+                        base_url=self.base_url,
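+                        # mgsm_en is the English split of the multilingual
+                        # GSM8K eval; num_examples=None uses its default set.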
+                        model=model,
+                        eval_name="mgsm_en",
+                        num_examples=None,
+                        num_threads=1024,
+                    )
+
+                    metrics = run_eval(args)
+                    print(
+                        f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
+                    )
+
+                    write_results_to_json(model, metrics, "w" if is_first else "a")
+                    is_first = False
+
+                    all_results.append((model, metrics["score"]))
+                    kill_process_tree(process.pid)
+
+        try:
+            with open("results.json", "r") as f:
+                print("\nFinal Results from results.json:")
+                print(json.dumps(json.load(f), indent=2))
+        except Exception as e:
+            print(f"Error reading results.json: {e}")
+
+        # Check all scores after collecting all results
+        check_model_scores(all_results)
+
+
+if __name__ == "__main__":
+    unittest.main()