From 2561ed012ce10e109ac888f7e9e7ffe44ccb4a94 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 3 Sep 2024 01:18:41 +1000 Subject: [PATCH] feat: update nightly gsm8k eval (#1304) --- .github/workflows/nightly-eval.yml | 45 +++------------ python/sglang/test/test_utils.py | 4 ++ test/srt/test_nightly_gsm8k_eval.py | 89 +++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 38 deletions(-) create mode 100644 test/srt/test_nightly_gsm8k_eval.py diff --git a/.github/workflows/nightly-eval.yml b/.github/workflows/nightly-eval.yml index c1a7e9c17..4ac911c9a 100644 --- a/.github/workflows/nightly-eval.yml +++ b/.github/workflows/nightly-eval.yml @@ -15,9 +15,9 @@ concurrency: cancel-in-progress: true jobs: - meta-llama-31-8b-instruct: + nightly-eval-2-gpu: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: 1-gpu-runner + runs-on: 2-gpu-runner steps: - name: Checkout code uses: actions/checkout@v3 @@ -25,42 +25,11 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip - pip install -e "python[dev]" + pip install -e "python[all]" pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall - git clone https://github.com/EleutherAI/lm-evaluation-harness - pushd lm-evaluation-harness - pip install -e . - pip install lm_eval[api] - popd - - name: Run eval - timeout-minutes: 20 + - name: Nightly gsm8k Accuracy + timeout-minutes: 60 run: | - python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --disable-radix-cache & - - echo "Waiting for server to start..." - for i in {1..120}; do - if curl -s http://127.0.0.1:30000/health; then - echo "Server is up!" 
- break - fi - if [ $i -eq 120 ]; then - echo "Server failed to start within 120 seconds" - exit 1 - fi - sleep 1 - done - - lm_eval --model local-completions --tasks gsm8k --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,base_url=http://127.0.0.1:30000/v1/completions,num_concurrent=128,max_retries=3,tokenized_requests=False - - echo "Stopping server..." - kill -9 $(ps aux | grep sglang | grep Meta-Llama-3.1-8B-Instruct | grep -v grep | awk '{print $2}') - - finish: - needs: [ - meta-llama-31-8b-instruct - ] - runs-on: ubuntu-latest - steps: - - name: Finish - run: echo "This is an empty step to ensure that all jobs are completed." + cd test/srt + python3 test_nightly_gsm8k_eval.py diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index d6a1792b8..1b9b63e88 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -23,6 +23,10 @@ from sglang.utils import get_exception_traceback DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct" DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1" DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600 +DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it" +DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct" +DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8" +DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8" if os.getenv("SGLANG_IS_IN_CI", "false") == "true": DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157 diff 
--git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py new file mode 100644 index 000000000..35e7d6eb7 --- /dev/null +++ b/test/srt/test_nightly_gsm8k_eval.py @@ -0,0 +1,89 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_child_process +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1, + DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2, + DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1, + DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +def parse_models(model_string): + return [model.strip() for model in model_string.split(",") if model.strip()] + + +class TestEvalAccuracyLarge(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model_groups = [ + (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False), + (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True), + (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False), + (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True), + ] + cls.base_url = DEFAULT_URL_FOR_TEST + + def setUp(self): + self.process = None + + def tearDown(self): + if self.process: + kill_child_process(self.process.pid) + + def launch_server(self, model, is_fp8, is_tp2): + other_args = ["--log-level-http", "warning", "--trust-remote-code"] + if is_fp8: + if "Llama-3" in model or "gemma-2" in model: + # compressed-tensors + other_args.extend(["--kv-cache-dtype", "fp8_e5m2"]) + elif "Qwen2-72B-Instruct-FP8" in model: + # bug + other_args.extend(["--quantization", "fp8"]) + else: + other_args.extend( + ["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"] + ) + if is_tp2: + other_args.extend(["--tp", "2"]) + if "DeepSeek" in model: + other_args.append("--enable-mla") + + self.process = popen_launch_server( + model, + self.base_url, + 
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, + ) + + def test_mgsm_en_all_models(self): + for model_group, is_fp8, is_tp2 in self.model_groups: + for model in model_group: + with self.subTest(model=model): + self.launch_server(model, is_fp8, is_tp2) + + args = SimpleNamespace( + base_url=self.base_url, + model=model, + eval_name="mgsm_en", + num_examples=None, + num_threads=1024, + ) + + metrics = run_eval(args) + print( + f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n" + ) + # loose threshold + assert metrics["score"] > 0.5, f"score={metrics['score']} <= 0.5" + + self.tearDown() + + +if __name__ == "__main__": + unittest.main()