feat: update nightly gsm8k eval (#1304)
This commit is contained in:
45
.github/workflows/nightly-eval.yml
vendored
45
.github/workflows/nightly-eval.yml
vendored
@@ -15,9 +15,9 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
meta-llama-31-8b-instruct:
|
||||
nightly-eval-2-gpu:
|
||||
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
|
||||
runs-on: 1-gpu-runner
|
||||
runs-on: 2-gpu-runner
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
@@ -25,42 +25,11 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install --upgrade pip
|
||||
pip install -e "python[dev]"
|
||||
pip install -e "python[all]"
|
||||
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
|
||||
git clone https://github.com/EleutherAI/lm-evaluation-harness
|
||||
pushd lm-evaluation-harness
|
||||
pip install -e .
|
||||
pip install lm_eval[api]
|
||||
popd
|
||||
|
||||
- name: Run eval
|
||||
timeout-minutes: 20
|
||||
- name: Nightly gsm8k Accuracy
|
||||
timeout-minutes: 60
|
||||
run: |
|
||||
python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --disable-radix-cache &
|
||||
|
||||
echo "Waiting for server to start..."
|
||||
for i in {1..120}; do
|
||||
if curl -s http://127.0.0.1:30000/health; then
|
||||
echo "Server is up!"
|
||||
break
|
||||
fi
|
||||
if [ $i -eq 120 ]; then
|
||||
echo "Server failed to start within 120 seconds"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
|
||||
lm_eval --model local-completions --tasks gsm8k --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,base_url=http://127.0.0.1:30000/v1/completions,num_concurrent=128,max_retries=3,tokenized_requests=False
|
||||
|
||||
echo "Stopping server..."
|
||||
kill -9 $(ps aux | grep sglang | grep Meta-Llama-3.1-8B-Instruct | grep -v grep | awk '{print $2}')
|
||||
|
||||
finish:
|
||||
needs: [
|
||||
meta-llama-31-8b-instruct
|
||||
]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Finish
|
||||
run: echo "This is an empty step to ensure that all jobs are completed."
|
||||
cd test/srt
|
||||
python3 test_nightly_gsm8k_eval.py
|
||||
|
||||
@@ -23,6 +23,10 @@ from sglang.utils import get_exception_traceback
|
||||
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
|
||||
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
|
||||
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
|
||||
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
|
||||
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
|
||||
|
||||
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
|
||||
DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
|
||||
|
||||
89
test/srt/test_nightly_gsm8k_eval.py
Normal file
89
test/srt/test_nightly_gsm8k_eval.py
Normal file
@@ -0,0 +1,89 @@
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
from sglang.srt.utils import kill_child_process
|
||||
from sglang.test.run_eval import run_eval
|
||||
from sglang.test.test_utils import (
|
||||
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1,
|
||||
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
|
||||
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
|
||||
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
|
||||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
|
||||
DEFAULT_URL_FOR_TEST,
|
||||
popen_launch_server,
|
||||
)
|
||||
|
||||
|
||||
def parse_models(model_string):
    """Split a comma-separated model list into clean names, dropping blanks."""
    names = []
    for raw_name in model_string.split(","):
        cleaned = raw_name.strip()
        if cleaned:
            names.append(cleaned)
    return names
|
||||
|
||||
|
||||
class TestEvalAccuracyLarge(unittest.TestCase):
    """Nightly gsm8k (mgsm_en) accuracy sweep.

    For every model in each (models, is_fp8, is_tp2) group, launch an sglang
    server, run the mgsm_en eval against it, and require a loose minimum
    score.  The server is torn down between models so only one runs at a time.
    """

    @classmethod
    def setUpClass(cls):
        # Each entry: (models, is_fp8, is_tp2).  The model constants are
        # comma-separated strings defined in sglang.test.test_utils.
        cls.model_groups = [
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
        ]
        cls.base_url = DEFAULT_URL_FOR_TEST

    def setUp(self):
        # Handle of the currently-running server process, if any.
        self.process = None

    def tearDown(self):
        # Kill the launched server process tree, then reset the handle so a
        # second call (manual + framework tearDown) is a no-op instead of
        # signalling an already-dead pid.
        if self.process:
            kill_child_process(self.process.pid)
            self.process = None

    def launch_server(self, model, is_fp8, is_tp2):
        """Start an sglang server for *model* with quantization/TP flags.

        is_fp8 selects per-model fp8 flags (some checkpoints ship
        pre-quantized compressed-tensors weights; one model has a known
        kv-cache bug); is_tp2 enables tensor parallelism of 2.
        """
        other_args = ["--log-level-http", "warning", "--trust-remote-code"]
        if is_fp8:
            if "Llama-3" in model or "gemma-2" in model:
                # compressed-tensors checkpoints: weights are already fp8,
                # only the KV cache dtype needs to be set.
                other_args.extend(["--kv-cache-dtype", "fp8_e5m2"])
            elif "Qwen2-72B-Instruct-FP8" in model:
                # bug: fp8 KV cache is broken for this model, quantize only.
                other_args.extend(["--quantization", "fp8"])
            else:
                other_args.extend(
                    ["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"]
                )
        if is_tp2:
            other_args.extend(["--tp", "2"])
        if "DeepSeek" in model:
            other_args.append("--enable-mla")

        self.process = popen_launch_server(
            model,
            self.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

    def test_mgsm_en_all_models(self):
        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                with self.subTest(model=model):
                    try:
                        self.launch_server(model, is_fp8, is_tp2)

                        args = SimpleNamespace(
                            base_url=self.base_url,
                            model=model,
                            eval_name="mgsm_en",
                            num_examples=None,
                            num_threads=1024,
                        )

                        metrics = run_eval(args)
                        print(
                            f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
                        )
                        # Loose threshold; unittest assertion (unlike a bare
                        # `assert`) still runs under `python -O`.
                        self.assertGreater(
                            metrics["score"], 0.5, f"score={metrics['score']} <= 0.5"
                        )
                    finally:
                        # Stop the server even when the eval raises, so the
                        # next model gets a clean port and free GPU memory.
                        self.tearDown()
|
||||
|
||||
|
||||
# Allow running the nightly suite directly: python3 test_nightly_gsm8k_eval.py
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user