minor: update gsm8k threshold (#2125)

This commit is contained in:
Yineng Zhang
2024-11-22 19:23:58 +08:00
committed by GitHub
parent 2369e88209
commit 4f8c3aeafc
3 changed files with 45 additions and 35 deletions

View File

@@ -27,14 +27,14 @@ jobs:
 bash scripts/ci_install_dependency.sh
 pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
-- name: Test human eval
-timeout-minutes: 120
-run: |
-cd test/srt
-python3 test_nightly_human_eval.py
 - name: Test gsm8k
 timeout-minutes: 120
 run: |
 cd test/srt
 python3 test_nightly_gsm8k_eval.py
+- name: Test human eval
+timeout-minutes: 120
+run: |
+cd test/srt
+python3 test_nightly_human_eval.py

View File

@@ -439,18 +439,22 @@ def popen_launch_server(
 process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
 start_time = time.time()
-while time.time() - start_time < timeout:
-try:
-headers = {
-"Content-Type": "application/json; charset=utf-8",
-"Authorization": f"Bearer {api_key}",
-}
-response = requests.get(f"{base_url}/health_generate", headers=headers)
-if response.status_code == 200:
-return process
-except requests.RequestException:
-pass
-time.sleep(10)
+with requests.Session() as session:
+while time.time() - start_time < timeout:
+try:
+headers = {
+"Content-Type": "application/json; charset=utf-8",
+"Authorization": f"Bearer {api_key}",
+}
+response = session.get(
+f"{base_url}/health_generate",
+headers=headers,
+)
+if response.status_code == 200:
+return process
+except requests.RequestException:
+pass
+time.sleep(10)
 raise TimeoutError("Server failed to start within the timeout period.")

View File

@@ -1,6 +1,8 @@
 import json
 import os
+import subprocess
 import unittest
+import warnings
 from datetime import datetime
 from types import SimpleNamespace
@@ -18,23 +20,23 @@ from sglang.test.test_utils import (
 )
 MODEL_SCORE_THRESHOLDS = {
-"meta-llama/Llama-3.1-8B-Instruct": 0.8316,
+"meta-llama/Llama-3.1-8B-Instruct": 0.83,
-"mistralai/Mistral-7B-Instruct-v0.3": 0.5861,
+"mistralai/Mistral-7B-Instruct-v0.3": 0.58,
-"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.8672,
+"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84,
-"google/gemma-2-27b-it": 0.9227,
+"google/gemma-2-27b-it": 0.92,
-"meta-llama/Llama-3.1-70B-Instruct": 0.9623,
+"meta-llama/Llama-3.1-70B-Instruct": 0.96,
-"mistralai/Mixtral-8x7B-Instruct-v0.1": 0.6415,
+"mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
-"Qwen/Qwen2-57B-A14B-Instruct": 0.8791,
+"Qwen/Qwen2-57B-A14B-Instruct": 0.87,
-"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.8672,
+"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84,
-"neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.5544,
+"neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
-"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.8356,
+"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83,
-"neuralmagic/gemma-2-2b-it-FP8": 0.6059,
+"neuralmagic/gemma-2-2b-it-FP8": 0.60,
-"neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.9504,
+"neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95,
-"neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.6138,
+"neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61,
-"neuralmagic/Qwen2-72B-Instruct-FP8": 0.9504,
+"neuralmagic/Qwen2-72B-Instruct-FP8": 0.95,
-"neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.8197,
+"neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
-"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.8395,
+"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84,
-"hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.8435,
+"hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.84,
 }
@@ -65,6 +67,7 @@ def launch_server(base_url, model, is_fp8, is_tp2):
 base_url,
 timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
 other_args=other_args,
+return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL),
 )
 return process
@@ -132,6 +135,9 @@ class TestEvalAccuracyLarge(unittest.TestCase):
 kill_child_process(self.process.pid, include_self=True)
 def test_mgsm_en_all_models(self):
+warnings.filterwarnings(
+"ignore", category=ResourceWarning, message="unclosed.*socket"
+)
 is_first = True
 all_results = []