minor: update gsm8k threshold (#2125)

2024-11-22 19:23:58 +08:00
parent 2369e88209
commit 4f8c3aeafc
3 changed files with 45 additions and 35 deletions
--- a/.github/workflows/nightly-eval.yml
+++ b/.github/workflows/nightly-eval.yml
@@ -27,14 +27,14 @@ jobs:
          bash scripts/ci_install_dependency.sh
          pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"

-      - name: Test human eval
-        timeout-minutes: 120
-        run: |
-          cd test/srt
-          python3 test_nightly_human_eval.py
-
      - name: Test gsm8k
        timeout-minutes: 120
        run: |
          cd test/srt
          python3 test_nightly_gsm8k_eval.py
+
+      - name: Test human eval
+        timeout-minutes: 120
+        run: |
+          cd test/srt
+          python3 test_nightly_human_eval.py
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -439,18 +439,22 @@ def popen_launch_server(
        process = subprocess.Popen(command, stdout=None, stderr=None, env=env)

    start_time = time.time()
-    while time.time() - start_time < timeout:
-        try:
-            headers = {
-                "Content-Type": "application/json; charset=utf-8",
-                "Authorization": f"Bearer {api_key}",
-            }
-            response = requests.get(f"{base_url}/health_generate", headers=headers)
-            if response.status_code == 200:
-                return process
-        except requests.RequestException:
-            pass
-        time.sleep(10)
+    with requests.Session() as session:
+        while time.time() - start_time < timeout:
+            try:
+                headers = {
+                    "Content-Type": "application/json; charset=utf-8",
+                    "Authorization": f"Bearer {api_key}",
+                }
+                response = session.get(
+                    f"{base_url}/health_generate",
+                    headers=headers,
+                )
+                if response.status_code == 200:
+                    return process
+            except requests.RequestException:
+                pass
+            time.sleep(10)
    raise TimeoutError("Server failed to start within the timeout period.")


--- a/test/srt/test_nightly_gsm8k_eval.py
+++ b/test/srt/test_nightly_gsm8k_eval.py
@@ -1,6 +1,8 @@
 import json
 import os
+import subprocess
 import unittest
+import warnings
 from datetime import datetime
 from types import SimpleNamespace

@@ -18,23 +20,23 @@ from sglang.test.test_utils import (
 )

 MODEL_SCORE_THRESHOLDS = {
-    "meta-llama/Llama-3.1-8B-Instruct": 0.8316,
-    "mistralai/Mistral-7B-Instruct-v0.3": 0.5861,
-    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.8672,
-    "google/gemma-2-27b-it": 0.9227,
-    "meta-llama/Llama-3.1-70B-Instruct": 0.9623,
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.6415,
-    "Qwen/Qwen2-57B-A14B-Instruct": 0.8791,
-    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.8672,
-    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.5544,
-    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.8356,
-    "neuralmagic/gemma-2-2b-it-FP8": 0.6059,
-    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.9504,
-    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.6138,
-    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.9504,
-    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.8197,
-    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.8395,
-    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.8435,
+    "meta-llama/Llama-3.1-8B-Instruct": 0.83,
+    "mistralai/Mistral-7B-Instruct-v0.3": 0.58,
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84,
+    "google/gemma-2-27b-it": 0.92,
+    "meta-llama/Llama-3.1-70B-Instruct": 0.96,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
+    "Qwen/Qwen2-57B-A14B-Instruct": 0.87,
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84,
+    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
+    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83,
+    "neuralmagic/gemma-2-2b-it-FP8": 0.60,
+    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95,
+    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61,
+    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.95,
+    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84,
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.84,
 }


@@ -65,6 +67,7 @@ def launch_server(base_url, model, is_fp8, is_tp2):
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_args,
+        return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL),
    )
    return process

@@ -132,6 +135,9 @@ class TestEvalAccuracyLarge(unittest.TestCase):
            kill_child_process(self.process.pid, include_self=True)

    def test_mgsm_en_all_models(self):
+        warnings.filterwarnings(
+            "ignore", category=ResourceWarning, message="unclosed.*socket"
+        )
        is_first = True
        all_results = []