diff --git a/.github/workflows/accuracy-test.yml b/.github/workflows/accuracy-test.yml new file mode 100644 index 000000000..9c8e7bfeb --- /dev/null +++ b/.github/workflows/accuracy-test.yml @@ -0,0 +1,45 @@ +name: Accuracy Test + +on: + push: + branches: [ main ] + paths: + - "python/sglang/**" + - "test/**" + pull_request: + branches: [ main ] + paths: + - "python/sglang/**" + - "test/**" + workflow_dispatch: + +concurrency: + group: accuracy-test-${{ github.ref }} + cancel-in-progress: true + +jobs: + accuracy-test: + if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' + runs-on: accuracy + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install dependencies + run: | + source $HOME/venv/bin/activate + echo "$HOME/venv/bin" >> $GITHUB_PATH + + pip install --upgrade pip + pip install -e "python[all]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + + git clone https://github.com/merrymercy/human-eval.git + cd human-eval + pip install -e . + + - name: Evaluate Accuracy + run: | + cd test/srt + python3 test_eval_accuracy_large.py diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 78ac4d9ec..336f6a14f 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -20,7 +20,7 @@ concurrency: jobs: e2e-test: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: bench + runs-on: e2e steps: - name: Checkout code diff --git a/README.md b/README.md index f81593ef6..1d7ff009b 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance. - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size. 
``` -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 2048 +python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096 ``` - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port. ``` diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index e3a2ad0a2..0f9c88223 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -669,19 +669,20 @@ async def benchmark( "backend": args.backend, "dataset_name": args.dataset_name, "request_rate": request_rate, - "total_input": metrics.total_input, - "total_output": metrics.total_output, - "total_output_retokenized": metrics.total_output_retokenized, - "mean_e2e_latency": metrics.mean_e2e_latency_ms, - "median_e2e_latency": metrics.median_e2e_latency_ms, - "median_ttft": metrics.median_ttft_ms, - "median_itl": metrics.median_itl_ms, - "output_token_throughput": metrics.output_throughput, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "total_output_tokens_retokenized": metrics.total_output_retokenized, + "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms, + "median_e2e_latency_ms": metrics.median_e2e_latency_ms, + "median_ttft_ms": metrics.median_ttft_ms, + "median_itl_ms": metrics.median_itl_ms, + "output_throughput": metrics.output_throughput, "sharegpt_output_len": args.sharegpt_output_len, "random_input_len": args.random_input_len, "random_output_len": args.random_output_len, "random_range_ratio": args.random_range_ratio, - "benchmark_duration": benchmark_duration, + "duration": benchmark_duration, + "completed": metrics.completed, } else: print(f"Error running benchmark for request rate: {request_rate}") diff --git 
a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index 4c757737e..f14885263 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -64,8 +64,7 @@ from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) -# TODO: Rename "CI" to "SGLANG_IS_IN_CI". -crash_on_warning = os.getenv("CI", "false") == "true" +crash_on_warning = os.getenv("SGLANG_IS_IN_CI", "false") == "true" class ModelTpServer: diff --git a/python/sglang/test/run_eval.py b/python/sglang/test/run_eval.py index 6c1f284b1..3d13d475b 100644 --- a/python/sglang/test/run_eval.py +++ b/python/sglang/test/run_eval.py @@ -39,6 +39,14 @@ def run_eval(args): eval_obj = MathEval( filename, equality_checker, args.num_examples, args.num_threads ) + elif args.eval_name == "mgsm": + from sglang.test.simple_eval_mgsm import MGSMEval + + eval_obj = MGSMEval(args.num_examples, args.num_threads) + elif args.eval_name == "mgsm_en": + from sglang.test.simple_eval_mgsm import MGSMEval + + eval_obj = MGSMEval(args.num_examples, args.num_threads, languages=["en"]) elif args.eval_name == "gpqa": from sglang.test.simple_eval_gpqa import GPQAEval diff --git a/python/sglang/test/simple_eval_mgsm.py b/python/sglang/test/simple_eval_mgsm.py new file mode 100644 index 000000000..ce00a1ac7 --- /dev/null +++ b/python/sglang/test/simple_eval_mgsm.py @@ -0,0 +1,203 @@ +# Adapted from https://github.com/openai/simple-evals/ + +""" +MGSM: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems. 
+Language Models are Multilingual Chain-of-Thought Reasoners +Freda Shi, Mirac Suzgun, Markus Freitag, Xuezhi Wang, Suraj Srivats, Soroush Vosoughi, Hyung Won Chung, Yi Tay, Sebastian Ruder, Denny Zhou, Dipanjan Das, Jason Wei +https://arxiv.org/abs/2210.03057 reference: https://github.com/google-research/url-nlp +""" + +import re +import urllib +from typing import Optional + +from sglang.test import simple_eval_common as common +from sglang.test.simple_eval_common import ( + HTML_JINJA, + Eval, + EvalResult, + SamplerBase, + SingleEvalResult, +) + +ALL_LANGUAGES = ["bn", "de", "en", "es", "fr", "ja", "ru", "sw", "te", "th", "zh"] +LATIN_LANGUAGES = ["de", "en", "es", "fr", "sw"] +NON_LATIN_LANGUAGES = ["bn", "ja", "ru", "te", "th", "zh"] + +LANG_TO_FPATH = { + "bn": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_bn.tsv", + "de": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_de.tsv", + "en": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_en.tsv", + "es": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_es.tsv", + "fr": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_fr.tsv", + "ja": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_ja.tsv", + "ru": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_ru.tsv", + "sw": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_sw.tsv", + "te": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_te.tsv", + "th": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_th.tsv", + "zh": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_zh.tsv", +} +LANG_TO_INSTRUCTIONS = { + "en": """Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of "Answer:". Do not add anything other than the integer answer after "Answer:". 
+ +{input}""", + "bn": """এই গণিতের সমস্যাটি সমাধান করুন। চূড়ান্ত উত্তর দেওয়ার আগে যুক্তিসম্পন্ন পদক্ষেপ প্রদান করুন। চূড়ান্ত উত্তরটি একক সংখ্যা হিসাবে "উত্তর:" এর পরে শেষ লাইনে দিন। "উত্তর:" এর পরে অন্য কিছু যুক্ত করবেন না।. + +{input}""", + "de": """Löse dieses Mathematikproblem. Gib die Schritte zur Begründung an, bevor du die endgültige Antwort in der letzten Zeile alleine im Format "Antwort:" gibst. Füge nichts anderes als die ganzzahlige Antwort nach "Antwort:" hinzu. + +{input}""", + "es": """Resuelve este problema matemático. Proporciona los pasos de razonamiento antes de dar la respuesta final en la última línea por sí misma en el formato de "Respuesta:". No añadas nada más que la respuesta entera después de "Respuesta:". + +{input}""", + "fr": """Résolvez ce problème de mathématiques. Donnez les étapes de raisonnement avant de fournir la réponse finale sur la dernière ligne elle-même dans le format de "Réponse:". N'ajoutez rien d'autre que la réponse entière après "Réponse:". + +{input}""", + "ja": """の数学の問題を解いてください。最終的な答えを出す前に、解答の推論過程を記述してください。そして最後の行には "答え:" の形式で答えを記述し、その後には整数の答え以外何も追加しないでください。 + +{input}""", + "ru": """Решите эту математическую задачу. Объясните шаги рассуждения перед тем, как дать окончательный ответ в последней строке сам по себе в формате "Ответ:". Не добавляйте ничего, кроме целочисленного ответа после "Ответ:". + +{input}""", + "sw": """Suluhisha tatizo hili la hesabu. Toa hatua za mantiki kabla ya kutoa jibu la mwisho kwenye mstari wa mwisho peke yake katika muundo wa "Jibu:". Usiongeze chochote kingine isipokuwa jibu la integer baada ya "Jibu:". + +{input}""", + "te": """ఈ గణిత సమస్యను పరిష్కరించండి. చివరి సమాధానాన్ని ఇవ్వదానికి ముందు తర్కాత్మక అదుగులను ఇవ్వండి. చివరి పంక్తిలో మాత్రమే 'సమాధానం:' అనే ఆకారంలో చివరి సమాధానాద్ని ఇవ్వండి సమాధానం: తర్వాత పూర్ణాంక సమాధానానికి తప్పించి ఎదేనా చేర్చవద్దు. 
+ +{input}""", +    "th": """แก้ปัญหาคณิตศาสตร์นี้ ให้ให้ขั้นตอนการใช้เหตุผลก่อนที่จะให้คำตอบสุดท้ายในบรรทัดสุดท้ายโดยอยู่ในรูปแบบ "คำตอบ:" ไม่ควรเพิ่มอะไรนอกจากคำตอบที่เป็นจำนวนเต็มหลังจาก "คำตอบ:" + +{input}""", +    "zh": """解决这个数学问题。在最后一行给出答案前,请提供推理步骤。最后一行应该以 "答案: " 的形式独立给出答案。在 "答案:" 后不要添加除整数答案之外的任何内容。 + +{input}""", +} + +LANG_TO_ANSWER_PREFIX = { +    "en": "Answer", +    "bn": "উত্তর", +    "de": "Antwort", +    "es": "Respuesta", +    "fr": "Réponse", +    "ja": "答え", +    "ru": "Ответ", +    "sw": "Jibu", +    "te": "సమాధానం", +    "th": "คำตอบ", +    "zh": "答案", +} + + +def parse_answer(answer: str, answer_prefix: str) -> str: +    if answer_prefix not in answer: +        return "" + +    answer_text = answer.split(answer_prefix)[-1].strip() + +    # find all the numbers (including decimals) in the string +    numbers = re.findall(r"\d+\.?\d*", answer_text.replace(",", "")) + +    # return the last number (removing trailing decimal point if present), +    # or an empty string if there were no numbers +    return numbers[-1].rstrip(".") if numbers else "" + + +def score_mgsm(target: str, prediction: str) -> bool: +    if "." in prediction: +        prediction = prediction.rstrip("0").rstrip(".") + +    target = target.replace(",", "") +    prediction = prediction.replace(",", "") + +    return target == prediction + + +def get_lang_examples(lang: str) -> list[dict[str, str]]: +    fpath = LANG_TO_FPATH[lang] +    examples = [] +    with urllib.request.urlopen(fpath) as f: +        for line in f.read().decode("utf-8").splitlines(): +            inputs, targets = line.strip().split("\t") +            if "." 
in targets: +                raise ValueError(f"targets {targets} contains a decimal point.") +            # targets = int(targets.replace(",", "")) +            examples.append({"inputs": inputs, "targets": targets, "lang": lang}) +    return examples + + +def get_all_examples() -> list[dict[str, str]]: +    examples = [] +    for lang in ALL_LANGUAGES: +        if lang != "en": +            continue +        examples += get_lang_examples(lang) +    return examples + + +class MGSMEval(Eval): +    def __init__( +        self, +        num_examples_per_lang: int = 250,  # restrict to a subset of the data for debugging +        num_threads: int = 64, +        languages: Optional[list[str]] = ALL_LANGUAGES, +    ): +        if languages is None: +            languages = ALL_LANGUAGES +        else: +            for language in languages: +                if language not in ALL_LANGUAGES: +                    raise ValueError( +                        f"language {language} is not a valid language. " +                        f"It should be one in {ALL_LANGUAGES}" +                    ) +        self._languages = languages +        self._num_examples_per_lang = num_examples_per_lang +        self._num_threads = num_threads + +        examples = [] +        for lang in self._languages: +            lang_examples = get_lang_examples(lang) +            examples.extend(lang_examples[: self._num_examples_per_lang]) +        self.examples = examples + +    def __call__(self, sampler: SamplerBase) -> EvalResult: +        def fn(example: dict[str, str]): +            language = example["lang"] +            latin_language = ( +                "group_latin" if language in LATIN_LANGUAGES else "group_non_latin" +            ) +            correct_answer = example["targets"] +            instruction = LANG_TO_INSTRUCTIONS[language] +            prompt_messages = [ +                sampler._pack_message( +                    content=instruction.format(input=example["inputs"]), role="user" +                ) +            ] +            try: +                response_text = sampler(prompt_messages) +            except Exception as e: +                response_text = "" + +            answer_prefix = LANG_TO_ANSWER_PREFIX[language] +            extracted_answer = parse_answer(response_text, answer_prefix) + +            score = score_mgsm(correct_answer, extracted_answer) +            html = common.jinja_env.from_string(HTML_JINJA).render( +                prompt_messages=prompt_messages, +                next_message=dict(content=response_text, 
role="assistant"), + score=score, + correct_answer=correct_answer, + extracted_answer=extracted_answer, + ) + convo = prompt_messages + [dict(content=response_text, role="assistant")] + return SingleEvalResult( + html=html, + score=score, + convo=convo, + metrics={language: score, latin_language: score}, + ) + + results = common.map_with_progress( + fn, self.examples, num_threads=self._num_threads + ) + return common.aggregate_results(results, default_stats=("mean", "std")) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 08122389f..c99b6a60b 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -7,7 +7,7 @@ suites = { "minimal": [ "test_chunked_prefill.py", "test_embedding_openai_server.py", - "test_eval_accuracy.py", + "test_eval_accuracy_mini.py", "test_large_max_new_tokens.py", "test_openai_server.py", "test_skip_tokenizer_init.py", diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py index 3a9423bc5..5b2bb4aaa 100644 --- a/test/srt/test_chunked_prefill.py +++ b/test/srt/test_chunked_prefill.py @@ -10,34 +10,41 @@ from sglang.test.test_utils import ( ) -class TestAccuracy(unittest.TestCase): +class TestChunkedPrefill(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, + def run_mmlu(self, disable_radix_cache): + other_args = ["--chunked-prefill-size", "32"] + if disable_radix_cache: + other_args += ["--disable-radix-cache"] + + model = DEFAULT_MODEL_NAME_FOR_TEST + base_url = DEFAULT_URL_FOR_TEST + process = popen_launch_server( + model, + base_url, timeout=300, - other_args=["--chunked-prefill-size", "32"], + other_args=other_args, ) - @classmethod - def tearDownClass(cls): - kill_child_process(cls.process.pid) - - def test_mmlu(self): args = SimpleNamespace( - base_url=self.base_url, - model=self.model, + base_url=base_url, + model=model, 
eval_name="mmlu", - num_examples=20, - num_threads=20, + num_examples=32, + num_threads=32, ) - metrics = run_eval(args) - assert metrics["score"] >= 0.5 + try: + metrics = run_eval(args) + assert metrics["score"] >= 0.6 + finally: + kill_child_process(process.pid) + + def test_chunked_prefill(self): + self.run_mmlu(disable_radix_cache=False) + + def test_chunked_prefill_without_radix_cache(self): + self.run_mmlu(disable_radix_cache=True) if __name__ == "__main__": diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py new file mode 100644 index 000000000..84a60dbe9 --- /dev/null +++ b/test/srt/test_eval_accuracy_large.py @@ -0,0 +1,68 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_child_process +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestEvalAccuracyLarge(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = "http://127.0.0.1:7157" + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=300, + other_args=["--log-level-http", "warning"], + ) + + @classmethod + def tearDownClass(cls): + kill_child_process(cls.process.pid) + + def test_mmlu(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mmlu", + num_examples=None, + num_threads=2048, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.70 + + def test_human_eval(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="humaneval", + num_examples=None, + num_threads=2048, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.65 + + def test_mgsm_en(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=None, + num_threads=2048, + ) + + metrics = run_eval(args) + 
assert metrics["score"] >= 0.85 + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_eval_accuracy.py b/test/srt/test_eval_accuracy_mini.py similarity index 85% rename from test/srt/test_eval_accuracy.py rename to test/srt/test_eval_accuracy_mini.py index a3f16f857..b5533da37 100644 --- a/test/srt/test_eval_accuracy.py +++ b/test/srt/test_eval_accuracy_mini.py @@ -10,7 +10,7 @@ from sglang.test.test_utils import ( ) -class TestAccuracy(unittest.TestCase): +class TestEvalAccuracyMini(unittest.TestCase): @classmethod def setUpClass(cls): @@ -27,12 +27,12 @@ class TestAccuracy(unittest.TestCase): base_url=self.base_url, model=self.model, eval_name="mmlu", - num_examples=20, - num_threads=20, + num_examples=32, + num_threads=32, ) metrics = run_eval(args) - assert metrics["score"] >= 0.5 + assert metrics["score"] >= 0.6 if __name__ == "__main__": diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py index 808bc833e..25b07d881 100644 --- a/test/srt/test_serving_throughput.py +++ b/test/srt/test_serving_throughput.py @@ -1,3 +1,4 @@ +import os import unittest from types import SimpleNamespace @@ -55,21 +56,30 @@ class TestServingThroughput(unittest.TestCase): kill_child_process(process.pid) assert res["completed"] == num_prompts + return res def test_default(self): - self.run_test( + res = self.run_test( disable_radix_cache=False, disable_flashinfer=False, chunked_prefill_size=-1, ) + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + # A100 performance + assert res["output_throughput"] >= 1300 + def test_default_without_radix_cache(self): - self.run_test( + res = self.run_test( disable_radix_cache=True, disable_flashinfer=False, chunked_prefill_size=-1, ) + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + # A100 performance + assert res["output_throughput"] >= 1400 + def test_default_without_flashinfer(self): self.run_test( disable_radix_cache=False, diff --git a/test/srt/test_torch_compile.py 
b/test/srt/test_torch_compile.py index c8869a9cc..1ea1438fe 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -10,7 +10,7 @@ from sglang.test.test_utils import ( ) -class TestAccuracy(unittest.TestCase): +class TestTorchCompile(unittest.TestCase): @classmethod def setUpClass(cls): @@ -29,12 +29,12 @@ class TestAccuracy(unittest.TestCase): base_url=self.base_url, model=self.model, eval_name="mmlu", - num_examples=20, - num_threads=20, + num_examples=32, + num_threads=32, ) metrics = run_eval(args) - assert metrics["score"] >= 0.5 + assert metrics["score"] >= 0.6 if __name__ == "__main__":