diff --git a/.github/workflows/accuracy-test.yml b/.github/workflows/accuracy-test.yml new file mode 100644 index 000000000..9c8e7bfeb --- /dev/null +++ b/.github/workflows/accuracy-test.yml @@ -0,0 +1,45 @@ +name: Accuracy Test + +on: + push: + branches: [ main ] + paths: + - "python/sglang/**" + - "test/**" + pull_request: + branches: [ main ] + paths: + - "python/sglang/**" + - "test/**" + workflow_dispatch: + +concurrency: + group: accuracy-test-${{ github.ref }} + cancel-in-progress: true + +jobs: + accuracy-test: + if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' + runs-on: accuracy + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install dependencies + run: | + source $HOME/venv/bin/activate + echo "$HOME/venv/bin" >> $GITHUB_PATH + + pip install --upgrade pip + pip install -e "python[all]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall + + git clone https://github.com/merrymercy/human-eval.git + cd human-eval + pip install -e . + + - name: Evaluate Accuracy + run: | + cd test/srt + python3 test_eval_accuracy_large.py diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 78ac4d9ec..336f6a14f 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -20,7 +20,7 @@ concurrency: jobs: e2e-test: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' - runs-on: bench + runs-on: e2e steps: - name: Checkout code diff --git a/README.md b/README.md index f81593ef6..1d7ff009b 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance. - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size. 
``` -python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 2048 +python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096 ``` - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port. ``` diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index e3a2ad0a2..0f9c88223 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -669,19 +669,20 @@ async def benchmark( "backend": args.backend, "dataset_name": args.dataset_name, "request_rate": request_rate, - "total_input": metrics.total_input, - "total_output": metrics.total_output, - "total_output_retokenized": metrics.total_output_retokenized, - "mean_e2e_latency": metrics.mean_e2e_latency_ms, - "median_e2e_latency": metrics.median_e2e_latency_ms, - "median_ttft": metrics.median_ttft_ms, - "median_itl": metrics.median_itl_ms, - "output_token_throughput": metrics.output_throughput, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "total_output_tokens_retokenized": metrics.total_output_retokenized, + "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms, + "median_e2e_latency_ms": metrics.median_e2e_latency_ms, + "median_ttft_ms": metrics.median_ttft_ms, + "median_itl_ms": metrics.median_itl_ms, + "output_throughput": metrics.output_throughput, "sharegpt_output_len": args.sharegpt_output_len, "random_input_len": args.random_input_len, "random_output_len": args.random_output_len, "random_range_ratio": args.random_range_ratio, - "benchmark_duration": benchmark_duration, + "duration": benchmark_duration, + "completed": metrics.completed, } else: print(f"Error running benchmark for request rate: {request_rate}") diff --git 
a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index 4c757737e..f14885263 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -64,8 +64,7 @@ from sglang.utils import get_exception_traceback logger = logging.getLogger(__name__) -# TODO: Rename "CI" to "SGLANG_IS_IN_CI". -crash_on_warning = os.getenv("CI", "false") == "true" +crash_on_warning = os.getenv("SGLANG_IS_IN_CI", "false") == "true" class ModelTpServer: diff --git a/python/sglang/test/run_eval.py b/python/sglang/test/run_eval.py index 6c1f284b1..3d13d475b 100644 --- a/python/sglang/test/run_eval.py +++ b/python/sglang/test/run_eval.py @@ -39,6 +39,14 @@ def run_eval(args): eval_obj = MathEval( filename, equality_checker, args.num_examples, args.num_threads ) + elif args.eval_name == "mgsm": + from sglang.test.simple_eval_mgsm import MGSMEval + + eval_obj = MGSMEval(args.num_examples, args.num_threads) + elif args.eval_name == "mgsm_en": + from sglang.test.simple_eval_mgsm import MGSMEval + + eval_obj = MGSMEval(args.num_examples, args.num_threads, languages=["en"]) elif args.eval_name == "gpqa": from sglang.test.simple_eval_gpqa import GPQAEval diff --git a/python/sglang/test/simple_eval_mgsm.py b/python/sglang/test/simple_eval_mgsm.py new file mode 100644 index 000000000..ce00a1ac7 --- /dev/null +++ b/python/sglang/test/simple_eval_mgsm.py @@ -0,0 +1,203 @@ +# Adapted from https://github.com/openai/simple-evals/ + +""" +MGSM: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems. 
+Language Models are Multilingual Chain-of-Thought Reasoners +Freda Shi, Mirac Suzgun, Markus Freitag, Xuezhi Wang, Suraj Srivats, Soroush Vosoughi, Hyung Won Chung, Yi Tay, Sebastian Ruder, Denny Zhou, Dipanjan Das, Jason Wei +https://arxiv.org/abs/2210.03057 reference: https://github.com/google-research/url-nlp +""" + +import re +import urllib +from typing import Optional + +from sglang.test import simple_eval_common as common +from sglang.test.simple_eval_common import ( + HTML_JINJA, + Eval, + EvalResult, + SamplerBase, + SingleEvalResult, +) + +ALL_LANGUAGES = ["bn", "de", "en", "es", "fr", "ja", "ru", "sw", "te", "th", "zh"] +LATIN_LANGUAGES = ["de", "en", "es", "fr", "sw"] +NON_LATIN_LANGUAGES = ["bn", "ja", "ru", "te", "th", "zh"] + +LANG_TO_FPATH = { + "bn": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_bn.tsv", + "de": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_de.tsv", + "en": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_en.tsv", + "es": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_es.tsv", + "fr": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_fr.tsv", + "ja": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_ja.tsv", + "ru": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_ru.tsv", + "sw": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_sw.tsv", + "te": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_te.tsv", + "th": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_th.tsv", + "zh": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_zh.tsv", +} +LANG_TO_INSTRUCTIONS = { + "en": """Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of "Answer:". Do not add anything other than the integer answer after "Answer:". 
+ +{input}""", + "bn": """এই গণিতের সমস্যাটি সমাধান করুন। চূড়ান্ত উত্তর দেওয়ার আগে যুক্তিসম্পন্ন পদক্ষেপ প্রদান করুন। চূড়ান্ত উত্তরটি একক সংখ্যা হিসাবে "উত্তর:" এর পরে শেষ লাইনে দিন। "উত্তর:" এর পরে অন্য কিছু যুক্ত করবেন না।. + +{input}""", + "de": """Löse dieses Mathematikproblem. Gib die Schritte zur Begründung an, bevor du die endgültige Antwort in der letzten Zeile alleine im Format "Antwort:" gibst. Füge nichts anderes als die ganzzahlige Antwort nach "Antwort:" hinzu. + +{input}""", + "es": """Resuelve este problema matemático. Proporciona los pasos de razonamiento antes de dar la respuesta final en la última línea por sí misma en el formato de "Respuesta:". No añadas nada más que la respuesta entera después de "Respuesta:". + +{input}""", + "fr": """Résolvez ce problème de mathématiques. Donnez les étapes de raisonnement avant de fournir la réponse finale sur la dernière ligne elle-même dans le format de "Réponse:". N'ajoutez rien d'autre que la réponse entière après "Réponse:". + +{input}""", + "ja": """の数学の問題を解いてください。最終的な答えを出す前に、解答の推論過程を記述してください。そして最後の行には "答え:" の形式で答えを記述し、その後には整数の答え以外何も追加しないでください。 + +{input}""", + "ru": """Решите эту математическую задачу. Объясните шаги рассуждения перед тем, как дать окончательный ответ в последней строке сам по себе в формате "Ответ:". Не добавляйте ничего, кроме целочисленного ответа после "Ответ:". + +{input}""", + "sw": """Suluhisha tatizo hili la hesabu. Toa hatua za mantiki kabla ya kutoa jibu la mwisho kwenye mstari wa mwisho peke yake katika muundo wa "Jibu:". Usiongeze chochote kingine isipokuwa jibu la integer baada ya "Jibu:". + +{input}""", + "te": """ఈ గణిత సమస్యను పరిష్కరించండి. చివరి సమాధానాన్ని ఇవ్వదానికి ముందు తర్కాత్మక అదుగులను ఇవ్వండి. చివరి పంక్తిలో మాత్రమే 'సమాధానం:' అనే ఆకారంలో చివరి సమాధానాద్ని ఇవ్వండి సమాధానం: తర్వాత పూర్ణాంక సమాధానానికి తప్పించి ఎదేనా చేర్చవద్దు. 
+ +{input}""", +    "th": """แก้ปัญหาคณิตศาสตร์นี้ ให้ให้ขั้นตอนการใช้เหตุผลก่อนที่จะให้คำตอบสุดท้ายในบรรทัดสุดท้ายโดยอยู่ในรูปแบบ "คำตอบ:" ไม่ควรเพิ่มอะไรนอกจากคำตอบที่เป็นจำนวนเต็มหลังจาก "คำตอบ:" + +{input}""", +    "zh": """解决这个数学问题。在最后一行给出答案前,请提供推理步骤。最后一行应该以 "答案: " 的形式独立给出答案。在 "答案:" 后不要添加除整数答案之外的任何内容。 + +{input}""", +} + +LANG_TO_ANSWER_PREFIX = { +    "en": "Answer", +    "bn": "উত্তর", +    "de": "Antwort", +    "es": "Respuesta", +    "fr": "Réponse", +    "ja": "答え", +    "ru": "Ответ", +    "sw": "Jibu", +    "te": "సమాధానం", +    "th": "คำตอบ", +    "zh": "答案", +} + + +def parse_answer(answer: str, answer_prefix: str) -> str: +    if answer_prefix not in answer: +        return "" + +    answer_text = answer.split(answer_prefix)[-1].strip() + +    # find all the numbers (including decimals) in the string +    numbers = re.findall(r"\d+\.?\d*", answer_text.replace(",", "")) + +    # return the last number (removing trailing decimal point if present), +    # or an empty string if there were no numbers +    return numbers[-1].rstrip(".") if numbers else "" + + +def score_mgsm(target: str, prediction: str) -> bool: +    if "." in prediction: +        prediction = prediction.rstrip("0").rstrip(".") + +    target = target.replace(",", "") +    prediction = prediction.replace(",", "") + +    return target == prediction + + +def get_lang_examples(lang: str) -> list[dict[str, str]]: +    fpath = LANG_TO_FPATH[lang] +    examples = [] +    with urllib.request.urlopen(fpath) as f: +        for line in f.read().decode("utf-8").splitlines(): +            inputs, targets = line.strip().split("\t") +            if "." 
in targets: +                raise ValueError(f"targets {targets} contains a decimal point.") +            # targets = int(targets.replace(",", "")) +            examples.append({"inputs": inputs, "targets": targets, "lang": lang}) +    return examples + + +def get_all_examples() -> list[dict[str, str]]: +    examples = [] +    for lang in ALL_LANGUAGES: +        if lang != "en": +            continue +        examples += get_lang_examples(lang) +    return examples + + +class MGSMEval(Eval): +    def __init__( +        self, +        num_examples_per_lang: int = 250,  # restrict to a subset of the data for debugging +        num_threads: int = 64, +        languages: Optional[list[str]] = ALL_LANGUAGES, +    ): +        if languages is None: +            languages = ALL_LANGUAGES +        else: +            for language in languages: +                if language not in ALL_LANGUAGES: +                    raise ValueError( +                        f"language {language} is not a valid language. " +                        f"It should be one in {ALL_LANGUAGES}" +                    ) +        self._languages = languages +        self._num_examples_per_lang = num_examples_per_lang +        self._num_threads = num_threads + +        examples = [] +        for lang in self._languages: +            lang_examples = get_lang_examples(lang) +            examples.extend(lang_examples[: self._num_examples_per_lang]) +        self.examples = examples + +    def __call__(self, sampler: SamplerBase) -> EvalResult: +        def fn(example: dict[str, str]): +            language = example["lang"] +            latin_language = ( +                "group_latin" if language in LATIN_LANGUAGES else "group_non_latin" +            ) +            correct_answer = example["targets"] +            instruction = LANG_TO_INSTRUCTIONS[language] +            prompt_messages = [ +                sampler._pack_message( +                    content=instruction.format(input=example["inputs"]), role="user" +                ) +            ] +            try: +                response_text = sampler(prompt_messages) +            except Exception as e: +                response_text = "" + +            answer_prefix = LANG_TO_ANSWER_PREFIX[language] +            extracted_answer = parse_answer(response_text, answer_prefix) + +            score = score_mgsm(correct_answer, extracted_answer) +            html = common.jinja_env.from_string(HTML_JINJA).render( +                prompt_messages=prompt_messages, +                next_message=dict(content=response_text, 
role="assistant"), + score=score, + correct_answer=correct_answer, + extracted_answer=extracted_answer, + ) + convo = prompt_messages + [dict(content=response_text, role="assistant")] + return SingleEvalResult( + html=html, + score=score, + convo=convo, + metrics={language: score, latin_language: score}, + ) + + results = common.map_with_progress( + fn, self.examples, num_threads=self._num_threads + ) + return common.aggregate_results(results, default_stats=("mean", "std")) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 08122389f..c99b6a60b 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -7,7 +7,7 @@ suites = { "minimal": [ "test_chunked_prefill.py", "test_embedding_openai_server.py", - "test_eval_accuracy.py", + "test_eval_accuracy_mini.py", "test_large_max_new_tokens.py", "test_openai_server.py", "test_skip_tokenizer_init.py", diff --git a/test/srt/test_chunked_prefill.py b/test/srt/test_chunked_prefill.py index 3a9423bc5..5b2bb4aaa 100644 --- a/test/srt/test_chunked_prefill.py +++ b/test/srt/test_chunked_prefill.py @@ -10,34 +10,41 @@ from sglang.test.test_utils import ( ) -class TestAccuracy(unittest.TestCase): +class TestChunkedPrefill(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, + def run_mmlu(self, disable_radix_cache): + other_args = ["--chunked-prefill-size", "32"] + if disable_radix_cache: + other_args += ["--disable-radix-cache"] + + model = DEFAULT_MODEL_NAME_FOR_TEST + base_url = DEFAULT_URL_FOR_TEST + process = popen_launch_server( + model, + base_url, timeout=300, - other_args=["--chunked-prefill-size", "32"], + other_args=other_args, ) - @classmethod - def tearDownClass(cls): - kill_child_process(cls.process.pid) - - def test_mmlu(self): args = SimpleNamespace( - base_url=self.base_url, - model=self.model, + base_url=base_url, + model=model, 
eval_name="mmlu", - num_examples=20, - num_threads=20, + num_examples=32, + num_threads=32, ) - metrics = run_eval(args) - assert metrics["score"] >= 0.5 + try: + metrics = run_eval(args) + assert metrics["score"] >= 0.6 + finally: + kill_child_process(process.pid) + + def test_chunked_prefill(self): + self.run_mmlu(disable_radix_cache=False) + + def test_chunked_prefill_without_radix_cache(self): + self.run_mmlu(disable_radix_cache=True) if __name__ == "__main__": diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py new file mode 100644 index 000000000..84a60dbe9 --- /dev/null +++ b/test/srt/test_eval_accuracy_large.py @@ -0,0 +1,68 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_child_process +from sglang.test.run_eval import run_eval +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestEvalAccuracyLarge(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST + cls.base_url = "http://127.0.0.1:7157" + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=300, + other_args=["--log-level-http", "warning"], + ) + + @classmethod + def tearDownClass(cls): + kill_child_process(cls.process.pid) + + def test_mmlu(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mmlu", + num_examples=None, + num_threads=2048, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.70 + + def test_human_eval(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="humaneval", + num_examples=None, + num_threads=2048, + ) + + metrics = run_eval(args) + assert metrics["score"] >= 0.65 + + def test_mgsm_en(self): + args = SimpleNamespace( + base_url=self.base_url, + model=self.model, + eval_name="mgsm_en", + num_examples=None, + num_threads=2048, + ) + + metrics = run_eval(args) + 
assert metrics["score"] >= 0.85 + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_eval_accuracy.py b/test/srt/test_eval_accuracy_mini.py similarity index 85% rename from test/srt/test_eval_accuracy.py rename to test/srt/test_eval_accuracy_mini.py index a3f16f857..b5533da37 100644 --- a/test/srt/test_eval_accuracy.py +++ b/test/srt/test_eval_accuracy_mini.py @@ -10,7 +10,7 @@ from sglang.test.test_utils import ( ) -class TestAccuracy(unittest.TestCase): +class TestEvalAccuracyMini(unittest.TestCase): @classmethod def setUpClass(cls): @@ -27,12 +27,12 @@ class TestAccuracy(unittest.TestCase): base_url=self.base_url, model=self.model, eval_name="mmlu", - num_examples=20, - num_threads=20, + num_examples=32, + num_threads=32, ) metrics = run_eval(args) - assert metrics["score"] >= 0.5 + assert metrics["score"] >= 0.6 if __name__ == "__main__": diff --git a/test/srt/test_serving_throughput.py b/test/srt/test_serving_throughput.py index 808bc833e..25b07d881 100644 --- a/test/srt/test_serving_throughput.py +++ b/test/srt/test_serving_throughput.py @@ -1,3 +1,4 @@ +import os import unittest from types import SimpleNamespace @@ -55,21 +56,30 @@ class TestServingThroughput(unittest.TestCase): kill_child_process(process.pid) assert res["completed"] == num_prompts + return res def test_default(self): - self.run_test( + res = self.run_test( disable_radix_cache=False, disable_flashinfer=False, chunked_prefill_size=-1, ) + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + # A100 performance + assert res["output_throughput"] >= 1300 + def test_default_without_radix_cache(self): - self.run_test( + res = self.run_test( disable_radix_cache=True, disable_flashinfer=False, chunked_prefill_size=-1, ) + if os.getenv("SGLANG_IS_IN_CI", "false") == "true": + # A100 performance + assert res["output_throughput"] >= 1400 + def test_default_without_flashinfer(self): self.run_test( disable_radix_cache=False, diff --git a/test/srt/test_torch_compile.py 
b/test/srt/test_torch_compile.py index c8869a9cc..1ea1438fe 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -10,7 +10,7 @@ from sglang.test.test_utils import ( ) -class TestAccuracy(unittest.TestCase): +class TestTorchCompile(unittest.TestCase): @classmethod def setUpClass(cls): @@ -29,12 +29,12 @@ class TestAccuracy(unittest.TestCase): base_url=self.base_url, model=self.model, eval_name="mmlu", - num_examples=20, - num_threads=20, + num_examples=32, + num_threads=32, ) metrics = run_eval(args) - assert metrics["score"] >= 0.5 + assert metrics["score"] >= 0.6 if __name__ == "__main__":