From a4331cd260c969ff08a0dbd7465c9b5d87b472b6 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 21 Jan 2025 02:55:14 -0800 Subject: [PATCH] Add accuracy and latency tests of eagle into CI (#3027) --- .github/workflows/pr-test.yml | 18 ++- python/sglang/test/test_utils.py | 6 +- test/srt/models/test_qwen_models.py | 6 +- test/srt/test_bench_one_batch.py | 26 +++- test/srt/test_bench_serving.py | 34 ++++- test/srt/test_eagle_infer.py | 217 ++++++++++++++-------------- test/srt/test_torch_compile.py | 2 +- 7 files changed, 186 insertions(+), 123 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 8b8d7c56e..c5eeeee3c 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -128,7 +128,7 @@ jobs: timeout-minutes: 10 run: | cd test/srt - python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_default + python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1 - name: Benchmark online latency timeout-minutes: 10 @@ -148,6 +148,13 @@ jobs: cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size + - name: Benchmark online latency (EAGLE) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle + + performance-test-1-gpu-part-2: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' runs-on: 1-gpu-runner @@ -196,7 +203,13 @@ jobs: timeout-minutes: 10 run: | cd test/srt - python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_default + python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 + + - name: Benchmark single latency + torch.compile (TP=2) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1 - name: Benchmark offline throughput (TP=2) timeout-minutes: 10 @@ -210,6 +223,7 @@ jobs: cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache + accuracy-test-1-gpu: if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' runs-on: 1-gpu-runner diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index ad8ff6cbf..ee5ae278d 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -42,6 +42,9 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-In DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4" DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct" +DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf" +DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmzheng/sglang-EAGLE-llama2-chat-7B" + def is_in_ci(): """Return whether it is in CI runner.""" @@ -538,6 +541,7 @@ def run_bench_serving( random_input_len=4096, random_output_len=2048, disable_stream=False, + disable_ignore_eos=False, need_warmup=False, ): # Launch the server @@ -572,7 +576,7 @@ def run_bench_serving( disable_stream=disable_stream, return_logprob=False, seed=0, - disable_ignore_eos=False, + disable_ignore_eos=disable_ignore_eos, extra_request_body=None, apply_chat_template=False, profile=None, diff --git a/test/srt/models/test_qwen_models.py b/test/srt/models/test_qwen_models.py index 9e61930a7..c7788fa8e 100644 --- a/test/srt/models/test_qwen_models.py +++ b/test/srt/models/test_qwen_models.py @@ -37,8 +37,7 @@ class TestQwen2(unittest.TestCase): port=int(self.base_url.split(":")[-1]), ) metrics = run_eval(args) - print(metrics) - + print(f"{metrics=}") self.assertGreater(metrics["accuracy"], 0.81) @@ -69,8 +68,7 @@ class TestQwen2FP8(unittest.TestCase): port=int(self.base_url.split(":")[-1]), ) metrics = run_eval(args) - print(metrics) - + print(f"{metrics=}") self.assertGreater(metrics["accuracy"], 0.79) diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py index c1bc98e8e..c6562170d 100644 --- a/test/srt/test_bench_one_batch.py +++ b/test/srt/test_bench_one_batch.py @@ -5,24 +5,46 @@ from sglang.test.test_utils import ( DEFAULT_MOE_MODEL_NAME_FOR_TEST, is_in_ci, run_bench_one_batch, + write_github_step_summary, ) class TestBenchOneBatch(unittest.TestCase): - def test_default(self): + def test_bs1(self): output_throughput = run_bench_one_batch(DEFAULT_MODEL_NAME_FOR_TEST, []) if is_in_ci(): + write_github_step_summary( + f"### test_bs1\n" + f"output_throughput : {output_throughput:.2f} token/s\n" + ) self.assertGreater(output_throughput, 135) - def test_moe_default(self): + def test_moe_tp2_bs1(self): output_throughput = run_bench_one_batch( DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2"] ) if is_in_ci(): + write_github_step_summary( + f"### test_moe_tp2_bs1\n" + f"output_throughput : {output_throughput:.2f} token/s\n" + ) self.assertGreater(output_throughput, 125) + def test_torch_compile_tp2_bs1(self): + output_throughput = run_bench_one_batch( + DEFAULT_MODEL_NAME_FOR_TEST, + ["--tp", "2", "--enable-torch-compile", "--cuda-graph-max-bs", "2"], + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_torch_compile_tp2_bs1\n" + f"output_throughput : {output_throughput:.2f} token/s\n" + ) + self.assertGreater(output_throughput, 240) + if __name__ == "__main__": unittest.main() diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py index b882f12f9..b55260f71 100644 --- a/test/srt/test_bench_serving.py +++ b/test/srt/test_bench_serving.py @@ -1,6 +1,8 @@ import unittest from sglang.test.test_utils import ( + DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST, + DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST, DEFAULT_FP8_MODEL_NAME_FOR_TEST, DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_MOE_MODEL_NAME_FOR_TEST, @@ -47,7 +49,7 @@ class TestBenchServing(unittest.TestCase): ) # There is a regression with torch 2.5 # This number was 950 for torch 2.4 - self.assertGreater(res["output_throughput"], 800) + self.assertGreater(res["output_throughput"], 850) def test_offline_throughput_without_radix_cache(self): res = run_bench_serving( @@ -131,6 +133,36 @@ class TestBenchServing(unittest.TestCase): self.assertLess(res["median_ttft_ms"], 86) self.assertLess(res["median_itl_ms"], 10) + def test_online_latency_eagle(self): + res = run_bench_serving( + model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST, + num_prompts=50, + request_rate=1, + disable_ignore_eos=True, + dataset_name="sharegpt", + other_server_args=[ + "--speculative-algorithm", + "EAGLE", + "--speculative-draft-model-path", + DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST, + "--speculative-num-steps", + "5", + "--speculative-eagle-topk", + "8", + "--speculative-num-draft-tokens", + "64", + "--mem-fraction-static", + "0.7", + ], + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_online_latency_eagle\n" + f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n' + ) + self.assertLess(res["median_e2e_latency_ms"], 10000) + def test_moe_offline_throughput_default(self): res = run_bench_serving( model=DEFAULT_MOE_MODEL_NAME_FOR_TEST, diff --git a/test/srt/test_eagle_infer.py b/test/srt/test_eagle_infer.py index 92127b8ef..b01c26049 100644 --- a/test/srt/test_eagle_infer.py +++ b/test/srt/test_eagle_infer.py @@ -1,14 +1,18 @@ -import multiprocessing import random +import threading import time import unittest +from types import SimpleNamespace import requests -from transformers import AutoConfig, AutoTokenizer import sglang as sgl +from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.utils import kill_process_tree +from sglang.test.few_shot_gsm8k import run_eval from sglang.test.test_utils import ( + DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST, + DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, popen_launch_server, @@ -19,60 +23,59 @@ class TestEAGLEEngine(unittest.TestCase): def test_eagle_accuracy(self): prompt = "Today is a sunny day and I like" - target_model_path = "meta-llama/Llama-2-7b-chat-hf" - speculative_draft_model_path = "lmzheng/sglang-EAGLE-llama2-chat-7B" - sampling_params = {"temperature": 0, "max_new_tokens": 8} + # Get the reference output + ref_engine = sgl.Engine(model_path=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST) + ref_output = ref_engine.generate(prompt, sampling_params)["text"] + ref_engine.shutdown() + + # Launch EAGLE engine engine = sgl.Engine( - model_path=target_model_path, - speculative_draft_model_path=speculative_draft_model_path, + model_path=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST, + speculative_draft_model_path=DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST, speculative_algorithm="EAGLE", - speculative_num_steps=3, - speculative_eagle_topk=4, - speculative_num_draft_tokens=16, + speculative_num_steps=5, + speculative_eagle_topk=8, + speculative_num_draft_tokens=64, + mem_fraction_static=0.7, ) + + # Case 1: Test the output of EAGLE engine is the same as normal engine out1 = engine.generate(prompt, sampling_params)["text"] - engine.shutdown() + print(f"{out1=}, {ref_output=}") + self.assertEqual(out1, ref_output) - engine = sgl.Engine(model_path=target_model_path) - out2 = engine.generate(prompt, sampling_params)["text"] - engine.shutdown() - - print("==== Answer 1 ====") - print(out1) - - print("==== Answer 2 ====") - print(out2) - self.assertEqual(out1, out2) - - def test_eagle_end_check(self): + # Case 2: Test the output of EAGLE engine does not contain unexpected EOS prompt = "[INST] <>\\nYou are a helpful assistant.\\n<>\\nToday is a sunny day and I like [/INST]" - target_model_path = "meta-llama/Llama-2-7b-chat-hf" - tokenizer = AutoTokenizer.from_pretrained(target_model_path) - speculative_draft_model_path = "lmzheng/sglang-EAGLE-llama2-chat-7B" - sampling_params = { "temperature": 0, "max_new_tokens": 1024, "skip_special_tokens": False, } - engine = sgl.Engine( - model_path=target_model_path, - speculative_draft_model_path=speculative_draft_model_path, - speculative_algorithm="EAGLE", - speculative_num_steps=3, - speculative_eagle_topk=4, - speculative_num_draft_tokens=16, - ) - out1 = engine.generate(prompt, sampling_params)["text"] - engine.shutdown() - print("==== Answer 1 ====") - print(repr(out1)) - tokens = tokenizer.encode(out1, truncation=False) + tokenizer = get_tokenizer(DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST) + out2 = engine.generate(prompt, sampling_params)["text"] + print(f"{out2=}") + tokens = tokenizer.encode(out2, truncation=False) assert tokenizer.eos_token_id not in tokens + # Case 3: Batched prompts + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = {"temperature": 0, "max_new_tokens": 30} + outputs = engine.generate(prompts, sampling_params) + for prompt, output in zip(prompts, outputs): + print("===============================") + print(f"Prompt: {prompt}\nGenerated text: {output['text']}") + + # Shutdown the engine + engine.shutdown() + prompts = [ "[INST] <>\\nYou are a helpful assistant.\\n<>\\nToday is a sunny day and I like[/INST]" @@ -83,64 +86,27 @@ prompts = [ ] -def process(server_url: str): - time.sleep(random.uniform(0, 2)) - for prompt in prompts: - url = server_url - data = { - "model": "base", - "text": prompt, - "sampling_params": { - "temperature": 0, - "max_new_tokens": 1024, - }, - } - response = requests.post(url, json=data) - assert response.status_code == 200 - - -def abort_process(server_url: str): - for prompt in prompts: - try: - time.sleep(1) - url = server_url - data = { - "model": "base", - "text": prompt, - "sampling_params": { - "temperature": 0, - "max_new_tokens": 1024, - }, - } - # set timeout = 1s,mock disconnected - requests.post(url, json=data, timeout=1) - except: - pass - - -class TestEAGLELaunchServer(unittest.TestCase): +class TestEAGLEServer(unittest.TestCase): @classmethod def setUpClass(cls): - speculative_draft_model_path = "lmzheng/sglang-EAGLE-llama2-chat-7B" - cls.model = "meta-llama/Llama-2-7b-chat-hf" cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( - cls.model, + DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--speculative-algorithm", "EAGLE", "--speculative-draft-model-path", - speculative_draft_model_path, + DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST, "--speculative-num-steps", - "3", + "5", "--speculative-eagle-topk", - "4", + "8", "--speculative-num-draft-tokens", - "16", - "--served-model-name", - "base", + "64", + "--mem-fraction-static", + "0.7", ], ) @@ -148,39 +114,66 @@ class TestEAGLELaunchServer(unittest.TestCase): def tearDownClass(cls): kill_process_tree(cls.process.pid) - def test_eagle_server_concurrency(self): + def send_request(self): + time.sleep(random.uniform(0, 2)) + for prompt in prompts: + url = self.base_url + "/generate" + data = { + "text": prompt, + "sampling_params": { + "temperature": 0, + "max_new_tokens": 1024, + }, + } + response = requests.post(url, json=data) + assert response.status_code == 200 + + def send_requests_abort(self): + for prompt in prompts: + try: + time.sleep(random.uniform(0, 2)) + url = self.base_url + "/generate" + data = { + "model": "base", + "text": prompt, + "sampling_params": { + "temperature": 0, + "max_new_tokens": 1024, + }, + } + # set timeout = 1s,mock disconnected + requests.post(url, json=data, timeout=1) + except Exception as e: + print(e) + pass + + def test_request_abort(self): concurrency = 4 - processes = [ - multiprocessing.Process( - target=process, - kwargs={"server_url": self.base_url + "/generate"}, - ) + threads = [ + threading.Thread(target=self.send_request) for _ in range(concurrency) + ] + [ + threading.Thread(target=self.send_requests_abort) for _ in range(concurrency) ] - for worker in processes: + for worker in threads: worker.start() - for p in processes: + for p in threads: p.join() - def test_eagle_server_request_abort(self): - concurrency = 4 - processes = [ - multiprocessing.Process( - target=process, - kwargs={"server_url": self.base_url + "/generate"}, - ) - for _ in range(concurrency) - ] + [ - multiprocessing.Process( - target=abort_process, - kwargs={"server_url": self.base_url + "/generate"}, - ) - for _ in range(concurrency) - ] - for worker in processes: - worker.start() - for p in processes: - p.join() + def test_gsm8k(self): + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval(args) + print(f"{metrics=}") + + self.assertGreater(metrics["accuracy"], 0.20) if __name__ == "__main__": diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py index 6f3b344b3..e71de3391 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -23,7 +23,7 @@ class TestTorchCompile(unittest.TestCase): cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--enable-torch-compile"], + other_args=["--enable-torch-compile", "--cuda-graph-max-bs", "4"], ) @classmethod