From 2d1b83e57ace3ca33534913253936092e50b2e75 Mon Sep 17 00:00:00 2001 From: HandH1998 <1335248067@qq.com> Date: Tue, 25 Mar 2025 12:57:58 +0800 Subject: [PATCH] add dsv3 int8 test (#4705) --- test/srt/run_suite.py | 1 + test/srt/test_mla_int8_deepseek_v3.py | 209 ++++++++++++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100644 test/srt/test_mla_int8_deepseek_v3.py diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 24978502b..43bf46c03 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -41,6 +41,7 @@ suites = { TestFile("test_metrics.py", 32), TestFile("test_mla.py", 92), TestFile("test_mla_deepseek_v3.py", 221), + TestFile("test_mla_int8_deepseek_v3.py", 421), TestFile("test_mla_flashinfer.py", 395), TestFile("test_mla_fp8.py", 93), TestFile("test_no_chunked_prefill.py", 126), diff --git a/test/srt/test_mla_int8_deepseek_v3.py b/test/srt/test_mla_int8_deepseek_v3.py new file mode 100644 index 000000000..3fda82139 --- /dev/null +++ b/test/srt/test_mla_int8_deepseek_v3.py @@ -0,0 +1,209 @@ +import unittest +from types import SimpleNamespace + +import requests +import torch + +from sglang.srt.utils import kill_process_tree +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + popen_launch_server, +) + + +class TestMLADeepseekV3ChannelInt8(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = "sgl-project/sglang-ci-dsv3-channel-int8-test" + cls.base_url = DEFAULT_URL_FOR_TEST + other_args = ["--trust-remote-code"] + if torch.cuda.is_available() and torch.version.cuda: + other_args.extend(["--enable-torch-compile", "--cuda-graph-max-bs", "2"]) + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.62) + + +class TestDeepseekV3MTPChannelInt8(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = "sgl-project/sglang-ci-dsv3-channel-int8-test" + cls.base_url = DEFAULT_URL_FOR_TEST + other_args = ["--trust-remote-code"] + if torch.cuda.is_available() and torch.version.cuda: + other_args.extend( + [ + "--cuda-graph-max-bs", + "2", + "--disable-radix", + "--enable-torch-compile", + "--torch-compile-max-bs", + "1", + "--speculative-algorithm", + "EAGLE", + "--speculative-draft", + "sgl-project/sglang-ci-dsv3-channel-int8-test-NextN", + "--speculative-num-steps", + "2", + "--speculative-eagle-topk", + "4", + "--speculative-num-draft-tokens", + "4", + ] + ) + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + requests.get(self.base_url + "/flush_cache") + + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.60) + + server_info = requests.get(self.base_url + "/get_server_info") + avg_spec_accept_length = server_info.json()["avg_spec_accept_length"] + print(f"{avg_spec_accept_length=}") + self.assertGreater(avg_spec_accept_length, 2.5) + + +class TestMLADeepseekV3BlockInt8(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = "sgl-project/sglang-ci-dsv3-block-int8-test" + cls.base_url = DEFAULT_URL_FOR_TEST + other_args = ["--trust-remote-code"] + if torch.cuda.is_available() and torch.version.cuda: + other_args.extend(["--enable-torch-compile", "--cuda-graph-max-bs", "2"]) + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.62) + + +class TestDeepseekV3MTPBlockInt8(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.model = "sgl-project/sglang-ci-dsv3-block-int8-test" + cls.base_url = DEFAULT_URL_FOR_TEST + other_args = ["--trust-remote-code"] + if torch.cuda.is_available() and torch.version.cuda: + other_args.extend( + [ + "--cuda-graph-max-bs", + "2", + "--disable-radix", + "--enable-torch-compile", + "--torch-compile-max-bs", + "1", + "--speculative-algorithm", + "EAGLE", + "--speculative-draft", + "sgl-project/sglang-ci-dsv3-block-int8-test-NextN", + "--speculative-num-steps", + "2", + "--speculative-eagle-topk", + "4", + "--speculative-num-draft-tokens", + "4", + ] + ) + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + requests.get(self.base_url + "/flush_cache") + + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.60) + + server_info = requests.get(self.base_url + "/get_server_info") + avg_spec_accept_length = server_info.json()["avg_spec_accept_length"] + print(f"{avg_spec_accept_length=}") + self.assertGreater(avg_spec_accept_length, 2.5) + + +if __name__ == "__main__": + unittest.main()