From 2ac189edc894412cee4dfa50a50ca69685d7752d Mon Sep 17 00:00:00 2001
From: HandH1998 <1335248067@qq.com>
Date: Tue, 11 Mar 2025 01:12:09 +0800
Subject: [PATCH] Amd test fp8 (#4261)

---
 .github/workflows/pr-test-amd.yml | 1 +
 python/sglang/srt/configs/model_config.py | 1 +
 .../srt/layers/quantization/fp8_utils.py | 4 +
 python/sglang/test/test_utils.py | 4 +
 test/srt/run_suite.py | 1 +
 test/srt/test_eval_fp8_accuracy.py | 73 +++++++++++++++++++
 6 files changed, 84 insertions(+)
 create mode 100644 test/srt/test_eval_fp8_accuracy.py

diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index a4aa39ad2..f21eaba0b 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -55,6 +55,7 @@ jobs:
         timeout-minutes: 20
         run: |
           docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_eval_accuracy_large.py
+          docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_eval_fp8_accuracy.py
           docker exec -w /sglang-checkout/test/srt ci_sglang python3 models/test_qwen_models.py
 
   mla-test-1-gpu-amd:
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index 489cc6d4b..028a4519a 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -237,6 +237,7 @@ class ModelConfig:
             "compressed_tensors",
             "compressed-tensors",
             "fbgemm_fp8",
+            "w8a8_fp8",
         ]
         optimized_quantization_methods = [
             "fp8",
diff --git a/python/sglang/srt/layers/quantization/fp8_utils.py b/python/sglang/srt/layers/quantization/fp8_utils.py
index e53b971be..b7b2f2b89 100644
--- a/python/sglang/srt/layers/quantization/fp8_utils.py
+++ b/python/sglang/srt/layers/quantization/fp8_utils.py
@@ -32,6 +32,10 @@ if _is_cuda:
     else:
         from sgl_kernel import fp8_scaled_mm
 
+# Input scaling factors are no longer optional in _scaled_mm starting
+# from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
+TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32)  # NOTE(review): no device= argument is given here; confirm placement at the use sites
+
 
 def cutlass_fp8_supported():
     if not _is_cuda:
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index d9964cd50..3fe5d2a0f 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -28,6 +28,10 @@ from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback
 
 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
+DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"  # served by TestEvalFP8Accuracy
+DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
+)  # served by TestEvalFP8DynamicQuantAccuracy with --quantization w8a8_fp8
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 413cef32c..d387fd710 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -69,6 +69,7 @@ suites = {
         TestFile("test_vision_llm.py", 18.4),
         TestFile("test_vision_openai_server.py", 344),
         TestFile("test_w8a8_quantization.py", 46),
+        TestFile("test_eval_fp8_accuracy.py", 172),  # NOTE(review): 172 is presumably an estimated runtime in seconds - confirm
     ],
     "nightly": [
         TestFile("test_nightly_gsm8k_eval.py"),
diff --git a/test/srt/test_eval_fp8_accuracy.py b/test/srt/test_eval_fp8_accuracy.py
new file mode 100644
index 000000000..8d3c5c00c
--- /dev/null
+++ b/test/srt/test_eval_fp8_accuracy.py
@@ -0,0 +1,73 @@
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST,
+    DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+class TestEvalFP8Accuracy(unittest.TestCase):  # MMLU score floor for DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST
+    @classmethod
+    def setUpClass(cls):  # launches one server process that the whole class shares
+        cls.model = DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(  # no other_args: launched with only model, URL and timeout
+            cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)  # pid of the process started in setUpClass
+
+    def test_mmlu(self):
+        args = SimpleNamespace(  # attribute bag handed to run_eval below
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+            temperature=0.1,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.62)  # fails when the reported score is below 0.62
+
+
+class TestEvalFP8DynamicQuantAccuracy(unittest.TestCase):  # same MMLU check, server started with --quantization w8a8_fp8
+    @classmethod
+    def setUpClass(cls):  # launches one server process that the whole class shares
+        cls.model = DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--quantization", "w8a8_fp8"],  # the only launch difference from TestEvalFP8Accuracy
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)  # pid of the process started in setUpClass
+
+    def test_mmlu(self):
+        args = SimpleNamespace(  # attribute bag handed to run_eval below
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="mmlu",
+            num_examples=64,
+            num_threads=32,
+            temperature=0.1,
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.70)  # fails when the reported score is below 0.70
+
+
+if __name__ == "__main__":
+    unittest.main()