[CI] Ascend NPU CI enhancement (#8294)

Co-authored-by: ronnie_zheng <zl19940307@163.com>
2025-08-04 13:16:38 +08:00
parent f57d2dc162
commit fee0ab0fba
9 changed files with 415 additions and 189 deletions
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -154,8 +154,14 @@ suites = {
        TestFile("test_rope_rocm.py", 3),
        TestFile("test_awq_dequant.py", 2),
    ],
-    "per-commit-npu": [
-        TestFile("test_ascend_attention_backend.py", 400),
+    "per-commit-1-ascend-npu": [
+        TestFile("test_ascend_tp1_bf16.py", 400),
+    ],
+    "per-commit-2-ascend-npu": [
+        TestFile("test_ascend_tp2_bf16.py", 400),
+    ],
+    "per-commit-4-ascend-npu": [
+        TestFile("test_ascend_mla_w8a8int8.py", 400),
    ],
    "per-commit-2-gpu": [
        TestFile("models/lora/test_lora_tp.py", 116),
--- a/test/srt/test_ascend_attention_backend.py
+++ b/test/srt/test_ascend_attention_backend.py
@@ -1,62 +0,0 @@
-"""
-Usage:
-python3 -m unittest test_ascend_attention_backend.TestAscendAttnBackend.test_gsm8k
-"""
-
-import unittest
-from types import SimpleNamespace
-from urllib.parse import urlparse
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
-from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import (
-    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    CustomTestCase,
-    is_in_ci,
-    popen_launch_server,
-    run_bench_offline_throughput,
-)
-
-DEFAULT_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-7B-Instruct"
-
-
-class TestAscendAttnBackend(CustomTestCase):
-    def test_gsm8k(self):
-        model = DEFAULT_MODEL_NAME_FOR_TEST
-        base_url = DEFAULT_URL_FOR_TEST
-        url = urlparse(base_url)
-        process = popen_launch_server(
-            model,
-            base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--attention-backend",
-                "ascend",
-                "--mem-fraction-static",
-                0.8,
-            ],
-        )
-
-        try:
-            args = SimpleNamespace(
-                num_shots=5,
-                data_path=None,
-                num_questions=1319,
-                max_new_tokens=512,
-                parallel=128,
-                host=f"http://{url.hostname}",
-                port=int(url.port),
-            )
-
-            metrics = run_eval_few_shot_gsm8k(args)
-            self.assertGreaterEqual(metrics["accuracy"], 0.62)
-            self.assertLessEqual(metrics["latency"], 150)
-        finally:
-            kill_process_tree(process.pid)
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/test/srt/test_ascend_mla_backend.py
+++ b/test/srt/test_ascend_mla_backend.py
@@ -1,96 +0,0 @@
-"""
-Usage:
-python3 -m unittest test_ascend_mla_backend.TestAscendMLABackend.test_gsm8k
-"""
-
-import os
-import unittest
-from types import SimpleNamespace
-from urllib.parse import urlparse
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
-from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import (
-    DEFAULT_MLA_MODEL_NAME_FOR_TEST,
-    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    CustomTestCase,
-    is_in_ci,
-    popen_launch_server,
-    run_bench_offline_throughput,
-)
-
-if "ASCEND_RT_VISIBLE_DEVICES" not in os.environ:
-    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1,2,3"
-DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
-    7000 + int(os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "0")[0]) * 100
-)
-DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
-DEFAULT_MODEL_NAME_FOR_TEST = "/models/DeepSeek-V2-Lite-Chat"
-if not os.path.exists(DEFAULT_MODEL_NAME_FOR_TEST):
-    DEFAULT_MODEL_NAME_FOR_TEST = DEFAULT_MLA_MODEL_NAME_FOR_TEST
-
-
-class TestAscendMLABackend(CustomTestCase):
-    def test_latency(self):
-        output_throughput = run_bench_offline_throughput(
-            DEFAULT_MODEL_NAME_FOR_TEST,
-            [
-                "--attention-backend",
-                "ascend",
-                "--mem-fraction-static",
-                0.7,
-                "--tp-size",
-                "4",
-                "--trust-remote-code",
-                "--disable-cuda-graph",
-            ],
-        )
-
-        print(f"{output_throughput=}")
-
-        if is_in_ci():
-            self.assertGreater(output_throughput, 18)
-
-    def test_gsm8k(self):
-        model = DEFAULT_MODEL_NAME_FOR_TEST
-        base_url = DEFAULT_URL_FOR_TEST
-        url = urlparse(base_url)
-        process = popen_launch_server(
-            model,
-            base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--attention-backend",
-                "ascend",
-                "--mem-fraction-static",
-                0.7,
-                "--tp-size",
-                "4",
-                "--trust-remote-code",
-                "--disable-cuda-graph",
-            ],
-        )
-
-        try:
-            args = SimpleNamespace(
-                num_shots=5,
-                data_path=None,
-                num_questions=128,
-                max_new_tokens=512,
-                parallel=128,
-                host=f"http://{url.hostname}",
-                port=int(url.port),
-            )
-
-            metrics = run_eval_few_shot_gsm8k(args)
-            self.assertGreaterEqual(metrics["accuracy"], 0.62)
-            self.assertGreaterEqual(metrics["output_throughput"], 50)
-        finally:
-            kill_process_tree(process.pid)
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/test/srt/test_ascend_mla_w8a8int8.py
+++ b/test/srt/test_ascend_mla_w8a8int8.py
@@ -0,0 +1,100 @@
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+
+TEST_MODEL_MATRIX = {
+    "/root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V2-Lite-W8A8": {
+        "accuracy": 0.34,
+        "latency": 1000,
+        "output_throughput": 6,
+    },
+}
+
+
+class TestAscendMlaW8A8Int8(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.models = TEST_MODEL_MATRIX.keys()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.common_args = [
+            "--trust-remote-code",
+            "--disable-cuda-graph",
+            "--mem-fraction-static",
+            0.8,
+            "--attention-backend",
+            "ascend",
+            "--quantization",
+            "w8a8_int8",
+            "--tp-size",
+            4,
+        ]
+
+    def test_a_gsm8k(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing accuracy: {model} ===##")
+
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=[
+                        *self.common_args,
+                    ],
+                )
+
+                try:
+                    args = SimpleNamespace(
+                        num_shots=5,
+                        data_path=None,
+                        num_questions=1319,
+                        max_new_tokens=512,
+                        parallel=128,
+                        host=f"http://{self.url.hostname}",
+                        port=int(self.url.port),
+                    )
+
+                    metrics = run_eval_few_shot_gsm8k(args)
+                    self.assertGreaterEqual(
+                        metrics["accuracy"],
+                        TEST_MODEL_MATRIX[model]["accuracy"],
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+
+    def test_b_throughput(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing throughput: {model} ===##")
+
+                output_throughput = run_bench_offline_throughput(
+                    model,
+                    [
+                        *self.common_args,
+                    ],
+                )
+
+                print(f"##=== {model} throughput: {output_throughput} ===##")
+
+                if is_in_ci():
+                    self.assertGreater(
+                        output_throughput,
+                        TEST_MODEL_MATRIX[model]["output_throughput"],
+                    )
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/test/srt/test_ascend_tp1_bf16.py
+++ b/test/srt/test_ascend_tp1_bf16.py
@@ -0,0 +1,96 @@
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+
+TEST_MODEL_MATRIX = {
+    "Qwen/Qwen2.5-7B-Instruct": {
+        "accuracy": 0.85,
+        "latency": 150,
+        "output_throughput": 30,
+    },
+}
+
+
+class TestAscendTp1Bf16(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.models = TEST_MODEL_MATRIX.keys()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.common_args = [
+            "--trust-remote-code",
+            "--disable-cuda-graph",
+            "--mem-fraction-static",
+            0.8,
+            "--attention-backend",
+            "ascend",
+        ]
+
+    def test_a_gsm8k(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing accuracy: {model} ===##")
+
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=[
+                        *self.common_args,
+                    ],
+                )
+
+                try:
+                    args = SimpleNamespace(
+                        num_shots=5,
+                        data_path=None,
+                        num_questions=1319,
+                        max_new_tokens=512,
+                        parallel=128,
+                        host=f"http://{self.url.hostname}",
+                        port=int(self.url.port),
+                    )
+
+                    metrics = run_eval_few_shot_gsm8k(args)
+                    self.assertGreaterEqual(
+                        metrics["accuracy"],
+                        TEST_MODEL_MATRIX[model]["accuracy"],
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+
+    def test_b_throughput(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing throughput: {model} ===##")
+
+                output_throughput = run_bench_offline_throughput(
+                    model,
+                    [
+                        *self.common_args,
+                    ],
+                )
+
+                print(f"##=== {model} throughput: {output_throughput} ===##")
+
+                if is_in_ci():
+                    self.assertGreater(
+                        output_throughput,
+                        TEST_MODEL_MATRIX[model]["output_throughput"],
+                    )
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/test/srt/test_ascend_tp2_bf16.py
+++ b/test/srt/test_ascend_tp2_bf16.py
@@ -0,0 +1,98 @@
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    popen_launch_server,
+    run_bench_offline_throughput,
+)
+
+TEST_MODEL_MATRIX = {
+    "Qwen/Qwen2.5-7B-Instruct": {
+        "accuracy": 0.85,
+        "latency": 180,
+        "output_throughput": 20,
+    },
+}
+
+
+class TestAscendTp2Bf16(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.models = TEST_MODEL_MATRIX.keys()
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
+        cls.common_args = [
+            "--trust-remote-code",
+            "--disable-cuda-graph",
+            "--mem-fraction-static",
+            0.8,
+            "--attention-backend",
+            "ascend",
+            "--tp-size",
+            2,
+        ]
+
+    def test_a_gsm8k(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing accuracy: {model} ===##")
+
+                process = popen_launch_server(
+                    model,
+                    self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                    other_args=[
+                        *self.common_args,
+                    ],
+                )
+
+                try:
+                    args = SimpleNamespace(
+                        num_shots=5,
+                        data_path=None,
+                        num_questions=1319,
+                        max_new_tokens=512,
+                        parallel=128,
+                        host=f"http://{self.url.hostname}",
+                        port=int(self.url.port),
+                    )
+
+                    metrics = run_eval_few_shot_gsm8k(args)
+                    self.assertGreaterEqual(
+                        metrics["accuracy"],
+                        TEST_MODEL_MATRIX[model]["accuracy"],
+                    )
+                finally:
+                    kill_process_tree(process.pid)
+
+    def test_b_throughput(self):
+        for model in self.models:
+            with self.subTest(model=model):
+                print(f"##=== Testing throughput: {model} ===##")
+
+                output_throughput = run_bench_offline_throughput(
+                    model,
+                    [
+                        *self.common_args,
+                    ],
+                )
+
+                print(f"##=== {model} throughput: {output_throughput} ===##")
+
+                if is_in_ci():
+                    self.assertGreater(
+                        output_throughput,
+                        TEST_MODEL_MATRIX[model]["output_throughput"],
+                    )
+
+
+if __name__ == "__main__":
+    unittest.main()