Revert "[feature] Ascend NPU graph support (#8027)" (#9348)

2025-08-20 01:11:23 +08:00
parent 01d47a27b6
commit f4fafacc5d
18 changed files with 878 additions and 1349 deletions
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -229,17 +229,6 @@ suite_amd = {
        TestFile("test_wave_attention_kernels.py", 2),
        TestFile("test_wave_attention_backend.py", 150),
    ],
-    "per-commit-1-ascend-npu": [
-        TestFile("test_ascend_tp1_bf16.py", 400),
-        TestFile("test_ascend_graph_tp1_bf16.py", 400),
-    ],
-    "per-commit-2-ascend-npu": [
-        TestFile("test_ascend_tp2_bf16.py", 400),
-        TestFile("test_ascend_graph_tp2_bf16.py", 400),
-    ],
-    "per-commit-4-ascend-npu": [
-        TestFile("test_ascend_mla_w8a8int8.py", 400),
-    ],
    "per-commit-2-gpu-amd": [
        TestFile("lora/test_lora_tp.py", 116),
        TestFile("rl/test_update_weights_from_distributed.py", 103),
--- a/test/srt/test_ascend_graph_tp1_bf16.py
+++ b/test/srt/test_ascend_graph_tp1_bf16.py
@@ -1,95 +0,0 @@
-import unittest
-from types import SimpleNamespace
-from urllib.parse import urlparse
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
-from sglang.test.test_utils import (
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    CustomTestCase,
-    is_in_ci,
-    popen_launch_server,
-    run_bench_offline_throughput,
-)
-
-TEST_MODEL_MATRIX = {
-    "Qwen/Qwen2.5-7B-Instruct": {
-        "accuracy": 0.85,
-        "latency": 150,
-        "output_throughput": 30,
-    },
-}
-
-
-class TestAscendGraphTp1Bf16(CustomTestCase):
-
-    @classmethod
-    def setUpClass(cls):
-        cls.models = TEST_MODEL_MATRIX.keys()
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
-        cls.common_args = [
-            "--trust-remote-code",
-            "--mem-fraction-static",
-            0.8,
-            "--attention-backend",
-            "ascend",
-        ]
-
-    def test_a_gsm8k(self):
-        for model in self.models:
-            with self.subTest(model=model):
-                print(f"##=== Testing accuracy: {model} ===##")
-
-                process = popen_launch_server(
-                    model,
-                    self.base_url,
-                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-                    other_args=[
-                        *self.common_args,
-                    ],
-                )
-
-                try:
-                    args = SimpleNamespace(
-                        num_shots=5,
-                        data_path=None,
-                        num_questions=1319,
-                        max_new_tokens=512,
-                        parallel=128,
-                        host=f"http://{self.url.hostname}",
-                        port=int(self.url.port),
-                    )
-
-                    metrics = run_eval_few_shot_gsm8k(args)
-                    self.assertGreaterEqual(
-                        metrics["accuracy"],
-                        TEST_MODEL_MATRIX[model]["accuracy"],
-                    )
-                finally:
-                    kill_process_tree(process.pid)
-
-    def test_b_throughput(self):
-        for model in self.models:
-            with self.subTest(model=model):
-                print(f"##=== Testing throughput: {model} ===##")
-
-                output_throughput = run_bench_offline_throughput(
-                    model,
-                    [
-                        *self.common_args,
-                    ],
-                )
-
-                print(f"##=== {model} throughput: {output_throughput} ===##")
-
-                if is_in_ci():
-                    self.assertGreater(
-                        output_throughput,
-                        TEST_MODEL_MATRIX[model]["output_throughput"],
-                    )
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/test/srt/test_ascend_graph_tp2_bf16.py
+++ b/test/srt/test_ascend_graph_tp2_bf16.py
@@ -1,97 +0,0 @@
-import unittest
-from types import SimpleNamespace
-from urllib.parse import urlparse
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
-from sglang.test.test_utils import (
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    CustomTestCase,
-    is_in_ci,
-    popen_launch_server,
-    run_bench_offline_throughput,
-)
-
-TEST_MODEL_MATRIX = {
-    "Qwen/Qwen2.5-7B-Instruct": {
-        "accuracy": 0.85,
-        "latency": 180,
-        "output_throughput": 20,
-    },
-}
-
-
-class TestAscendGraphTp2Bf16(CustomTestCase):
-
-    @classmethod
-    def setUpClass(cls):
-        cls.models = TEST_MODEL_MATRIX.keys()
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.url = urlparse(DEFAULT_URL_FOR_TEST)
-        cls.common_args = [
-            "--trust-remote-code",
-            "--mem-fraction-static",
-            0.8,
-            "--attention-backend",
-            "ascend",
-            "--tp-size",
-            2,
-        ]
-
-    def test_a_gsm8k(self):
-        for model in self.models:
-            with self.subTest(model=model):
-                print(f"##=== Testing accuracy: {model} ===##")
-
-                process = popen_launch_server(
-                    model,
-                    self.base_url,
-                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-                    other_args=[
-                        *self.common_args,
-                    ],
-                )
-
-                try:
-                    args = SimpleNamespace(
-                        num_shots=5,
-                        data_path=None,
-                        num_questions=1319,
-                        max_new_tokens=512,
-                        parallel=128,
-                        host=f"http://{self.url.hostname}",
-                        port=int(self.url.port),
-                    )
-
-                    metrics = run_eval_few_shot_gsm8k(args)
-                    self.assertGreaterEqual(
-                        metrics["accuracy"],
-                        TEST_MODEL_MATRIX[model]["accuracy"],
-                    )
-                finally:
-                    kill_process_tree(process.pid)
-
-    def test_b_throughput(self):
-        for model in self.models:
-            with self.subTest(model=model):
-                print(f"##=== Testing throughput: {model} ===##")
-
-                output_throughput = run_bench_offline_throughput(
-                    model,
-                    [
-                        *self.common_args,
-                    ],
-                )
-
-                print(f"##=== {model} throughput: {output_throughput} ===##")
-
-                if is_in_ci():
-                    self.assertGreater(
-                        output_throughput,
-                        TEST_MODEL_MATRIX[model]["output_throughput"],
-                    )
-
-
-if __name__ == "__main__":
-    unittest.main()