Improve profiler and integrate profiler in bench_one_batch_server (#6787)

2025-05-31 15:53:55 -07:00
parent b520d02888
commit 2d72fc47cf
25 changed files with 481 additions and 223 deletions
--- a/test/srt/test_mla_flashinfer.py
+++ b/test/srt/test_mla_flashinfer.py
@@ -57,50 +57,6 @@ class TestFlashinferMLA(CustomTestCase):
        self.assertGreater(metrics["accuracy"], 0.62)


-class TestFlashinferMLANoRagged(CustomTestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "lmsys/sglang-ci-dsv3-test"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        other_args = ["--trust-remote-code"]
-        if torch.cuda.is_available() and torch.version.cuda:
-            other_args.extend(
-                [
-                    "--enable-torch-compile",
-                    "--disable-cuda-graph",
-                    "--cuda-graph-max-bs",
-                    "4",
-                    "--attention-backend",
-                    "flashinfer",
-                ]
-            )
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=other_args,
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_gsm8k(self):
-        args = SimpleNamespace(
-            num_shots=5,
-            data_path=None,
-            num_questions=200,
-            max_new_tokens=512,
-            parallel=128,
-            host="http://127.0.0.1",
-            port=int(self.base_url.split(":")[-1]),
-        )
-        metrics = run_eval_few_shot_gsm8k(args)
-        print(metrics)
-
-        self.assertGreater(metrics["accuracy"], 0.62)
-
-
 class TestFlashinferMLAMTP(CustomTestCase):
    @classmethod
    def setUpClass(cls):