Improve streaming, log_level, memory report, weight loading, and benchmark script (#7632)

Co-authored-by: Kan Wu <wukanustc@gmail.com>
2025-06-29 23:16:19 -07:00
parent c5131f7a2f
commit 22352d47a9
24 changed files with 626 additions and 160 deletions
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -173,10 +173,11 @@ suites = {
        # TestFile("test_deepep_intranode.py", 50),
        # TestFile("test_deepep_low_latency.py", 50),
        # TestFile("test_moe_deepep_eval_accuracy_large.py", 250),
+        # Disabled because it hangs on the CI.
+        # TestFile("test_moe_ep.py", 181),
        TestFile("test_disaggregation.py", 270),
        TestFile("test_disaggregation_different_tp.py", 155),
        TestFile("test_full_deepseek_v3.py", 463),
-        TestFile("test_moe_ep.py", 181),
    ],
    "per-commit-8-gpu-amd": [
        TestFile("test_full_deepseek_v3.py", 250),
--- a/test/srt/test_vision_chunked_prefill.py
+++ b/test/srt/test_vision_chunked_prefill.py
@@ -178,7 +178,7 @@ class TestVisionChunkedPrefill(CustomTestCase):
            print(output_chunked)
            print("output without chunked prefill:")
            print(output_no_chunked)
-            assert output_chunked == output_no_chunked
+            self.assertEqual(output_chunked, output_no_chunked)

    def test_chunked_prefill(self):
        self._test_chunked_prefill(batches=[False, True], num_frames=[1, [2, 6, 8, 10]])