chore: upgrade flashinfer v0.2.6.post1 jit (#6958)

Co-authored-by: alcanderian <alcanderian@gmail.com> Co-authored-by: Qiaolin Yu <qy254@cornell.edu> Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com> Co-authored-by: Mick <mickjagger19@icloud.com> Co-authored-by: ispobock <ispobaoke@gmail.com>
2025-06-09 09:22:39 -07:00
parent 98c00a2df1
commit 56ccd3c22c
14 changed files with 189 additions and 27 deletions
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
@@ -175,7 +175,7 @@ class TestBenchServing(CustomTestCase):
    def test_vlm_online_latency(self):
        res = run_bench_serving(
            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
-            num_prompts=50,
+            num_prompts=250,
            request_rate=1,
            other_server_args=[
                "--mem-fraction-static",
@@ -194,7 +194,7 @@ class TestBenchServing(CustomTestCase):
                self.assertLess(res["median_ttft_ms"], 150)
                # TODO: not set yet, need AMD machine
            else:
-                self.assertLess(res["median_ttft_ms"], 90)
+                self.assertLess(res["median_ttft_ms"], 94)
            self.assertLess(res["median_itl_ms"], 8)

    def test_online_latency_eagle(self):