Log whether CUDA graph is used and extend CUDA graph capture to cuda-graph-max-bs (#6201)
Co-authored-by: SangBin Cho <rkooo567@gmail.com>
This commit is contained in:
@@ -101,8 +101,8 @@ suites = {
|
||||
# TestFile("test_deepep_intranode.py", 50),
|
||||
# TestFile("test_deepep_low_latency.py", 50),
|
||||
# TestFile("test_moe_deepep_eval_accuracy_large.py", 250),
|
||||
# TestFile("test_disaggregation.py", 90),
|
||||
TestFile("test_local_attn.py", 250),
|
||||
TestFile("test_disaggregation.py", 90),
|
||||
TestFile("test_full_deepseek_v3.py", 250),
|
||||
TestFile("test_pp_single_node.py", 150),
|
||||
],
|
||||
|
||||
@@ -97,7 +97,9 @@ class TestEAGLEEngine(CustomTestCase):
|
||||
|
||||
print(f"{engine.get_server_info()=}")
|
||||
|
||||
avg_spec_accept_length = engine.get_server_info()["avg_spec_accept_length"]
|
||||
avg_spec_accept_length = engine.get_server_info()["internal_states"][0][
|
||||
"avg_spec_accept_length"
|
||||
]
|
||||
print(f"{avg_spec_accept_length=}")
|
||||
self.assertGreater(avg_spec_accept_length, 1.9)
|
||||
|
||||
@@ -296,7 +298,9 @@ class TestEAGLEServer(CustomTestCase):
|
||||
self.assertGreater(metrics["accuracy"], 0.20)
|
||||
|
||||
server_info = requests.get(self.base_url + "/get_server_info").json()
|
||||
avg_spec_accept_length = server_info["avg_spec_accept_length"]
|
||||
avg_spec_accept_length = server_info["internal_states"][0][
|
||||
"avg_spec_accept_length"
|
||||
]
|
||||
print(f"{avg_spec_accept_length=}")
|
||||
|
||||
speculative_eagle_topk = server_info["speculative_eagle_topk"]
|
||||
|
||||
@@ -111,7 +111,9 @@ class BaseFlashAttentionTest(CustomTestCase):
|
||||
|
||||
if self.speculative_decode:
|
||||
server_info = requests.get(self.base_url + "/get_server_info")
|
||||
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"]
|
||||
avg_spec_accept_length = server_info.json()["internal_states"][0][
|
||||
"avg_spec_accept_length"
|
||||
]
|
||||
print(f"{avg_spec_accept_length=}")
|
||||
self.assertGreater(avg_spec_accept_length, self.spec_decode_threshold)
|
||||
|
||||
|
||||
@@ -118,7 +118,9 @@ class TestDeepseekV3MTP(CustomTestCase):
|
||||
print(f"{metrics=}")
|
||||
|
||||
server_info = requests.get(self.base_url + "/get_server_info")
|
||||
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"]
|
||||
avg_spec_accept_length = server_info.json()["internal_states"][0][
|
||||
"avg_spec_accept_length"
|
||||
]
|
||||
print(f"{avg_spec_accept_length=}")
|
||||
|
||||
if is_in_ci():
|
||||
|
||||
@@ -100,7 +100,9 @@ class TestDeepseekV3MTP(CustomTestCase):
|
||||
self.assertGreater(metrics["accuracy"], 0.60)
|
||||
|
||||
server_info = requests.get(self.base_url + "/get_server_info")
|
||||
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"]
|
||||
avg_spec_accept_length = server_info.json()["internal_states"][0][
|
||||
"avg_spec_accept_length"
|
||||
]
|
||||
print(f"{avg_spec_accept_length=}")
|
||||
self.assertGreater(avg_spec_accept_length, 2.5)
|
||||
|
||||
@@ -159,7 +161,9 @@ class TestDeepseekV3MTPWithDraft(CustomTestCase):
|
||||
self.assertGreater(metrics["accuracy"], 0.60)
|
||||
|
||||
server_info = requests.get(self.base_url + "/get_server_info")
|
||||
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"]
|
||||
avg_spec_accept_length = server_info.json()["internal_states"][0][
|
||||
"avg_spec_accept_length"
|
||||
]
|
||||
print(f"{avg_spec_accept_length=}")
|
||||
self.assertGreater(avg_spec_accept_length, 2.5)
|
||||
|
||||
|
||||
@@ -158,7 +158,9 @@ class TestFlashinferMLAMTP(CustomTestCase):
|
||||
|
||||
server_info = requests.get(self.base_url + "/get_server_info")
|
||||
print(f"{server_info=}")
|
||||
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"]
|
||||
avg_spec_accept_length = server_info.json()["internal_states"][0][
|
||||
"avg_spec_accept_length"
|
||||
]
|
||||
print(f"{avg_spec_accept_length=}")
|
||||
self.assertGreater(avg_spec_accept_length, 2.5)
|
||||
|
||||
|
||||
@@ -105,7 +105,9 @@ class TestDeepseekV3MTPChannelInt8(CustomTestCase):
|
||||
self.assertGreater(metrics["accuracy"], 0.60)
|
||||
|
||||
server_info = requests.get(self.base_url + "/get_server_info")
|
||||
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"]
|
||||
avg_spec_accept_length = server_info.json()["internal_states"][0][
|
||||
"avg_spec_accept_length"
|
||||
]
|
||||
print(f"{avg_spec_accept_length=}")
|
||||
self.assertGreater(avg_spec_accept_length, 2.5)
|
||||
|
||||
@@ -199,7 +201,9 @@ class TestDeepseekV3MTPBlockInt8(CustomTestCase):
|
||||
self.assertGreater(metrics["accuracy"], 0.60)
|
||||
|
||||
server_info = requests.get(self.base_url + "/get_server_info")
|
||||
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"]
|
||||
avg_spec_accept_length = server_info.json()["internal_states"][0][
|
||||
"avg_spec_accept_length"
|
||||
]
|
||||
print(f"{avg_spec_accept_length=}")
|
||||
self.assertGreater(avg_spec_accept_length, 2.5)
|
||||
|
||||
|
||||
@@ -492,9 +492,6 @@ class TestSRTEndpoint(CustomTestCase):
|
||||
max_total_num_tokens = response_json["max_total_num_tokens"]
|
||||
self.assertIsInstance(max_total_num_tokens, int)
|
||||
|
||||
attention_backend = response_json["attention_backend"]
|
||||
self.assertIsInstance(attention_backend, str)
|
||||
|
||||
version = response_json["version"]
|
||||
self.assertIsInstance(version, str)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user