Remove monkey_patch_vllm_dummy_weight_loader (#2064)
This commit is contained in:
@@ -13,7 +13,7 @@ class TestBenchLatency(unittest.TestCase):
         output_throughput = run_bench_latency(DEFAULT_MODEL_NAME_FOR_TEST, [])

         if is_in_ci():
-            assert output_throughput > 130, f"{output_throughput=}"
+            self.assertGreater(output_throughput, 135)

     def test_moe_default(self):
         output_throughput = run_bench_latency(
@@ -21,7 +21,7 @@ class TestBenchLatency(unittest.TestCase):
         )

         if is_in_ci():
-            assert output_throughput > 125, f"{output_throughput=}"
+            self.assertGreater(output_throughput, 125)


 if __name__ == "__main__":

@@ -20,7 +20,7 @@ class TestBenchServing(unittest.TestCase):
         )

         if is_in_ci():
-            assert res["output_throughput"] > 2830
+            self.assertGreater(res["output_throughput"], 2850)

     def test_offline_throughput_non_stream_small_batch_size(self):
         res = run_bench_serving(
@@ -35,7 +35,7 @@ class TestBenchServing(unittest.TestCase):
         )

         if is_in_ci():
-            assert res["output_throughput"] > 1000
+            self.assertGreater(res["output_throughput"], 950)

     def test_offline_throughput_without_radix_cache(self):
         res = run_bench_serving(
@@ -46,7 +46,7 @@ class TestBenchServing(unittest.TestCase):
         )

         if is_in_ci():
-            assert res["output_throughput"] > 2880
+            self.assertGreater(res["output_throughput"], 2900)

     def test_offline_throughput_without_chunked_prefill(self):
         res = run_bench_serving(
@@ -57,7 +57,7 @@ class TestBenchServing(unittest.TestCase):
         )

         if is_in_ci():
-            assert res["output_throughput"] > 2600
+            self.assertGreater(res["output_throughput"], 2600)

     def test_offline_throughput_with_triton_attention_backend(self):
         res = run_bench_serving(
@@ -73,7 +73,7 @@ class TestBenchServing(unittest.TestCase):
         )

         if is_in_ci():
-            assert res["output_throughput"] > 2930
+            self.assertGreater(res["output_throughput"], 2950)

     def test_offline_throughput_default_fp8(self):
         res = run_bench_serving(
@@ -84,7 +84,7 @@ class TestBenchServing(unittest.TestCase):
         )

         if is_in_ci():
-            assert res["output_throughput"] > 3100
+            self.assertGreater(res["output_throughput"], 3200)

     def test_online_latency_default(self):
         res = run_bench_serving(
@@ -95,9 +95,9 @@ class TestBenchServing(unittest.TestCase):
         )

         if is_in_ci():
-            assert res["median_e2e_latency_ms"] < 12000
-            assert res["median_ttft_ms"] < 80
-            assert res["median_itl_ms"] < 12
+            self.assertLess(res["median_e2e_latency_ms"], 12000)
+            self.assertLess(res["median_ttft_ms"], 80)
+            self.assertLess(res["median_itl_ms"], 11)

     def test_moe_offline_throughput_default(self):
         res = run_bench_serving(
@@ -108,7 +108,7 @@ class TestBenchServing(unittest.TestCase):
         )

         if is_in_ci():
-            assert res["output_throughput"] > 1850
+            self.assertGreater(res["output_throughput"], 1900)

     def test_moe_offline_throughput_without_radix_cache(self):
         res = run_bench_serving(
@@ -119,7 +119,7 @@ class TestBenchServing(unittest.TestCase):
         )

         if is_in_ci():
-            assert res["output_throughput"] > 1950
+            self.assertGreater(res["output_throughput"], 1950)


 if __name__ == "__main__":

Reference in New Issue
Block a user